keras-nightly 3.12.0.dev2025083103__py3-none-any.whl → 3.14.0.dev2026011604__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras/__init__.py +1 -0
- keras/_tf_keras/keras/__init__.py +1 -0
- keras/_tf_keras/keras/callbacks/__init__.py +3 -0
- keras/_tf_keras/keras/distillation/__init__.py +16 -0
- keras/_tf_keras/keras/distribution/__init__.py +3 -0
- keras/_tf_keras/keras/dtype_policies/__init__.py +6 -0
- keras/_tf_keras/keras/layers/__init__.py +21 -0
- keras/_tf_keras/keras/ops/__init__.py +16 -0
- keras/_tf_keras/keras/ops/image/__init__.py +1 -0
- keras/_tf_keras/keras/ops/linalg/__init__.py +1 -0
- keras/_tf_keras/keras/ops/nn/__init__.py +3 -0
- keras/_tf_keras/keras/ops/numpy/__init__.py +12 -0
- keras/_tf_keras/keras/quantizers/__init__.py +13 -0
- keras/callbacks/__init__.py +3 -0
- keras/distillation/__init__.py +16 -0
- keras/distribution/__init__.py +3 -0
- keras/dtype_policies/__init__.py +6 -0
- keras/layers/__init__.py +21 -0
- keras/ops/__init__.py +16 -0
- keras/ops/image/__init__.py +1 -0
- keras/ops/linalg/__init__.py +1 -0
- keras/ops/nn/__init__.py +3 -0
- keras/ops/numpy/__init__.py +12 -0
- keras/quantizers/__init__.py +13 -0
- keras/src/applications/imagenet_utils.py +4 -1
- keras/src/backend/common/backend_utils.py +30 -6
- keras/src/backend/common/dtypes.py +6 -12
- keras/src/backend/common/name_scope.py +2 -1
- keras/src/backend/common/variables.py +38 -20
- keras/src/backend/jax/core.py +126 -78
- keras/src/backend/jax/distribution_lib.py +16 -2
- keras/src/backend/jax/layer.py +3 -1
- keras/src/backend/jax/linalg.py +4 -0
- keras/src/backend/jax/nn.py +511 -29
- keras/src/backend/jax/numpy.py +109 -23
- keras/src/backend/jax/optimizer.py +3 -2
- keras/src/backend/jax/trainer.py +18 -3
- keras/src/backend/numpy/linalg.py +4 -0
- keras/src/backend/numpy/nn.py +313 -2
- keras/src/backend/numpy/numpy.py +97 -8
- keras/src/backend/openvino/__init__.py +1 -0
- keras/src/backend/openvino/core.py +6 -23
- keras/src/backend/openvino/linalg.py +4 -0
- keras/src/backend/openvino/nn.py +271 -20
- keras/src/backend/openvino/numpy.py +1369 -195
- keras/src/backend/openvino/random.py +7 -14
- keras/src/backend/tensorflow/layer.py +43 -9
- keras/src/backend/tensorflow/linalg.py +24 -0
- keras/src/backend/tensorflow/nn.py +545 -1
- keras/src/backend/tensorflow/numpy.py +351 -56
- keras/src/backend/tensorflow/trainer.py +6 -2
- keras/src/backend/torch/core.py +3 -1
- keras/src/backend/torch/linalg.py +4 -0
- keras/src/backend/torch/nn.py +125 -0
- keras/src/backend/torch/numpy.py +109 -9
- keras/src/backend/torch/trainer.py +8 -2
- keras/src/callbacks/__init__.py +1 -0
- keras/src/callbacks/callback_list.py +45 -11
- keras/src/callbacks/model_checkpoint.py +5 -0
- keras/src/callbacks/orbax_checkpoint.py +332 -0
- keras/src/callbacks/terminate_on_nan.py +54 -5
- keras/src/datasets/cifar10.py +5 -0
- keras/src/distillation/__init__.py +1 -0
- keras/src/distillation/distillation_loss.py +390 -0
- keras/src/distillation/distiller.py +598 -0
- keras/src/distribution/distribution_lib.py +14 -0
- keras/src/dtype_policies/__init__.py +4 -0
- keras/src/dtype_policies/dtype_policy.py +180 -1
- keras/src/export/__init__.py +2 -0
- keras/src/export/export_utils.py +39 -2
- keras/src/export/litert.py +248 -0
- keras/src/export/onnx.py +6 -0
- keras/src/export/openvino.py +1 -1
- keras/src/export/tf2onnx_lib.py +3 -0
- keras/src/layers/__init__.py +13 -0
- keras/src/layers/activations/softmax.py +9 -4
- keras/src/layers/attention/attention.py +1 -1
- keras/src/layers/attention/multi_head_attention.py +4 -1
- keras/src/layers/core/dense.py +406 -102
- keras/src/layers/core/einsum_dense.py +521 -116
- keras/src/layers/core/embedding.py +257 -99
- keras/src/layers/core/input_layer.py +1 -0
- keras/src/layers/core/reversible_embedding.py +399 -0
- keras/src/layers/input_spec.py +17 -17
- keras/src/layers/layer.py +50 -15
- keras/src/layers/merging/concatenate.py +6 -5
- keras/src/layers/merging/dot.py +4 -1
- keras/src/layers/pooling/adaptive_average_pooling1d.py +65 -0
- keras/src/layers/pooling/adaptive_average_pooling2d.py +62 -0
- keras/src/layers/pooling/adaptive_average_pooling3d.py +63 -0
- keras/src/layers/pooling/adaptive_max_pooling1d.py +65 -0
- keras/src/layers/pooling/adaptive_max_pooling2d.py +62 -0
- keras/src/layers/pooling/adaptive_max_pooling3d.py +63 -0
- keras/src/layers/pooling/base_adaptive_pooling.py +63 -0
- keras/src/layers/preprocessing/discretization.py +6 -5
- keras/src/layers/preprocessing/feature_space.py +8 -4
- keras/src/layers/preprocessing/image_preprocessing/aug_mix.py +2 -2
- keras/src/layers/preprocessing/image_preprocessing/bounding_boxes/validation.py +5 -5
- keras/src/layers/preprocessing/image_preprocessing/random_contrast.py +3 -3
- keras/src/layers/preprocessing/image_preprocessing/resizing.py +10 -0
- keras/src/layers/preprocessing/index_lookup.py +19 -1
- keras/src/layers/preprocessing/normalization.py +16 -1
- keras/src/layers/preprocessing/string_lookup.py +26 -28
- keras/src/layers/regularization/dropout.py +43 -1
- keras/src/layers/rnn/gru.py +1 -1
- keras/src/layers/rnn/lstm.py +2 -2
- keras/src/layers/rnn/rnn.py +19 -0
- keras/src/layers/rnn/simple_rnn.py +1 -1
- keras/src/legacy/preprocessing/image.py +4 -1
- keras/src/legacy/preprocessing/sequence.py +20 -12
- keras/src/losses/loss.py +1 -1
- keras/src/losses/losses.py +24 -0
- keras/src/metrics/confusion_metrics.py +7 -6
- keras/src/models/cloning.py +4 -0
- keras/src/models/functional.py +11 -3
- keras/src/models/model.py +195 -44
- keras/src/ops/image.py +257 -20
- keras/src/ops/linalg.py +93 -0
- keras/src/ops/nn.py +268 -2
- keras/src/ops/numpy.py +701 -44
- keras/src/ops/operation.py +90 -29
- keras/src/ops/operation_utils.py +2 -0
- keras/src/optimizers/adafactor.py +29 -10
- keras/src/optimizers/base_optimizer.py +22 -3
- keras/src/optimizers/loss_scale_optimizer.py +51 -18
- keras/src/optimizers/muon.py +65 -31
- keras/src/optimizers/schedules/learning_rate_schedule.py +4 -3
- keras/src/quantizers/__init__.py +14 -1
- keras/src/quantizers/awq.py +361 -0
- keras/src/quantizers/awq_config.py +140 -0
- keras/src/quantizers/awq_core.py +217 -0
- keras/src/quantizers/gptq.py +346 -207
- keras/src/quantizers/gptq_config.py +63 -13
- keras/src/quantizers/gptq_core.py +328 -215
- keras/src/quantizers/quantization_config.py +246 -0
- keras/src/quantizers/quantizers.py +407 -38
- keras/src/quantizers/utils.py +23 -0
- keras/src/random/seed_generator.py +6 -4
- keras/src/saving/file_editor.py +81 -6
- keras/src/saving/orbax_util.py +26 -0
- keras/src/saving/saving_api.py +37 -14
- keras/src/saving/saving_lib.py +1 -1
- keras/src/testing/__init__.py +1 -0
- keras/src/testing/test_case.py +45 -5
- keras/src/trainers/compile_utils.py +38 -17
- keras/src/trainers/data_adapters/grain_dataset_adapter.py +1 -5
- keras/src/tree/torchtree_impl.py +215 -0
- keras/src/tree/tree_api.py +6 -1
- keras/src/utils/backend_utils.py +31 -4
- keras/src/utils/dataset_utils.py +234 -35
- keras/src/utils/file_utils.py +49 -11
- keras/src/utils/image_utils.py +14 -2
- keras/src/utils/jax_layer.py +244 -55
- keras/src/utils/module_utils.py +29 -0
- keras/src/utils/progbar.py +10 -12
- keras/src/utils/python_utils.py +5 -0
- keras/src/utils/rng_utils.py +9 -1
- keras/src/utils/tracking.py +70 -5
- keras/src/version.py +1 -1
- {keras_nightly-3.12.0.dev2025083103.dist-info → keras_nightly-3.14.0.dev2026011604.dist-info}/METADATA +16 -6
- {keras_nightly-3.12.0.dev2025083103.dist-info → keras_nightly-3.14.0.dev2026011604.dist-info}/RECORD +163 -142
- keras/src/quantizers/gptq_quant.py +0 -133
- {keras_nightly-3.12.0.dev2025083103.dist-info → keras_nightly-3.14.0.dev2026011604.dist-info}/WHEEL +0 -0
- {keras_nightly-3.12.0.dev2025083103.dist-info → keras_nightly-3.14.0.dev2026011604.dist-info}/top_level.txt +0 -0
keras/src/ops/operation.py
CHANGED
@@ -1,5 +1,4 @@
 import inspect
-import os.path
 import textwrap
 
 from keras.src import backend
@@ -20,10 +19,10 @@ class Operation(KerasSaveable):
     def __init__(self, name=None):
         if name is None:
             name = auto_name(self.__class__.__name__)
-        if not isinstance(name, str) or
+        if not isinstance(name, str) or "/" in name:
             raise ValueError(
                 "Argument `name` must be a string and "
-                f"cannot contain character
+                f"cannot contain character `/`. "
                 f"Received: name={name} (of type {type(name)})"
             )
         self.name = name
@@ -130,15 +129,55 @@ class Operation(KerasSaveable):
             vars(instance)["_object__state"] = nnx.object.ObjectState()
 
         # Generate a config to be returned by default by `get_config()`.
-        [2 removed lines not captured in this rendering]
+        auto_config = True
+
+        signature = inspect.signature(cls.__init__)
+        argspec = inspect.getfullargspec(cls.__init__)
+
+        try:
+            bound_parameters = signature.bind(None, *args, **kwargs)
+        except TypeError:
+            # Raised by signature.bind when the supplied args and kwargs
+            # do not match the signature.
+            auto_config = False
+
+        if auto_config and any(
+            [
+                param.kind == inspect.Parameter.POSITIONAL_ONLY
+                for name, param in signature.parameters.items()
+                if name != argspec.args[0]
+            ]
+        ):
+            # cls.__init__ takes positional only arguments, which
+            # cannot be restored via cls(**config)
+            auto_config = False
+            # Create variable to show appropriate warning in get_config.
+            instance._auto_config_error_args = True
+
+        if auto_config:
+            # Include default values in the config.
+            bound_parameters.apply_defaults()
+            # Extract all arguments as a dictionary.
+            kwargs = bound_parameters.arguments
+            # Expand variable kwargs argument.
+            kwargs |= kwargs.pop(argspec.varkw, {})
+            # Remove first positional argument, self.
+            kwargs.pop(argspec.args[0])
+            # Remove argument "name", as it is provided by get_config.
+            kwargs.pop("name", None)
+            if argspec.varargs is not None:
+                # Varargs cannot be meaningfully converted to a dictionary.
+                varargs = kwargs.pop(argspec.varargs)
+                if len(varargs) > 0:
+                    auto_config = False
+                    # Store variable to show appropriate warning in get_config.
+                    instance._auto_config_error_args = True
 
         # For safety, we only rely on auto-configs for a small set of
         # serializable types.
         supported_types = (str, int, float, bool, type(None))
         try:
             flat_arg_values = tree.flatten(kwargs)
-            auto_config = True
             for value in flat_arg_values:
                 if not isinstance(value, supported_types):
                     auto_config = False
@@ -193,30 +232,52 @@ class Operation(KerasSaveable):
             config.pop("name", None)
             return config
         else:
-            [22 removed lines not captured in this rendering]
+            example_str = """
+            class CustomLayer(keras.layers.Layer):
+                def __init__(self, arg1, arg2, **kwargs):
+                    super().__init__(**kwargs)
+                    self.arg1 = arg1
+                    self.arg2 = arg2
+
+                def get_config(self):
+                    config = super().get_config()
+                    config.update({
+                        "arg1": self.arg1,
+                        "arg2": self.arg2,
+                    })
+                    return config
+            """
+            if getattr(self, "_auto_config_error_args", False):
+                raise NotImplementedError(
+                    textwrap.dedent(
+                        f"""
+                        Object {self.__class__.__name__} was created by passing
+                        positional only or variadic positional arguments (e.g.,
+                        `*args`) to `__init__()`, which is not supported by the
+                        automatic config generation. Please remove all positional
+                        only and variadic arguments from `__init__()`
+                        or override `get_config()` and `from_config()` to make
+                        the object serializatble.
+
+                        Example:
+
+                        {example_str}"""
+                    )
+                )
+            else:
+                raise NotImplementedError(
+                    textwrap.dedent(
+                        f"""
+                        Object {self.__class__.__name__} was created by passing
+                        non-serializable argument values in `__init__()`,
+                        and therefore the object must override `get_config()` in
+                        order to be serializable. Please implement `get_config()`.
+
+                        Example:
+
+                        {example_str}"""
+                    )
                 )
-            )
 
     @classmethod
     def from_config(cls, config):
keras/src/ops/operation_utils.py
CHANGED
[hunks for this file were not captured in this rendering]

keras/src/optimizers/adafactor.py
CHANGED
@@ -158,33 +158,52 @@ class Adafactor(optimizer.Optimizer):
         rho_t = ops.minimum(lr, 1 / ops.sqrt(local_step))
         alpha_t = ops.maximum(epsilon_2, self._rms(variable)) * rho_t
         regulated_grad_square = ops.add(ops.square(gradient), self.epsilon_1)
-        beta_2_t = 1
+        beta_2_t = ops.subtract(1, ops.power(local_step, self.beta_2_decay))
 
         if len(variable.shape) >= 2:
             # `r` deletes the last dimension of gradient, so it is of shape
             # `gradient.shape[:-1]`.
             self.assign(
                 r,
-                [2 removed lines not captured in this rendering]
+                ops.add(
+                    ops.multiply(beta_2_t, r),
+                    ops.multiply(
+                        ops.subtract(1, beta_2_t),
+                        ops.mean(regulated_grad_square, axis=-1),
+                    ),
+                ),
             )
             # `c` deletes the second last dimension of gradient, so it is of
             # shape `gradient.shape[:-2] + gradient.shape[-1]`.
             self.assign(
                 c,
-                [2 removed lines not captured in this rendering]
+                ops.add(
+                    ops.multiply(beta_2_t, c),
+                    ops.multiply(
+                        ops.subtract(1, beta_2_t),
+                        ops.mean(regulated_grad_square, axis=-2),
+                    ),
+                ),
             )
             self.assign(
                 v,
-                ops.
-                [3 removed lines not captured in this rendering]
+                ops.multiply(
+                    ops.expand_dims(
+                        ops.divide(r, ops.mean(r, axis=-1, keepdims=True)),
+                        axis=-1,
+                    ),
+                    ops.expand_dims(c, -2),
+                ),
             )
         else:
             self.assign(
-                v,
+                v,
+                ops.add(
+                    ops.multiply(beta_2_t, v),
+                    ops.multiply(
+                        ops.subtract(1, beta_2_t), regulated_grad_square
+                    ),
+                ),
             )
 
         u_t = ops.divide(gradient, ops.sqrt(v))
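
To make the factored second-moment update above concrete, here is a NumPy sketch of the same arithmetic. The shapes, seed, and the `beta_2_t` value are illustrative; the real optimizer derives `beta_2_t` from the step count and writes the results back through `self.assign`.

import numpy as np

# Illustrative shapes and values; the real code operates on Keras variables.
rng = np.random.default_rng(0)
gradient = rng.normal(size=(8, 4)).astype("float32")
r = np.zeros(8, dtype="float32")   # row statistic, shape gradient.shape[:-1]
c = np.zeros(4, dtype="float32")   # column statistic, drops the second-to-last dim
epsilon_1 = 1e-30
beta_2_t = 0.95  # in the hunk: 1 - local_step ** beta_2_decay

regulated_grad_square = np.square(gradient) + epsilon_1
r = beta_2_t * r + (1 - beta_2_t) * regulated_grad_square.mean(axis=-1)
c = beta_2_t * c + (1 - beta_2_t) * regulated_grad_square.mean(axis=-2)
# Rank-1 reconstruction of the full second moment from the two factors.
v = (r / r.mean(axis=-1, keepdims=True))[..., None] * c[None, :]
u_t = gradient / np.sqrt(v)
print(u_t.shape)  # (8, 4)

Storing only `r` and `c` instead of the full matrix `v` is what gives Adafactor its sub-linear memory footprint for large weight matrices.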
keras/src/optimizers/base_optimizer.py
CHANGED
@@ -631,6 +631,20 @@ class BaseOptimizer(KerasSaveable):
             g_acc.assign(n_g_acc)
 
     def stateless_apply(self, optimizer_variables, grads, trainable_variables):
+        """Stateless version of `apply` that returns modified variables.
+
+        Args:
+            optimizer_variables: list of tensors containing the current values
+                for the optimizer variables. These are native tensors and not
+                `keras.Variable`s.
+            grads: list of gradients to apply.
+            trainable_variables: list of tensors containing the current values
+                for the model variables. These are native tensors and not
+                `keras.Variable`s.
+
+        Returns: A tuple containing two list of tensors, the updated
+            `trainable_variables` and the updated `optimizer_variables`.
+        """
         self._check_super_called()
 
         if not self.built:
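
The newly documented `stateless_apply` can be exercised directly. A minimal usage sketch, assuming an `SGD` inner update and illustrative values; the printed numbers follow from the learning rate chosen here, not from anything in the diff.

import keras
import numpy as np

optimizer = keras.optimizers.SGD(learning_rate=0.1)
w = keras.Variable(np.ones((3,), dtype="float32"))
optimizer.build([w])

grads = [np.full((3,), 0.5, dtype="float32")]
trainable = [w.value]                          # native tensors, not Variables
opt_vars = [v.value for v in optimizer.variables]

new_trainable, new_opt_vars = optimizer.stateless_apply(
    opt_vars, grads, trainable
)
print(np.asarray(new_trainable[0]))  # approximately [0.95 0.95 0.95]

Nothing is mutated in place: the caller is responsible for feeding the returned `new_trainable` and `new_opt_vars` back into the next step, which is what makes this path usable under JAX-style functional training loops.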
@@ -969,10 +983,15 @@ class BaseOptimizer(KerasSaveable):
     ):
         if average is not None:
             not_first_step = ops.not_equal(self.iterations, 0)
-            momentum = (
-                ops.cast(not_first_step, var.dtype)
+            momentum = ops.multiply(
+                ops.cast(not_first_step, var.dtype), self.ema_momentum
+            )
+            average.assign(
+                ops.add(
+                    ops.multiply(momentum, average),
+                    ops.multiply(ops.subtract(1, momentum), var),
+                )
             )
-            average.assign(momentum * average + (1 - momentum) * var)
 
     def _overwrite_model_variables_with_average_value(
         self, trainable_variables
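
The hunk above only rewrites the EMA update with explicit `ops` calls; the arithmetic itself is unchanged. A NumPy sketch of that arithmetic, with illustrative values:

import numpy as np

# On the first step (iterations == 0) the momentum factor is zeroed, so the
# average is simply overwritten with the variable; afterwards it is the usual
# exponential moving average.
ema_momentum = 0.99
var = np.array([1.0, 2.0], dtype="float32")
average = np.zeros_like(var)

for iterations in range(3):
    not_first_step = float(iterations != 0)
    momentum = not_first_step * ema_momentum
    average = momentum * average + (1.0 - momentum) * var

print(average)  # stays at [1. 2.] because `var` is constant here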
keras/src/optimizers/loss_scale_optimizer.py
CHANGED
@@ -48,6 +48,7 @@ class LossScaleOptimizer(optimizer.Optimizer):
         inner_optimizer,
         initial_scale=2.0**15,
         dynamic_growth_steps=2000,
+        name=None,
         **kwargs,
     ):
         if not kwargs.pop("dynamic", True):
@@ -56,7 +57,42 @@ class LossScaleOptimizer(optimizer.Optimizer):
                 "Instead, simply set `loss_scale_factor` directly on the "
                 "`inner_optimizer`."
             )
-        [removed line not captured in this rendering]
+
+        # Backwards compatibility code for deserialization.
+        # LossScaleOptimizer used to return all these parameters in `get_config`
+        # from `super.get_config` even though they are all non-functional. We
+        # no longer let user set them, but we have to allow the default values
+        # to be passed during deserialization to support older models.
+        base_optimizer_defaults = {
+            "weight_decay": None,
+            "clipnorm": None,
+            "global_clipnorm": None,
+            "clipvalue": None,
+            "use_ema": False,
+            "ema_momentum": 0.99,
+            "ema_overwrite_frequency": None,
+            "loss_scale_factor": None,
+            "gradient_accumulation_steps": None,
+        }
+        for arg_name, default_value in base_optimizer_defaults.items():
+            if arg_name not in kwargs:
+                continue
+            arg_value = kwargs.pop(arg_name)
+            if (
+                default_value is None and arg_value is not None
+            ) or arg_value != default_value:
+                raise ValueError(
+                    f"LossScaleOptimizer does not support `{arg_name}`. "
+                    f"Instead, set `{arg_name}` on the `inner_optimizer`."
+                )
+
+        if kwargs:
+            raise ValueError(
+                "LossScaleOptimizer does not support arguments: "
+                f"`{'`, `'.join(kwargs.keys())}`."
+            )
+
+        super().__init__(learning_rate=0.0, name=name)
         self.inner_optimizer = inner_optimizer
         self.initial_scale = initial_scale
         self.dynamic_growth_steps = dynamic_growth_steps
@@ -81,7 +117,7 @@ class LossScaleOptimizer(optimizer.Optimizer):
             name="dynamic_scale",
         )
         self.inner_optimizer.build(var_list)
-        [removed line not captured in this rendering]
+        super().build(var_list)
 
     @property
     def variables(self):
@@ -112,7 +148,7 @@ class LossScaleOptimizer(optimizer.Optimizer):
             mapping = list(zip(self.variables, optimizer_variables))
             with backend.StatelessScope(state_mapping=mapping) as scope:
                 self.step_counter.assign(0)
-                self.dynamic_scale.assign(self.dynamic_scale
+                self.dynamic_scale.assign(ops.multiply(self.dynamic_scale, 2.0))
             return [scope.get_current_value(v) for v in self._variables]
 
         def increment():
@@ -136,7 +172,7 @@ class LossScaleOptimizer(optimizer.Optimizer):
             g
             if g is None or self._overwrite_variable_with_gradient(v)
            else ops.divide(g, scale)
-            for g, v in zip(grads,
+            for g, v in zip(grads, self._trainable_variables)
        ]
        (
            new_trainable_variables,
@@ -156,7 +192,7 @@ class LossScaleOptimizer(optimizer.Optimizer):
            mapping = list(zip(self.variables, optimizer_variables))
            with backend.StatelessScope(state_mapping=mapping) as scope:
                self.step_counter.assign(0)
-                self.dynamic_scale.assign(self.dynamic_scale
+                self.dynamic_scale.assign(ops.multiply(self.dynamic_scale, 0.5))
            new_optimizer_variables = []
            for v in self.variables:
                new_optimizer_variables.append(scope.get_current_value(v))
@@ -190,7 +226,7 @@ class LossScaleOptimizer(optimizer.Optimizer):
 
        def upscale():
            self.step_counter.assign(0)
-            self.dynamic_scale.assign(self.dynamic_scale
+            self.dynamic_scale.assign(ops.multiply(self.dynamic_scale, 2.0))
 
        def increment():
            self.step_counter.assign_add(1)
@@ -205,7 +241,7 @@ class LossScaleOptimizer(optimizer.Optimizer):
    def _stateful_handle_non_finite_grads(self):
        # If any inf or nan in grads, downscale loss and reset counter.
        self.step_counter.assign(0)
-        self.dynamic_scale.assign(self.dynamic_scale
+        self.dynamic_scale.assign(ops.multiply(self.dynamic_scale, 0.5))
 
    def _common_apply(self, grads, trainable_variables=None):
        finite = self.check_finite(grads)
@@ -278,25 +314,22 @@ class LossScaleOptimizer(optimizer.Optimizer):
 
    def scale_loss(self, loss):
        scale = self.dynamic_scale if self.built else self.initial_scale
-        return loss
+        return ops.multiply(loss, scale)
 
    def finalize_variable_values(self, var_list):
        self.inner_optimizer.finalize_variable_values(var_list)
 
    def get_config(self):
-        [removed line not captured in this rendering]
+        # Do not use super().get_config() as only "name" is supported.
        inner_optimizer_config = serialization_lib.serialize_keras_object(
            self.inner_optimizer
        )
-        [6 removed lines not captured in this rendering]
-        )
-        del config["learning_rate"]
-        return config
+        return {
+            "name": self.name,
+            "inner_optimizer": inner_optimizer_config,
+            "initial_scale": self.initial_scale,
+            "dynamic_growth_steps": self.dynamic_growth_steps,
+        }
 
    @classmethod
    def from_config(cls, config, custom_objects=None):
keras/src/optimizers/muon.py
CHANGED
@@ -20,7 +20,7 @@ class Muon(optimizer.Optimizer):
     The Muon optimizer can use both the Muon update step or the
     AdamW update step based on the following:
 
-    - For any variable that isn't 2D,
+    - For any variable that isn't 2D, the AdamW step
       will be used. This is not configurable.
     - If the argument `exclude_embeddings` (defaults to `True`) is set
       to `True`, the AdamW step will be used.
@@ -46,10 +46,12 @@ class Muon(optimizer.Optimizer):
            that takes no arguments and returns the actual value to use.
            The exponential decay rate for the 1st moment estimates. Defaults to
            `0.9`.
-        adam_beta_2: A float value or a constant float tensor,
+        adam_beta_2: A float value or a constant float tensor, or a callable
            that takes no arguments and returns the actual value to use.
            The exponential decay rate for the 2nd moment estimates. Defaults to
            `0.999`.
+        adam_weight_decay: Float. If set, weight decay is applied when using
+            the Adam optimizer.
        epsilon: A small constant for numerical stability. This is
            "epsilon hat" in the Kingma and Ba paper
            (in the formula just before Section 2.1),
@@ -67,11 +69,15 @@ class Muon(optimizer.Optimizer):
            It is recommended to use the default value
        adam_lr_ratio: Float, the ratio of the learning rate when
            using Adam to the main learning rate.
-            [removed line not captured in this rendering]
+            It is recommended to set it to 1
        momentum: Float, momentum used by internal SGD.
        ns_steps: Integer, number of Newton-Schulz iterations to run.
        nesterov: Boolean, whether to use Nesterov-style momentum
        {{base_optimizer_keyword_args}}
+        rms_rate: Float. A parameter from https://arxiv.org/abs/2502.16982
+            that can enhance the stability of Muon, allowing it to use the
+            same learning rate and weight decay as Adam. Defaults to `0.2`.
+            Set to `None` to disable this feature.
    """
 
    def __init__(
@@ -79,8 +85,9 @@ class Muon(optimizer.Optimizer):
        learning_rate=0.001,
        adam_beta_1=0.9,
        adam_beta_2=0.999,
+        adam_weight_decay=0.004,
        epsilon=1e-7,
-        weight_decay=0.
+        weight_decay=0.004,
        clipnorm=None,
        clipvalue=None,
        global_clipnorm=None,
@@ -95,10 +102,11 @@ class Muon(optimizer.Optimizer):
        muon_a=3.4445,
        muon_b=-4.7750,
        muon_c=2.0315,
-        adam_lr_ratio=
+        adam_lr_ratio=1,
        momentum=0.95,
-        ns_steps=
+        ns_steps=5,
        nesterov=True,
+        rms_rate=0.2,
        **kwargs,
    ):
        super().__init__(
@@ -127,12 +135,13 @@ class Muon(optimizer.Optimizer):
        self.nesterov = nesterov
        self.exclude_embeddings = exclude_embeddings
        self.exclude_layers = exclude_layers or []
+        self.adam_weight_decay = adam_weight_decay
+        self.rms_rate = rms_rate
 
    def _should_use_adamw(self, variable):
-        # To use it with 4D convolutional filters,
        # it works well to just flatten their last 3 dimensions.
        # any {0,1}-D parameters should all be optimized by adam
-        if
+        if len(variable.shape) != 2:
            return True
        if self.exclude_embeddings and "embedding" in variable.path.lower():
            return True
@@ -153,52 +162,50 @@ class Muon(optimizer.Optimizer):
        if self.built:
            return
        super().build(var_list)
-        [removed line not captured in this rendering]
-        self.
-        [removed line not captured in this rendering]
-        self.
-        self.muon_velocities = {}
+        # Momentums are for both Muon and Adam
+        self.momentums = [None] * len(var_list)
+        # Velocities are just for Adam
+        self.adam_velocities = [None] * len(var_list)
 
        for var in var_list:
            if not self._overwrite_variable_with_gradient(var):
-                self.
+                self.momentums[self._get_variable_index(var)] = (
                    self.add_variable_from_reference(
                        reference_variable=var, name="momentum"
                    )
                )
                if self._should_use_adamw(var):
-                    self.adam_velocities[var
+                    self.adam_velocities[self._get_variable_index(var)] = (
                        self.add_variable_from_reference(
                            reference_variable=var, name="velocity"
                        )
                    )
 
    def update_step(self, gradient, variable, learning_rate):
-        [removed line not captured in this rendering]
+        variable_index = self._get_variable_index(variable)
+        m = self.momentums[variable_index]
+        v = self.adam_velocities[variable_index]
+
+        # The presence of the velocity tells us that this variable is for Adam
+        if v is not None:
            # It should be noted that lr is one-tenth when using adamw.
            self._adamw_update_step(
-                gradient, variable, learning_rate * self.adam_lr_ratio
+                gradient, variable, learning_rate * self.adam_lr_ratio, m, v
            )
        else:
-            self._muon_update_step(gradient, variable, learning_rate)
+            self._muon_update_step(gradient, variable, learning_rate, m)
 
-    def _muon_update_step(self, gradient, variable, lr):
-        m = self.adam_momentums[variable.path]
+    def _muon_update_step(self, gradient, variable, lr, m):
        self.assign_add(m, ops.add(gradient, m * (self.momentum - 1)))
-        shape = variable.shape
        if self.nesterov:
            g = ops.add(gradient, self.momentum * m)
        else:
            g = m
+        update = self.zeropower_via_newtonschulz5(g, self.ns_steps)
 
-        self.assign_sub(
-            variable,
-            lr
-            * self.zeropower_via_newtonschulz5(g, self.ns_steps)
-            * max(1, shape[0] / shape[1]) ** 0.5,
-        )
+        self.assign_sub(variable, self.lr_adjust(lr * update))
 
-    def _adamw_update_step(self, gradient, variable, learning_rate):
+    def _adamw_update_step(self, gradient, variable, learning_rate, m, v):
        """Update step given gradient and the associated model variable."""
        lr = ops.cast(learning_rate, variable.dtype)
        gradient = ops.cast(gradient, variable.dtype)
@@ -210,9 +217,6 @@ class Muon(optimizer.Optimizer):
            ops.cast(self.adam_beta_2, variable.dtype), local_step
        )
 
-        m = self.adam_momentums[variable.path]
-        v = self.adam_velocities[variable.path]
-
        alpha = lr * ops.sqrt(1 - adam_beta_2_power) / (1 - adam_beta_1_power)
 
        self.assign_add(
@@ -239,6 +243,20 @@ class Muon(optimizer.Optimizer):
        X = ops.transpose(X, temp_order)
        return X
 
+    def lr_adjust(self, x):
+        """Adjusts learning rate based on the Moonlight implementation.
+        This method enhances the stability of Muon, allowing it to use the same
+        learning rate and weight decay as Adam. For details, see
+        https://arxiv.org/abs/2502.16982.
+        For a 2D matrix, the update is scaled by `sqrt(max(n, m)) * rms_rate`,
+        where `n` and `m` are the dimensions of the matrix.
+        """
+        if self.rms_rate is None:
+            return x
+        # moonlight version
+        # https://github.com/MoonshotAI/Moonlight/blob/master/examples/toy_train.py
+        return x * ops.sqrt(ops.maximum(x.shape[0], x.shape[1])) * self.rms_rate
+
    def zeropower_via_newtonschulz5(self, x, steps: int):
        """We apply the Newton-Schulz iteration to compute matrix G.
 
@@ -268,6 +286,20 @@ class Muon(optimizer.Optimizer):
        x = self.transpose_last_axis(x)
        return x
 
+    def _apply_weight_decay(self, variables):
+        for variable in variables:
+            if not self._use_weight_decay(variable):
+                continue
+            if self._should_use_adamw(variable):
+                weight_decay_value = self.adam_weight_decay
+            else:
+                weight_decay_value = self.weight_decay
+            if weight_decay_value is None:
+                continue
+            wd = ops.cast(weight_decay_value, variable.dtype)
+            lr = ops.cast(self.learning_rate, variable.dtype)
+            variable.assign(variable - variable * wd * lr)
+
    def get_config(self):
        config = super().get_config()
        config.update(
@@ -284,6 +316,8 @@ class Muon(optimizer.Optimizer):
                "ns_steps": self.ns_steps,
                "nesterov": self.nesterov,
                "exclude_embeddings": self.exclude_embeddings,
+                "adam_weight_decay": self.adam_weight_decay,
+                "rms_rate": self.rms_rate,
            }
        )
        return config
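
To illustrate the effect of the new `rms_rate` scaling in `lr_adjust`, here is a small NumPy sketch; the matrix shape, seed, and learning rate are illustrative, and `update` stands in for the output of `zeropower_via_newtonschulz5`.

import numpy as np

rms_rate = 0.2
lr = 0.02
n, m = 1024, 256

# `update` stands in for the orthogonalized gradient returned by the
# Newton-Schulz iteration in the hunk above.
rng = np.random.default_rng(0)
update = rng.normal(size=(n, m)).astype("float32")

# After orthogonalization, the update for an (n, m) matrix is rescaled by
# sqrt(max(n, m)) * rms_rate; with rms_rate=None the scaling is skipped.
adjusted = lr * update * np.sqrt(max(n, m)) * rms_rate
print(adjusted.shape, float(np.sqrt(max(n, m)) * rms_rate))  # (1024, 256) 6.4

This replaces the previous `max(1, n / m) ** 0.5` factor and, per the linked Moonlight write-up, lets Muon reuse Adam-style learning rates and weight decay.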
keras/src/optimizers/schedules/learning_rate_schedule.py
CHANGED
@@ -584,9 +584,10 @@ class CosineDecay(LearningRateSchedule):
     schedule applies a linear increase per optimizer step to our learning rate
     from `initial_learning_rate` to `warmup_target` for a duration of
     `warmup_steps`. Afterwards, it applies a cosine decay function taking our
-    learning rate from `warmup_target` to `alpha` for a
-    `decay_steps`. If `warmup_target` is None we skip warmup and
-    will take our learning rate from `initial_learning_rate` to
+    learning rate from `warmup_target` to `warmup_target * alpha` for a
+    duration of `decay_steps`. If `warmup_target` is None we skip warmup and
+    our decay will take our learning rate from `initial_learning_rate` to
+    `initial_learning_rate * alpha`.
     It requires a `step` value to compute the learning rate. You can
     just pass a backend variable that you increment at each training step.
 
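
The corrected wording can be checked against the schedule itself. A small sketch with illustrative argument values, assuming the standard `keras.optimizers.schedules.CosineDecay` API; the commented outputs follow from the docstring's formula.

import keras

schedule = keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=0.0,
    decay_steps=1000,
    alpha=0.1,
    warmup_target=1.0,
    warmup_steps=100,
)
print(float(schedule(0)))     # ~0.0  (start of linear warmup)
print(float(schedule(100)))   # ~1.0  (end of warmup, start of cosine decay)
print(float(schedule(1100)))  # ~0.1  (warmup_target * alpha)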
keras/src/quantizers/__init__.py
CHANGED
@@ -1,6 +1,11 @@
 import inspect
 
 from keras.src.api_export import keras_export
+from keras.src.quantizers.awq_config import AWQConfig
+from keras.src.quantizers.quantization_config import Float8QuantizationConfig
+from keras.src.quantizers.quantization_config import Int4QuantizationConfig
+from keras.src.quantizers.quantization_config import Int8QuantizationConfig
+from keras.src.quantizers.quantization_config import QuantizationConfig
 from keras.src.quantizers.quantizers import AbsMaxQuantizer
 from keras.src.quantizers.quantizers import Quantizer
 from keras.src.quantizers.quantizers import abs_max_quantize
@@ -13,7 +18,15 @@ from keras.src.quantizers.quantizers import unpack_int4
 from keras.src.saving import serialization_lib
 from keras.src.utils.naming import to_snake_case
 
-ALL_OBJECTS = {
+ALL_OBJECTS = {
+    Quantizer,
+    AbsMaxQuantizer,
+    QuantizationConfig,
+    Int8QuantizationConfig,
+    Int4QuantizationConfig,
+    Float8QuantizationConfig,
+    AWQConfig,
+}
 ALL_OBJECTS_DICT = {cls.__name__: cls for cls in ALL_OBJECTS}
 ALL_OBJECTS_DICT.update(
     {to_snake_case(cls.__name__): cls for cls in ALL_OBJECTS}