keras-nightly 3.12.0.dev2025100503__py3-none-any.whl → 3.14.0.dev2026011604__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras/__init__.py +1 -0
- keras/_tf_keras/keras/__init__.py +1 -0
- keras/_tf_keras/keras/callbacks/__init__.py +3 -0
- keras/_tf_keras/keras/distillation/__init__.py +16 -0
- keras/_tf_keras/keras/distribution/__init__.py +3 -0
- keras/_tf_keras/keras/dtype_policies/__init__.py +3 -0
- keras/_tf_keras/keras/layers/__init__.py +21 -0
- keras/_tf_keras/keras/ops/__init__.py +13 -0
- keras/_tf_keras/keras/ops/image/__init__.py +1 -0
- keras/_tf_keras/keras/ops/linalg/__init__.py +1 -0
- keras/_tf_keras/keras/ops/nn/__init__.py +3 -0
- keras/_tf_keras/keras/ops/numpy/__init__.py +9 -0
- keras/_tf_keras/keras/quantizers/__init__.py +13 -0
- keras/callbacks/__init__.py +3 -0
- keras/distillation/__init__.py +16 -0
- keras/distribution/__init__.py +3 -0
- keras/dtype_policies/__init__.py +3 -0
- keras/layers/__init__.py +21 -0
- keras/ops/__init__.py +13 -0
- keras/ops/image/__init__.py +1 -0
- keras/ops/linalg/__init__.py +1 -0
- keras/ops/nn/__init__.py +3 -0
- keras/ops/numpy/__init__.py +9 -0
- keras/quantizers/__init__.py +13 -0
- keras/src/applications/imagenet_utils.py +4 -1
- keras/src/backend/common/backend_utils.py +30 -6
- keras/src/backend/common/name_scope.py +2 -1
- keras/src/backend/common/variables.py +30 -15
- keras/src/backend/jax/core.py +92 -3
- keras/src/backend/jax/distribution_lib.py +16 -2
- keras/src/backend/jax/linalg.py +4 -0
- keras/src/backend/jax/nn.py +509 -29
- keras/src/backend/jax/numpy.py +59 -8
- keras/src/backend/jax/trainer.py +14 -2
- keras/src/backend/numpy/linalg.py +4 -0
- keras/src/backend/numpy/nn.py +311 -1
- keras/src/backend/numpy/numpy.py +65 -2
- keras/src/backend/openvino/__init__.py +1 -0
- keras/src/backend/openvino/core.py +2 -23
- keras/src/backend/openvino/linalg.py +4 -0
- keras/src/backend/openvino/nn.py +271 -20
- keras/src/backend/openvino/numpy.py +943 -189
- keras/src/backend/tensorflow/layer.py +43 -9
- keras/src/backend/tensorflow/linalg.py +24 -0
- keras/src/backend/tensorflow/nn.py +545 -1
- keras/src/backend/tensorflow/numpy.py +250 -50
- keras/src/backend/torch/core.py +3 -1
- keras/src/backend/torch/linalg.py +4 -0
- keras/src/backend/torch/nn.py +125 -0
- keras/src/backend/torch/numpy.py +80 -2
- keras/src/callbacks/__init__.py +1 -0
- keras/src/callbacks/model_checkpoint.py +5 -0
- keras/src/callbacks/orbax_checkpoint.py +332 -0
- keras/src/callbacks/terminate_on_nan.py +54 -5
- keras/src/datasets/cifar10.py +5 -0
- keras/src/distillation/__init__.py +1 -0
- keras/src/distillation/distillation_loss.py +390 -0
- keras/src/distillation/distiller.py +598 -0
- keras/src/distribution/distribution_lib.py +14 -0
- keras/src/dtype_policies/__init__.py +2 -0
- keras/src/dtype_policies/dtype_policy.py +90 -1
- keras/src/export/__init__.py +2 -0
- keras/src/export/export_utils.py +39 -2
- keras/src/export/litert.py +248 -0
- keras/src/export/openvino.py +1 -1
- keras/src/export/tf2onnx_lib.py +3 -0
- keras/src/layers/__init__.py +13 -0
- keras/src/layers/activations/softmax.py +9 -4
- keras/src/layers/attention/multi_head_attention.py +4 -1
- keras/src/layers/core/dense.py +241 -111
- keras/src/layers/core/einsum_dense.py +316 -131
- keras/src/layers/core/embedding.py +84 -94
- keras/src/layers/core/input_layer.py +1 -0
- keras/src/layers/core/reversible_embedding.py +399 -0
- keras/src/layers/input_spec.py +17 -17
- keras/src/layers/layer.py +45 -15
- keras/src/layers/merging/dot.py +4 -1
- keras/src/layers/pooling/adaptive_average_pooling1d.py +65 -0
- keras/src/layers/pooling/adaptive_average_pooling2d.py +62 -0
- keras/src/layers/pooling/adaptive_average_pooling3d.py +63 -0
- keras/src/layers/pooling/adaptive_max_pooling1d.py +65 -0
- keras/src/layers/pooling/adaptive_max_pooling2d.py +62 -0
- keras/src/layers/pooling/adaptive_max_pooling3d.py +63 -0
- keras/src/layers/pooling/base_adaptive_pooling.py +63 -0
- keras/src/layers/preprocessing/discretization.py +6 -5
- keras/src/layers/preprocessing/feature_space.py +8 -4
- keras/src/layers/preprocessing/image_preprocessing/aug_mix.py +2 -2
- keras/src/layers/preprocessing/image_preprocessing/random_contrast.py +3 -3
- keras/src/layers/preprocessing/image_preprocessing/resizing.py +10 -0
- keras/src/layers/preprocessing/index_lookup.py +19 -1
- keras/src/layers/preprocessing/normalization.py +14 -1
- keras/src/layers/regularization/dropout.py +43 -1
- keras/src/layers/rnn/rnn.py +19 -0
- keras/src/losses/loss.py +1 -1
- keras/src/losses/losses.py +24 -0
- keras/src/metrics/confusion_metrics.py +7 -6
- keras/src/models/cloning.py +4 -0
- keras/src/models/functional.py +11 -3
- keras/src/models/model.py +172 -34
- keras/src/ops/image.py +257 -20
- keras/src/ops/linalg.py +93 -0
- keras/src/ops/nn.py +258 -0
- keras/src/ops/numpy.py +569 -36
- keras/src/optimizers/muon.py +65 -31
- keras/src/optimizers/schedules/learning_rate_schedule.py +4 -3
- keras/src/quantizers/__init__.py +14 -1
- keras/src/quantizers/awq.py +361 -0
- keras/src/quantizers/awq_config.py +140 -0
- keras/src/quantizers/awq_core.py +217 -0
- keras/src/quantizers/gptq.py +2 -8
- keras/src/quantizers/gptq_config.py +36 -1
- keras/src/quantizers/gptq_core.py +65 -79
- keras/src/quantizers/quantization_config.py +246 -0
- keras/src/quantizers/quantizers.py +127 -61
- keras/src/quantizers/utils.py +23 -0
- keras/src/random/seed_generator.py +6 -4
- keras/src/saving/file_editor.py +81 -6
- keras/src/saving/orbax_util.py +26 -0
- keras/src/saving/saving_api.py +37 -14
- keras/src/saving/saving_lib.py +1 -1
- keras/src/testing/__init__.py +1 -0
- keras/src/testing/test_case.py +45 -5
- keras/src/utils/backend_utils.py +31 -4
- keras/src/utils/dataset_utils.py +234 -35
- keras/src/utils/file_utils.py +49 -11
- keras/src/utils/image_utils.py +14 -2
- keras/src/utils/jax_layer.py +244 -55
- keras/src/utils/module_utils.py +29 -0
- keras/src/utils/progbar.py +10 -2
- keras/src/utils/rng_utils.py +9 -1
- keras/src/utils/tracking.py +5 -5
- keras/src/version.py +1 -1
- {keras_nightly-3.12.0.dev2025100503.dist-info → keras_nightly-3.14.0.dev2026011604.dist-info}/METADATA +16 -6
- {keras_nightly-3.12.0.dev2025100503.dist-info → keras_nightly-3.14.0.dev2026011604.dist-info}/RECORD +136 -115
- {keras_nightly-3.12.0.dev2025100503.dist-info → keras_nightly-3.14.0.dev2026011604.dist-info}/WHEEL +0 -0
- {keras_nightly-3.12.0.dev2025100503.dist-info → keras_nightly-3.14.0.dev2026011604.dist-info}/top_level.txt +0 -0
keras/src/backend/jax/nn.py
CHANGED
@@ -16,6 +16,9 @@ from jax.experimental.pallas.ops.tpu.splash_attention import (
 )
 
 from keras.src import backend
+from keras.src.backend.common.backend_utils import (
+    compute_adaptive_pooling_window_sizes,
+)
 from keras.src.backend.common.backend_utils import (
     compute_conv_transpose_padding_args_for_jax,
 )
@@ -289,6 +292,403 @@ def average_pool(
     return pooled / window_counts
 
 
+def _compute_adaptive_pooling_gather_indices(
+    input_dim, output_size, big_window
+):
+    """Compute gather indices for Two-Pool Gather method."""
+    window_starts = jnp.floor(
+        (jnp.arange(output_size) * input_dim) / output_size
+    ).astype(jnp.int32)
+
+    window_ends = jnp.ceil(
+        (jnp.arange(1, output_size + 1) * input_dim) / output_size
+    ).astype(jnp.int32)
+
+    window_sizes = window_ends - window_starts
+    is_big = window_sizes == big_window
+
+    small_window = big_window - 1
+    small_len = input_dim - small_window + 1
+
+    small_indices = window_starts
+    big_indices = window_starts + small_len
+
+    gather = jnp.where(is_big, big_indices, small_indices)
+    return gather.astype(jnp.int32)
+
+
+def _adaptive_average_pool1d(inputs, output_size, data_format="channels_first"):
+    if isinstance(output_size, int):
+        output_size = (output_size,)
+
+    if data_format == "channels_first":
+        inputs = jnp.transpose(inputs, (0, 2, 1))  # NCL → NLC
+
+    n, l, c = inputs.shape
+    out_l = output_size[0]
+
+    small, big = compute_adaptive_pooling_window_sizes(l, out_l)
+    gather = _compute_adaptive_pooling_gather_indices(l, out_l, big)
+
+    small_pool = (
+        lax.reduce_window(
+            inputs, 0.0, lax.add, (1, small, 1), (1, 1, 1), "valid"
+        )
+        / small
+    )
+
+    big_pool = (
+        lax.reduce_window(inputs, 0.0, lax.add, (1, big, 1), (1, 1, 1), "valid")
+        / big
+    )
+
+    combined = jnp.concatenate([small_pool, big_pool], axis=1)
+    out = jnp.take(combined, gather, axis=1)
+
+    if data_format == "channels_first":
+        out = jnp.transpose(out, (0, 2, 1))
+
+    return out
+
+
+def _adaptive_max_pool1d(inputs, output_size, data_format="channels_first"):
+    if isinstance(output_size, int):
+        output_size = (output_size,)
+
+    if data_format == "channels_first":
+        inputs = jnp.transpose(inputs, (0, 2, 1))
+
+    n, l, c = inputs.shape
+    out_l = output_size[0]
+
+    small, big = compute_adaptive_pooling_window_sizes(l, out_l)
+    gather = _compute_adaptive_pooling_gather_indices(l, out_l, big)
+
+    small_pool = lax.reduce_window(
+        inputs, -jnp.inf, lax.max, (1, small, 1), (1, 1, 1), "valid"
+    )
+
+    big_pool = lax.reduce_window(
+        inputs, -jnp.inf, lax.max, (1, big, 1), (1, 1, 1), "valid"
+    )
+
+    combined = jnp.concatenate([small_pool, big_pool], axis=1)
+    out = jnp.take(combined, gather, axis=1)
+
+    if data_format == "channels_first":
+        out = jnp.transpose(out, (0, 2, 1))
+
+    return out
+
+
+def _adaptive_average_pool2d(inputs, output_size, data_format="channels_first"):
+    if isinstance(output_size, int):
+        output_size = (output_size, output_size)
+
+    if data_format == "channels_first":
+        inputs = jnp.transpose(inputs, (0, 2, 3, 1))
+
+    n, h, w, c = inputs.shape
+    out_h, out_w = output_size
+
+    small_h, big_h = compute_adaptive_pooling_window_sizes(h, out_h)
+    gather_h = _compute_adaptive_pooling_gather_indices(h, out_h, big_h)
+
+    small_w, big_w = compute_adaptive_pooling_window_sizes(w, out_w)
+    gather_w = _compute_adaptive_pooling_gather_indices(w, out_w, big_w)
+
+    small_h_pool = (
+        lax.reduce_window(
+            inputs, 0.0, lax.add, (1, small_h, 1, 1), (1, 1, 1, 1), "valid"
+        )
+        / small_h
+    )
+
+    big_h_pool = (
+        lax.reduce_window(
+            inputs, 0.0, lax.add, (1, big_h, 1, 1), (1, 1, 1, 1), "valid"
+        )
+        / big_h
+    )
+
+    combined_h = jnp.concatenate([small_h_pool, big_h_pool], axis=1)
+    pooled_h = jnp.take(combined_h, gather_h, axis=1)
+
+    small_w_pool = (
+        lax.reduce_window(
+            pooled_h, 0.0, lax.add, (1, 1, small_w, 1), (1, 1, 1, 1), "valid"
+        )
+        / small_w
+    )
+
+    big_w_pool = (
+        lax.reduce_window(
+            pooled_h, 0.0, lax.add, (1, 1, big_w, 1), (1, 1, 1, 1), "valid"
+        )
+        / big_w
+    )
+
+    combined_w = jnp.concatenate([small_w_pool, big_w_pool], axis=2)
+    out = jnp.take(combined_w, gather_w, axis=2)
+
+    if data_format == "channels_first":
+        out = jnp.transpose(out, (0, 3, 1, 2))
+
+    return out
+
+
+def _adaptive_max_pool2d(inputs, output_size, data_format="channels_first"):
+    if isinstance(output_size, int):
+        output_size = (output_size, output_size)
+
+    if data_format == "channels_first":
+        inputs = jnp.transpose(inputs, (0, 2, 3, 1))
+
+    n, h, w, c = inputs.shape
+    out_h, out_w = output_size
+
+    small_h, big_h = compute_adaptive_pooling_window_sizes(h, out_h)
+    gather_h = _compute_adaptive_pooling_gather_indices(h, out_h, big_h)
+
+    small_w, big_w = compute_adaptive_pooling_window_sizes(w, out_w)
+    gather_w = _compute_adaptive_pooling_gather_indices(w, out_w, big_w)
+
+    small_h_pool = lax.reduce_window(
+        inputs, -jnp.inf, lax.max, (1, small_h, 1, 1), (1, 1, 1, 1), "valid"
+    )
+
+    big_h_pool = lax.reduce_window(
+        inputs, -jnp.inf, lax.max, (1, big_h, 1, 1), (1, 1, 1, 1), "valid"
+    )
+
+    combined_h = jnp.concatenate([small_h_pool, big_h_pool], axis=1)
+    pooled_h = jnp.take(combined_h, gather_h, axis=1)
+
+    small_w_pool = lax.reduce_window(
+        pooled_h, -jnp.inf, lax.max, (1, 1, small_w, 1), (1, 1, 1, 1), "valid"
+    )
+
+    big_w_pool = lax.reduce_window(
+        pooled_h, -jnp.inf, lax.max, (1, 1, big_w, 1), (1, 1, 1, 1), "valid"
+    )
+
+    combined_w = jnp.concatenate([small_w_pool, big_w_pool], axis=2)
+    out = jnp.take(combined_w, gather_w, axis=2)
+
+    if data_format == "channels_first":
+        out = jnp.transpose(out, (0, 3, 1, 2))
+
+    return out
+
+
+def _adaptive_average_pool3d(inputs, output_size, data_format="channels_first"):
+    if isinstance(output_size, int):
+        output_size = (output_size, output_size, output_size)
+
+    if data_format == "channels_first":
+        inputs = jnp.transpose(inputs, (0, 2, 3, 4, 1))
+
+    n, d, h, w, c = inputs.shape
+    out_d, out_h, out_w = output_size
+
+    small_d, big_d = compute_adaptive_pooling_window_sizes(d, out_d)
+    gather_d = _compute_adaptive_pooling_gather_indices(d, out_d, big_d)
+
+    small_h, big_h = compute_adaptive_pooling_window_sizes(h, out_h)
+    gather_h = _compute_adaptive_pooling_gather_indices(h, out_h, big_h)
+
+    small_w, big_w = compute_adaptive_pooling_window_sizes(w, out_w)
+    gather_w = _compute_adaptive_pooling_gather_indices(w, out_w, big_w)
+
+    small_d_pool = (
+        lax.reduce_window(
+            inputs,
+            0.0,
+            lax.add,
+            (1, small_d, 1, 1, 1),
+            (1, 1, 1, 1, 1),
+            "valid",
+        )
+        / small_d
+    )
+
+    big_d_pool = (
+        lax.reduce_window(
+            inputs, 0.0, lax.add, (1, big_d, 1, 1, 1), (1, 1, 1, 1, 1), "valid"
+        )
+        / big_d
+    )
+
+    combined_d = jnp.concatenate([small_d_pool, big_d_pool], axis=1)
+    pooled_d = jnp.take(combined_d, gather_d, axis=1)
+
+    small_h_pool = (
+        lax.reduce_window(
+            pooled_d,
+            0.0,
+            lax.add,
+            (1, 1, small_h, 1, 1),
+            (1, 1, 1, 1, 1),
+            "valid",
+        )
+        / small_h
+    )
+
+    big_h_pool = (
+        lax.reduce_window(
+            pooled_d,
+            0.0,
+            lax.add,
+            (1, 1, big_h, 1, 1),
+            (1, 1, 1, 1, 1),
+            "valid",
+        )
+        / big_h
+    )
+
+    combined_h = jnp.concatenate([small_h_pool, big_h_pool], axis=2)
+    pooled_h = jnp.take(combined_h, gather_h, axis=2)
+
+    small_w_pool = (
+        lax.reduce_window(
+            pooled_h,
+            0.0,
+            lax.add,
+            (1, 1, 1, small_w, 1),
+            (1, 1, 1, 1, 1),
+            "valid",
+        )
+        / small_w
+    )
+
+    big_w_pool = (
+        lax.reduce_window(
+            pooled_h,
+            0.0,
+            lax.add,
+            (1, 1, 1, big_w, 1),
+            (1, 1, 1, 1, 1),
+            "valid",
+        )
+        / big_w
+    )
+
+    combined_w = jnp.concatenate([small_w_pool, big_w_pool], axis=3)
+    out = jnp.take(combined_w, gather_w, axis=3)
+
+    if data_format == "channels_first":
+        out = jnp.transpose(out, (0, 4, 1, 2, 3))
+
+    return out
+
+
+def _adaptive_max_pool3d(inputs, output_size, data_format="channels_first"):
+    if isinstance(output_size, int):
+        output_size = (output_size, output_size, output_size)
+
+    if data_format == "channels_first":
+        inputs = jnp.transpose(inputs, (0, 2, 3, 4, 1))
+
+    n, d, h, w, c = inputs.shape
+    out_d, out_h, out_w = output_size
+
+    small_d, big_d = compute_adaptive_pooling_window_sizes(d, out_d)
+    gather_d = _compute_adaptive_pooling_gather_indices(d, out_d, big_d)
+
+    small_h, big_h = compute_adaptive_pooling_window_sizes(h, out_h)
+    gather_h = _compute_adaptive_pooling_gather_indices(h, out_h, big_h)
+
+    small_w, big_w = compute_adaptive_pooling_window_sizes(w, out_w)
+    gather_w = _compute_adaptive_pooling_gather_indices(w, out_w, big_w)
+
+    small_d_pool = lax.reduce_window(
+        inputs,
+        -jnp.inf,
+        lax.max,
+        (1, small_d, 1, 1, 1),
+        (1, 1, 1, 1, 1),
+        "valid",
+    )
+
+    big_d_pool = lax.reduce_window(
+        inputs, -jnp.inf, lax.max, (1, big_d, 1, 1, 1), (1, 1, 1, 1, 1), "valid"
+    )
+
+    combined_d = jnp.concatenate([small_d_pool, big_d_pool], axis=1)
+    pooled_d = jnp.take(combined_d, gather_d, axis=1)
+
+    small_h_pool = lax.reduce_window(
+        pooled_d,
+        -jnp.inf,
+        lax.max,
+        (1, 1, small_h, 1, 1),
+        (1, 1, 1, 1, 1),
+        "valid",
+    )
+
+    big_h_pool = lax.reduce_window(
+        pooled_d,
+        -jnp.inf,
+        lax.max,
+        (1, 1, big_h, 1, 1),
+        (1, 1, 1, 1, 1),
+        "valid",
+    )
+
+    combined_h = jnp.concatenate([small_h_pool, big_h_pool], axis=2)
+    pooled_h = jnp.take(combined_h, gather_h, axis=2)
+
+    small_w_pool = lax.reduce_window(
+        pooled_h,
+        -jnp.inf,
+        lax.max,
+        (1, 1, 1, small_w, 1),
+        (1, 1, 1, 1, 1),
+        "valid",
+    )
+
+    big_w_pool = lax.reduce_window(
+        pooled_h,
+        -jnp.inf,
+        lax.max,
+        (1, 1, 1, big_w, 1),
+        (1, 1, 1, 1, 1),
+        "valid",
+    )
+
+    combined_w = jnp.concatenate([small_w_pool, big_w_pool], axis=3)
+    out = jnp.take(combined_w, gather_w, axis=3)
+
+    if data_format == "channels_first":
+        out = jnp.transpose(out, (0, 4, 1, 2, 3))
+
+    return out
+
+
+def adaptive_average_pool(inputs, output_size, data_format=None):
+    data_format = backend.standardize_data_format(data_format)
+    dims = inputs.ndim - 2
+    if dims == 1:
+        return _adaptive_average_pool1d(inputs, output_size, data_format)
+    if dims == 2:
+        return _adaptive_average_pool2d(inputs, output_size, data_format)
+    if dims == 3:
+        return _adaptive_average_pool3d(inputs, output_size, data_format)
+    raise ValueError("adaptive_average_pool supports only 1D/2D/3D inputs")
+
+
+def adaptive_max_pool(inputs, output_size, data_format=None):
+    data_format = backend.standardize_data_format(data_format)
+    dims = inputs.ndim - 2
+    if dims == 1:
+        return _adaptive_max_pool1d(inputs, output_size, data_format)
+    if dims == 2:
+        return _adaptive_max_pool2d(inputs, output_size, data_format)
+    if dims == 3:
+        return _adaptive_max_pool3d(inputs, output_size, data_format)
+    raise ValueError("adaptive_max_pool supports only 1D/2D/3D inputs")
+
+
 def _convert_to_lax_conv_dimension_numbers(
     num_spatial_dims,
     data_format="channels_last",
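The adaptive pooling helpers above implement a "Two-Pool Gather" scheme: for any (input_dim, output_size) pair only two window sizes ever occur (a small size and small + 1), so the kernel runs two uniform reduce_window passes and gathers one slice per output position; compute_adaptive_pooling_window_sizes, imported in the first hunk, presumably returns that (small, big) pair. A small NumPy sketch of the window arithmetic (illustrative helper name, not from the diff) shows why only two sizes appear:

    import numpy as np

    def window_bounds(input_dim, output_size):
        # Adaptive pooling window i covers [floor(i*L/out), ceil((i+1)*L/out)).
        i = np.arange(output_size)
        starts = (i * input_dim) // output_size
        ends = -(-(i + 1) * input_dim // output_size)  # ceiling division
        return starts, ends

    starts, ends = window_bounds(input_dim=5, output_size=3)
    print(starts)          # [0 1 3]
    print(ends - starts)   # [2 3 2]  -> only two window sizes ("small" and "big")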
@@ -355,7 +755,7 @@ def conv(
     feature_group_count = channels // kernel_in_channels
     kernel = convert_to_tensor(kernel)
     inputs = convert_to_tensor(inputs, dtype=kernel.dtype)
-
+    result = jax.lax.conv_general_dilated(
         inputs,
         kernel,
         strides,
@@ -364,6 +764,14 @@ def conv(
         dimension_numbers=dimension_numbers,
         feature_group_count=feature_group_count,
     )
+    if result.size == 0:
+        raise ValueError(
+            "The convolution operation resulted in an empty output. "
+            "This can happen if the input is too small for the given "
+            "kernel size, strides, dilation rate, and padding mode. "
+            "Please check the input shape and convolution parameters."
+        )
+    return result
 
 
 def depthwise_conv(
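As a quick reference for when that new guard can trigger: with "valid" padding the output length per spatial dimension follows the usual convolution arithmetic, and the result is empty as soon as any dimension is not positive. A self-contained helper (illustrative only, not part of the diff):

    def conv_output_length(input_len, kernel, stride=1, dilation=1, pad=0):
        # Standard convolution arithmetic for one spatial dimension.
        effective_kernel = dilation * (kernel - 1) + 1
        return (input_len + 2 * pad - effective_kernel) // stride + 1

    print(conv_output_length(4, kernel=3))   # 2 -> normal output
    print(conv_output_length(4, kernel=5))   # 0 -> empty output, the guard raises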
@@ -501,7 +909,7 @@ def one_hot(x, num_classes, axis=-1, dtype=None, sparse=False):
         values = jnp.greater_equal(jnp.ravel(x), 0).astype(dtype)
         values_count = values.shape[0]
         indices = [jnp.arange(dim) for dim in x.shape]
-        indices = jnp.meshgrid(*indices, indexing="ij")
+        indices = list(jnp.meshgrid(*indices, indexing="ij"))
         indices.insert(axis, jnp.maximum(x, 0))  # Deal with negative indices
         indices = [a.reshape(values_count, 1).astype("int32") for a in indices]
         indices = jnp.concatenate(indices, axis=1)
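The one-line one_hot change appears to exist because the sequence returned by jnp.meshgrid is not guaranteed to be a mutable list across JAX versions, and the very next line calls .insert(...) on it; wrapping it in list() keeps that mutation valid either way. A tiny hedged illustration of the pattern:

    import jax.numpy as jnp

    grids = jnp.meshgrid(jnp.arange(2), jnp.arange(3), indexing="ij")
    grids = list(grids)  # ensure a mutable list before calling .insert(...)
    grids.insert(0, jnp.zeros((2, 3), dtype="int32"))
    print(len(grids))    # 3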
@@ -1063,25 +1471,42 @@ def _can_use_flash_attention(query, key, value, bias, raise_error=False):
         # Only support at least Ampere
         if not check_compute_capability("8.0"):
             raise RuntimeError("Require at least Ampere arch to run")
-
+
+        # Inspect inputs of `check_layout`
         check_layout_params = list(
             inspect.signature(check_layout).parameters.keys()
         )
         for known_param in ("query", "key", "value", "bias", "layout"):
             check_layout_params.remove(known_param)
         # Defaults to `None` when not specified.
-
+        check_layout_kwargs = {key: None for key in check_layout_params}
         check_layout(
-            query, key, value, bias, layout=_normalize_layout("BTNH"), **kwargs
-        )
-        check_is_flash_attention(
             query,
             key,
-
-
-
-
+            value,
+            bias,
+            layout=_normalize_layout("BTNH"),
+            **check_layout_kwargs,
         )
+
+        # Inspect inputs of `check_is_flash_attention`
+        check_is_flash_attention_params = inspect.signature(
+            check_is_flash_attention
+        ).parameters
+        check_is_flash_attention_kwargs = {
+            "query": query,
+            "key": key,
+            "value": value,
+            "layout": _normalize_layout("BTNH"),
+            "cudnn_version": cudnn_version,
+            "has_bias": bias is not None,
+            "is_training": False,
+        }
+        # Remove unsupported arguments
+        for param in list(check_is_flash_attention_kwargs.keys()):
+            if param not in check_is_flash_attention_params:
+                check_is_flash_attention_kwargs.pop(param)
+        check_is_flash_attention(**check_is_flash_attention_kwargs)
         return True
     except:
         if raise_error:
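The rewritten _can_use_flash_attention hunk stays compatible across JAX releases by introspecting the private cuDNN helpers and only passing keyword arguments their current signatures accept. The general pattern, sketched with a hypothetical callee (not the JAX API):

    import inspect

    def call_with_supported_kwargs(fn, **kwargs):
        # Drop any keyword the target function does not declare, so signature
        # drift between library versions does not raise TypeError.
        params = inspect.signature(fn).parameters
        return fn(**{k: v for k, v in kwargs.items() if k in params})

    def check(query, key, value, has_bias=False):  # hypothetical helper
        return has_bias

    print(call_with_supported_kwargs(
        check, query=1, key=2, value=3, has_bias=True, is_training=False
    ))  # True; the unsupported is_training kwarg was silently dropped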
@@ -1332,25 +1757,32 @@ def dot_product_attention(
         if custom_mask is None and is_causal:
             custom_mask = jnp.tril(jnp.ones((q_len, q_len), dtype=jnp.bool_))
 
-
-
-
-
-
-                decoder_segment_ids=decoder_segment_ids,
-                custom_mask=custom_mask,
-                attn_logits_soft_cap=attn_logits_soft_cap,
-                head_shards=head_shards,
-                q_seq_shards=q_seq_shards,
-            )
-            # Transpose output back to Keras layout
-            return jnp.transpose(output, axes=(0, 2, 1, 3))
-        except Exception:
-            logging.exception(
-                "Failed to apply Splash kernel for flash attention. "
-                "Falling back to JAX native dot_product_attention."
-            )
+        # Splash attention kernel requires concrete mask values for hashing.
+        # If the mask is a tracer (e.g. inside a scan/loop), we must fall back.
+        if isinstance(mask, jax.core.Tracer) or isinstance(
+            custom_mask, jax.core.Tracer
+        ):
             flash_attention = False
+        else:
+            try:
+                output = wrap_flash_attention(
+                    query_tpu_layout,
+                    key_tpu_layout,
+                    value_tpu_layout,
+                    decoder_segment_ids=decoder_segment_ids,
+                    custom_mask=custom_mask,
+                    attn_logits_soft_cap=attn_logits_soft_cap,
+                    head_shards=head_shards,
+                    q_seq_shards=q_seq_shards,
+                )
+                # Transpose output back to Keras layout
+                return jnp.transpose(output, axes=(0, 2, 1, 3))
+            except Exception:
+                logging.exception(
+                    "Failed to apply Splash kernel for flash attention. "
+                    "Falling back to JAX native dot_product_attention."
+                )
+                flash_attention = False
 
     # JAX native dot_product_attention for GPU or fallback for TPU
     if hasattr(jax.nn, "dot_product_attention"):
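The new Splash-attention branch declines the TPU kernel when the mask is a jax.core.Tracer, i.e. an abstract value seen inside jit/scan rather than concrete data the kernel can hash. A minimal sketch of that distinction (illustrative only):

    import jax
    import jax.numpy as jnp

    def is_concrete(x):
        return not isinstance(x, jax.core.Tracer)

    mask = jnp.ones((4, 4), dtype=bool)
    print(is_concrete(mask))   # True: a concrete array outside any trace

    @jax.jit
    def f(m):
        # Inside jit, m is a Tracer, so the concrete-mask path is unavailable.
        return is_concrete(m)

    print(f(mask))             # False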
@@ -1396,6 +1828,11 @@ def dot_product_attention(
 
     def _reshape_to_grouped(t):
         if t is not None:
+            while t.ndim < 4:
+                if t.ndim == 3 and t.shape[1] == N:
+                    t = jnp.expand_dims(t, axis=2)
+                else:
+                    t = jnp.expand_dims(t, axis=1)
             tB, tN, tT, tS = t.shape
             if tN == 1:
                 t = jnp.broadcast_to(t[:, :, None, :, :], (tB, tN, G, tT, tS))
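This hunk promotes lower-rank bias/mask tensors to the 4-D (batch, heads, T, S) layout before the grouped-query reshape: a rank-3 tensor whose second axis equals the head count N gains a new axis at position 2, anything else gains a broadcastable head axis at position 1. A hedged sketch of just that promotion rule (N is an assumed head count):

    import jax.numpy as jnp

    N = 8  # assumed number of query heads

    def promote_to_4d(t, num_heads=N):
        # Mirrors the rank-promotion loop added above (illustrative only).
        while t.ndim < 4:
            if t.ndim == 3 and t.shape[1] == num_heads:
                t = jnp.expand_dims(t, axis=2)   # (B, N, S) -> (B, N, 1, S)
            else:
                t = jnp.expand_dims(t, axis=1)   # (B, T, S) -> (B, 1, T, S)
        return t

    print(promote_to_4d(jnp.ones((2, 16, 16))).shape)  # (2, 1, 16, 16)
    print(promote_to_4d(jnp.ones((2, 8, 16))).shape)   # (2, 8, 1, 16)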
@@ -1413,3 +1850,46 @@ def dot_product_attention(
     )
     encoded = vmapped_fn(query, key, value, bias, mask, is_causal, scale)
     return jnp.reshape(encoded, output_shape)
+
+
+def unfold(input, kernel_size, dilation=1, padding=0, stride=1):
+    """JAX implementation of Unfold.
+    Extract sliding local blocks from a **NCHW** batched image tensor.
+
+    Args:
+        input: 4-D tensor, shape (N, C, H, W) **required**.
+        kernel_size: int or (kH, kW)
+        dilation: int or (dH, dW), default 1
+        padding: int or (pH, pW), default 0
+        stride: int or (sH, sW), default 1
+
+    Returns:
+        3-D tensor, shape (N, C*kH*kW, L)
+    """
+
+    def _pair(x):
+        return (x, x) if isinstance(x, int) else x
+
+    k = _pair(kernel_size)
+    d = _pair(dilation)
+    p = _pair(padding)
+    s = _pair(stride)
+
+    N, C, H, W = input.shape
+
+    # ---- padding ----
+    if any(_ > 0 for _ in p):
+        input = jnp.pad(input, ((0, 0), (0, 0), (p[0], p[0]), (p[1], p[1])))
+
+    patches = lax.conv_general_dilated_patches(
+        input,
+        filter_shape=k,
+        window_strides=s,
+        padding="VALID",  # input has been padded above
+        rhs_dilation=d,
+        dimension_numbers=("NCHW", "OIHW", "NCHW"),  # only supports 'NCHW'
+    )  # shape: (N, C*kH*kW, oH, oW)
+
+    # ---- reshape -> (N, C*kH*kW, L) ----
+    _, CKK, oH, oW = patches.shape
+    return patches.reshape(N, CKK, oH * oW)