keras-nightly 3.12.0.dev2025092403__py3-none-any.whl → 3.14.0.dev2026010104__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras/__init__.py +1 -0
- keras/_tf_keras/keras/__init__.py +1 -0
- keras/_tf_keras/keras/callbacks/__init__.py +3 -0
- keras/_tf_keras/keras/distillation/__init__.py +16 -0
- keras/_tf_keras/keras/distribution/__init__.py +3 -0
- keras/_tf_keras/keras/layers/__init__.py +21 -0
- keras/_tf_keras/keras/ops/__init__.py +13 -0
- keras/_tf_keras/keras/ops/image/__init__.py +1 -0
- keras/_tf_keras/keras/ops/linalg/__init__.py +1 -0
- keras/_tf_keras/keras/ops/nn/__init__.py +3 -0
- keras/_tf_keras/keras/ops/numpy/__init__.py +9 -0
- keras/_tf_keras/keras/quantizers/__init__.py +12 -0
- keras/callbacks/__init__.py +3 -0
- keras/distillation/__init__.py +16 -0
- keras/distribution/__init__.py +3 -0
- keras/layers/__init__.py +21 -0
- keras/ops/__init__.py +13 -0
- keras/ops/image/__init__.py +1 -0
- keras/ops/linalg/__init__.py +1 -0
- keras/ops/nn/__init__.py +3 -0
- keras/ops/numpy/__init__.py +9 -0
- keras/quantizers/__init__.py +12 -0
- keras/src/applications/imagenet_utils.py +4 -1
- keras/src/backend/common/backend_utils.py +30 -6
- keras/src/backend/common/dtypes.py +1 -1
- keras/src/backend/common/name_scope.py +2 -1
- keras/src/backend/common/variables.py +33 -16
- keras/src/backend/jax/core.py +92 -3
- keras/src/backend/jax/distribution_lib.py +16 -2
- keras/src/backend/jax/linalg.py +4 -0
- keras/src/backend/jax/nn.py +485 -20
- keras/src/backend/jax/numpy.py +92 -23
- keras/src/backend/jax/optimizer.py +3 -2
- keras/src/backend/jax/trainer.py +14 -2
- keras/src/backend/numpy/linalg.py +4 -0
- keras/src/backend/numpy/nn.py +313 -2
- keras/src/backend/numpy/numpy.py +76 -7
- keras/src/backend/openvino/__init__.py +1 -0
- keras/src/backend/openvino/core.py +2 -23
- keras/src/backend/openvino/linalg.py +4 -0
- keras/src/backend/openvino/nn.py +271 -20
- keras/src/backend/openvino/numpy.py +1030 -185
- keras/src/backend/openvino/random.py +7 -14
- keras/src/backend/tensorflow/layer.py +43 -9
- keras/src/backend/tensorflow/linalg.py +24 -0
- keras/src/backend/tensorflow/nn.py +545 -1
- keras/src/backend/tensorflow/numpy.py +264 -54
- keras/src/backend/torch/core.py +3 -1
- keras/src/backend/torch/linalg.py +4 -0
- keras/src/backend/torch/nn.py +125 -0
- keras/src/backend/torch/numpy.py +84 -8
- keras/src/callbacks/__init__.py +1 -0
- keras/src/callbacks/callback_list.py +45 -11
- keras/src/callbacks/model_checkpoint.py +5 -0
- keras/src/callbacks/orbax_checkpoint.py +299 -0
- keras/src/callbacks/terminate_on_nan.py +54 -5
- keras/src/datasets/cifar10.py +5 -0
- keras/src/distillation/__init__.py +1 -0
- keras/src/distillation/distillation_loss.py +390 -0
- keras/src/distillation/distiller.py +598 -0
- keras/src/distribution/distribution_lib.py +14 -0
- keras/src/export/__init__.py +2 -0
- keras/src/export/export_utils.py +39 -2
- keras/src/export/litert.py +248 -0
- keras/src/export/openvino.py +1 -1
- keras/src/export/tf2onnx_lib.py +3 -0
- keras/src/layers/__init__.py +13 -0
- keras/src/layers/activations/softmax.py +9 -4
- keras/src/layers/attention/attention.py +1 -1
- keras/src/layers/attention/multi_head_attention.py +4 -1
- keras/src/layers/core/dense.py +191 -172
- keras/src/layers/core/einsum_dense.py +235 -186
- keras/src/layers/core/embedding.py +83 -93
- keras/src/layers/core/input_layer.py +1 -0
- keras/src/layers/core/reversible_embedding.py +390 -0
- keras/src/layers/input_spec.py +17 -17
- keras/src/layers/layer.py +40 -15
- keras/src/layers/merging/dot.py +4 -1
- keras/src/layers/pooling/adaptive_average_pooling1d.py +65 -0
- keras/src/layers/pooling/adaptive_average_pooling2d.py +62 -0
- keras/src/layers/pooling/adaptive_average_pooling3d.py +63 -0
- keras/src/layers/pooling/adaptive_max_pooling1d.py +65 -0
- keras/src/layers/pooling/adaptive_max_pooling2d.py +62 -0
- keras/src/layers/pooling/adaptive_max_pooling3d.py +63 -0
- keras/src/layers/pooling/base_adaptive_pooling.py +63 -0
- keras/src/layers/preprocessing/discretization.py +6 -5
- keras/src/layers/preprocessing/index_lookup.py +19 -1
- keras/src/layers/preprocessing/normalization.py +16 -1
- keras/src/layers/regularization/dropout.py +43 -1
- keras/src/layers/rnn/gru.py +1 -1
- keras/src/layers/rnn/lstm.py +2 -2
- keras/src/layers/rnn/rnn.py +19 -0
- keras/src/layers/rnn/simple_rnn.py +1 -1
- keras/src/losses/loss.py +1 -1
- keras/src/metrics/confusion_metrics.py +7 -6
- keras/src/models/cloning.py +4 -0
- keras/src/models/functional.py +11 -3
- keras/src/models/model.py +156 -27
- keras/src/ops/image.py +184 -3
- keras/src/ops/linalg.py +93 -0
- keras/src/ops/nn.py +268 -2
- keras/src/ops/numpy.py +541 -43
- keras/src/optimizers/adafactor.py +29 -10
- keras/src/optimizers/base_optimizer.py +22 -3
- keras/src/optimizers/loss_scale_optimizer.py +51 -18
- keras/src/optimizers/muon.py +65 -31
- keras/src/optimizers/schedules/learning_rate_schedule.py +4 -3
- keras/src/quantizers/__init__.py +12 -1
- keras/src/quantizers/gptq.py +8 -6
- keras/src/quantizers/gptq_config.py +36 -1
- keras/src/quantizers/gptq_core.py +150 -78
- keras/src/quantizers/quantization_config.py +232 -0
- keras/src/quantizers/quantizers.py +114 -38
- keras/src/quantizers/utils.py +23 -0
- keras/src/random/seed_generator.py +4 -2
- keras/src/saving/file_editor.py +81 -6
- keras/src/saving/saving_lib.py +1 -1
- keras/src/testing/__init__.py +1 -0
- keras/src/testing/test_case.py +45 -5
- keras/src/trainers/compile_utils.py +14 -5
- keras/src/utils/backend_utils.py +31 -4
- keras/src/utils/dataset_utils.py +234 -35
- keras/src/utils/file_utils.py +49 -11
- keras/src/utils/image_utils.py +14 -2
- keras/src/utils/jax_layer.py +187 -36
- keras/src/utils/module_utils.py +18 -0
- keras/src/utils/progbar.py +10 -12
- keras/src/utils/rng_utils.py +9 -1
- keras/src/version.py +1 -1
- {keras_nightly-3.12.0.dev2025092403.dist-info → keras_nightly-3.14.0.dev2026010104.dist-info}/METADATA +16 -6
- {keras_nightly-3.12.0.dev2025092403.dist-info → keras_nightly-3.14.0.dev2026010104.dist-info}/RECORD +133 -116
- {keras_nightly-3.12.0.dev2025092403.dist-info → keras_nightly-3.14.0.dev2026010104.dist-info}/WHEEL +0 -0
- {keras_nightly-3.12.0.dev2025092403.dist-info → keras_nightly-3.14.0.dev2026010104.dist-info}/top_level.txt +0 -0
keras/src/backend/jax/nn.py
CHANGED
@@ -16,6 +16,9 @@ from jax.experimental.pallas.ops.tpu.splash_attention import (
 )

 from keras.src import backend
+from keras.src.backend.common.backend_utils import (
+    compute_adaptive_pooling_window_sizes,
+)
 from keras.src.backend.common.backend_utils import (
     compute_conv_transpose_padding_args_for_jax,
 )
@@ -289,6 +292,403 @@ def average_pool(
     return pooled / window_counts


+def _compute_adaptive_pooling_gather_indices(
+    input_dim, output_size, big_window
+):
+    """Compute gather indices for Two-Pool Gather method."""
+    window_starts = jnp.floor(
+        (jnp.arange(output_size) * input_dim) / output_size
+    ).astype(jnp.int32)
+
+    window_ends = jnp.ceil(
+        (jnp.arange(1, output_size + 1) * input_dim) / output_size
+    ).astype(jnp.int32)
+
+    window_sizes = window_ends - window_starts
+    is_big = window_sizes == big_window
+
+    small_window = big_window - 1
+    small_len = input_dim - small_window + 1
+
+    small_indices = window_starts
+    big_indices = window_starts + small_len
+
+    gather = jnp.where(is_big, big_indices, small_indices)
+    return gather.astype(jnp.int32)
+
+
+def _adaptive_average_pool1d(inputs, output_size, data_format="channels_first"):
+    if isinstance(output_size, int):
+        output_size = (output_size,)
+
+    if data_format == "channels_first":
+        inputs = jnp.transpose(inputs, (0, 2, 1))  # NCL → NLC
+
+    n, l, c = inputs.shape
+    out_l = output_size[0]
+
+    small, big = compute_adaptive_pooling_window_sizes(l, out_l)
+    gather = _compute_adaptive_pooling_gather_indices(l, out_l, big)
+
+    small_pool = (
+        lax.reduce_window(
+            inputs, 0.0, lax.add, (1, small, 1), (1, 1, 1), "valid"
+        )
+        / small
+    )
+
+    big_pool = (
+        lax.reduce_window(inputs, 0.0, lax.add, (1, big, 1), (1, 1, 1), "valid")
+        / big
+    )
+
+    combined = jnp.concatenate([small_pool, big_pool], axis=1)
+    out = jnp.take(combined, gather, axis=1)
+
+    if data_format == "channels_first":
+        out = jnp.transpose(out, (0, 2, 1))
+
+    return out
+
+
+def _adaptive_max_pool1d(inputs, output_size, data_format="channels_first"):
+    if isinstance(output_size, int):
+        output_size = (output_size,)
+
+    if data_format == "channels_first":
+        inputs = jnp.transpose(inputs, (0, 2, 1))
+
+    n, l, c = inputs.shape
+    out_l = output_size[0]
+
+    small, big = compute_adaptive_pooling_window_sizes(l, out_l)
+    gather = _compute_adaptive_pooling_gather_indices(l, out_l, big)
+
+    small_pool = lax.reduce_window(
+        inputs, -jnp.inf, lax.max, (1, small, 1), (1, 1, 1), "valid"
+    )
+
+    big_pool = lax.reduce_window(
+        inputs, -jnp.inf, lax.max, (1, big, 1), (1, 1, 1), "valid"
+    )
+
+    combined = jnp.concatenate([small_pool, big_pool], axis=1)
+    out = jnp.take(combined, gather, axis=1)
+
+    if data_format == "channels_first":
+        out = jnp.transpose(out, (0, 2, 1))
+
+    return out
+
+
+def _adaptive_average_pool2d(inputs, output_size, data_format="channels_first"):
+    if isinstance(output_size, int):
+        output_size = (output_size, output_size)
+
+    if data_format == "channels_first":
+        inputs = jnp.transpose(inputs, (0, 2, 3, 1))
+
+    n, h, w, c = inputs.shape
+    out_h, out_w = output_size
+
+    small_h, big_h = compute_adaptive_pooling_window_sizes(h, out_h)
+    gather_h = _compute_adaptive_pooling_gather_indices(h, out_h, big_h)
+
+    small_w, big_w = compute_adaptive_pooling_window_sizes(w, out_w)
+    gather_w = _compute_adaptive_pooling_gather_indices(w, out_w, big_w)
+
+    small_h_pool = (
+        lax.reduce_window(
+            inputs, 0.0, lax.add, (1, small_h, 1, 1), (1, 1, 1, 1), "valid"
+        )
+        / small_h
+    )
+
+    big_h_pool = (
+        lax.reduce_window(
+            inputs, 0.0, lax.add, (1, big_h, 1, 1), (1, 1, 1, 1), "valid"
+        )
+        / big_h
+    )
+
+    combined_h = jnp.concatenate([small_h_pool, big_h_pool], axis=1)
+    pooled_h = jnp.take(combined_h, gather_h, axis=1)
+
+    small_w_pool = (
+        lax.reduce_window(
+            pooled_h, 0.0, lax.add, (1, 1, small_w, 1), (1, 1, 1, 1), "valid"
+        )
+        / small_w
+    )
+
+    big_w_pool = (
+        lax.reduce_window(
+            pooled_h, 0.0, lax.add, (1, 1, big_w, 1), (1, 1, 1, 1), "valid"
+        )
+        / big_w
+    )
+
+    combined_w = jnp.concatenate([small_w_pool, big_w_pool], axis=2)
+    out = jnp.take(combined_w, gather_w, axis=2)
+
+    if data_format == "channels_first":
+        out = jnp.transpose(out, (0, 3, 1, 2))
+
+    return out
+
+
+def _adaptive_max_pool2d(inputs, output_size, data_format="channels_first"):
+    if isinstance(output_size, int):
+        output_size = (output_size, output_size)
+
+    if data_format == "channels_first":
+        inputs = jnp.transpose(inputs, (0, 2, 3, 1))
+
+    n, h, w, c = inputs.shape
+    out_h, out_w = output_size
+
+    small_h, big_h = compute_adaptive_pooling_window_sizes(h, out_h)
+    gather_h = _compute_adaptive_pooling_gather_indices(h, out_h, big_h)
+
+    small_w, big_w = compute_adaptive_pooling_window_sizes(w, out_w)
+    gather_w = _compute_adaptive_pooling_gather_indices(w, out_w, big_w)
+
+    small_h_pool = lax.reduce_window(
+        inputs, -jnp.inf, lax.max, (1, small_h, 1, 1), (1, 1, 1, 1), "valid"
+    )
+
+    big_h_pool = lax.reduce_window(
+        inputs, -jnp.inf, lax.max, (1, big_h, 1, 1), (1, 1, 1, 1), "valid"
+    )
+
+    combined_h = jnp.concatenate([small_h_pool, big_h_pool], axis=1)
+    pooled_h = jnp.take(combined_h, gather_h, axis=1)
+
+    small_w_pool = lax.reduce_window(
+        pooled_h, -jnp.inf, lax.max, (1, 1, small_w, 1), (1, 1, 1, 1), "valid"
+    )
+
+    big_w_pool = lax.reduce_window(
+        pooled_h, -jnp.inf, lax.max, (1, 1, big_w, 1), (1, 1, 1, 1), "valid"
+    )
+
+    combined_w = jnp.concatenate([small_w_pool, big_w_pool], axis=2)
+    out = jnp.take(combined_w, gather_w, axis=2)
+
+    if data_format == "channels_first":
+        out = jnp.transpose(out, (0, 3, 1, 2))
+
+    return out
+
+
+def _adaptive_average_pool3d(inputs, output_size, data_format="channels_first"):
+    if isinstance(output_size, int):
+        output_size = (output_size, output_size, output_size)
+
+    if data_format == "channels_first":
+        inputs = jnp.transpose(inputs, (0, 2, 3, 4, 1))
+
+    n, d, h, w, c = inputs.shape
+    out_d, out_h, out_w = output_size
+
+    small_d, big_d = compute_adaptive_pooling_window_sizes(d, out_d)
+    gather_d = _compute_adaptive_pooling_gather_indices(d, out_d, big_d)
+
+    small_h, big_h = compute_adaptive_pooling_window_sizes(h, out_h)
+    gather_h = _compute_adaptive_pooling_gather_indices(h, out_h, big_h)
+
+    small_w, big_w = compute_adaptive_pooling_window_sizes(w, out_w)
+    gather_w = _compute_adaptive_pooling_gather_indices(w, out_w, big_w)
+
+    small_d_pool = (
+        lax.reduce_window(
+            inputs,
+            0.0,
+            lax.add,
+            (1, small_d, 1, 1, 1),
+            (1, 1, 1, 1, 1),
+            "valid",
+        )
+        / small_d
+    )
+
+    big_d_pool = (
+        lax.reduce_window(
+            inputs, 0.0, lax.add, (1, big_d, 1, 1, 1), (1, 1, 1, 1, 1), "valid"
+        )
+        / big_d
+    )
+
+    combined_d = jnp.concatenate([small_d_pool, big_d_pool], axis=1)
+    pooled_d = jnp.take(combined_d, gather_d, axis=1)
+
+    small_h_pool = (
+        lax.reduce_window(
+            pooled_d,
+            0.0,
+            lax.add,
+            (1, 1, small_h, 1, 1),
+            (1, 1, 1, 1, 1),
+            "valid",
+        )
+        / small_h
+    )
+
+    big_h_pool = (
+        lax.reduce_window(
+            pooled_d,
+            0.0,
+            lax.add,
+            (1, 1, big_h, 1, 1),
+            (1, 1, 1, 1, 1),
+            "valid",
+        )
+        / big_h
+    )
+
+    combined_h = jnp.concatenate([small_h_pool, big_h_pool], axis=2)
+    pooled_h = jnp.take(combined_h, gather_h, axis=2)
+
+    small_w_pool = (
+        lax.reduce_window(
+            pooled_h,
+            0.0,
+            lax.add,
+            (1, 1, 1, small_w, 1),
+            (1, 1, 1, 1, 1),
+            "valid",
+        )
+        / small_w
+    )
+
+    big_w_pool = (
+        lax.reduce_window(
+            pooled_h,
+            0.0,
+            lax.add,
+            (1, 1, 1, big_w, 1),
+            (1, 1, 1, 1, 1),
+            "valid",
+        )
+        / big_w
+    )
+
+    combined_w = jnp.concatenate([small_w_pool, big_w_pool], axis=3)
+    out = jnp.take(combined_w, gather_w, axis=3)
+
+    if data_format == "channels_first":
+        out = jnp.transpose(out, (0, 4, 1, 2, 3))
+
+    return out
+
+
+def _adaptive_max_pool3d(inputs, output_size, data_format="channels_first"):
+    if isinstance(output_size, int):
+        output_size = (output_size, output_size, output_size)
+
+    if data_format == "channels_first":
+        inputs = jnp.transpose(inputs, (0, 2, 3, 4, 1))
+
+    n, d, h, w, c = inputs.shape
+    out_d, out_h, out_w = output_size
+
+    small_d, big_d = compute_adaptive_pooling_window_sizes(d, out_d)
+    gather_d = _compute_adaptive_pooling_gather_indices(d, out_d, big_d)
+
+    small_h, big_h = compute_adaptive_pooling_window_sizes(h, out_h)
+    gather_h = _compute_adaptive_pooling_gather_indices(h, out_h, big_h)
+
+    small_w, big_w = compute_adaptive_pooling_window_sizes(w, out_w)
+    gather_w = _compute_adaptive_pooling_gather_indices(w, out_w, big_w)
+
+    small_d_pool = lax.reduce_window(
+        inputs,
+        -jnp.inf,
+        lax.max,
+        (1, small_d, 1, 1, 1),
+        (1, 1, 1, 1, 1),
+        "valid",
+    )
+
+    big_d_pool = lax.reduce_window(
+        inputs, -jnp.inf, lax.max, (1, big_d, 1, 1, 1), (1, 1, 1, 1, 1), "valid"
+    )
+
+    combined_d = jnp.concatenate([small_d_pool, big_d_pool], axis=1)
+    pooled_d = jnp.take(combined_d, gather_d, axis=1)
+
+    small_h_pool = lax.reduce_window(
+        pooled_d,
+        -jnp.inf,
+        lax.max,
+        (1, 1, small_h, 1, 1),
+        (1, 1, 1, 1, 1),
+        "valid",
+    )
+
+    big_h_pool = lax.reduce_window(
+        pooled_d,
+        -jnp.inf,
+        lax.max,
+        (1, 1, big_h, 1, 1),
+        (1, 1, 1, 1, 1),
+        "valid",
+    )
+
+    combined_h = jnp.concatenate([small_h_pool, big_h_pool], axis=2)
+    pooled_h = jnp.take(combined_h, gather_h, axis=2)
+
+    small_w_pool = lax.reduce_window(
+        pooled_h,
+        -jnp.inf,
+        lax.max,
+        (1, 1, 1, small_w, 1),
+        (1, 1, 1, 1, 1),
+        "valid",
+    )
+
+    big_w_pool = lax.reduce_window(
+        pooled_h,
+        -jnp.inf,
+        lax.max,
+        (1, 1, 1, big_w, 1),
+        (1, 1, 1, 1, 1),
+        "valid",
+    )
+
+    combined_w = jnp.concatenate([small_w_pool, big_w_pool], axis=3)
+    out = jnp.take(combined_w, gather_w, axis=3)
+
+    if data_format == "channels_first":
+        out = jnp.transpose(out, (0, 4, 1, 2, 3))
+
+    return out
+
+
+def adaptive_average_pool(inputs, output_size, data_format=None):
+    data_format = backend.standardize_data_format(data_format)
+    dims = inputs.ndim - 2
+    if dims == 1:
+        return _adaptive_average_pool1d(inputs, output_size, data_format)
+    if dims == 2:
+        return _adaptive_average_pool2d(inputs, output_size, data_format)
+    if dims == 3:
+        return _adaptive_average_pool3d(inputs, output_size, data_format)
+    raise ValueError("adaptive_average_pool supports only 1D/2D/3D inputs")
+
+
+def adaptive_max_pool(inputs, output_size, data_format=None):
+    data_format = backend.standardize_data_format(data_format)
+    dims = inputs.ndim - 2
+    if dims == 1:
+        return _adaptive_max_pool1d(inputs, output_size, data_format)
+    if dims == 2:
+        return _adaptive_max_pool2d(inputs, output_size, data_format)
+    if dims == 3:
+        return _adaptive_max_pool3d(inputs, output_size, data_format)
+    raise ValueError("adaptive_max_pool supports only 1D/2D/3D inputs")
+
+
 def _convert_to_lax_conv_dimension_numbers(
     num_spatial_dims,
     data_format="channels_last",
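Note on the block above: for adaptive pooling, every output position uses one of only two window sizes (the floor and the ceiling of `input_dim / output_size`), so the new helpers run one fixed-window `lax.reduce_window` per size and then gather the right candidate for each position (the "Two-Pool Gather" method named in the docstring). A minimal sketch of the index math, assuming `compute_adaptive_pooling_window_sizes` returns exactly those two sizes; the 8-to-3 pooling below is only illustrative:

```python
import jax.numpy as jnp

input_dim, output_size = 8, 3  # illustrative sizes, not from the diff
starts = jnp.floor(
    jnp.arange(output_size) * input_dim / output_size
).astype(jnp.int32)
ends = jnp.ceil(
    jnp.arange(1, output_size + 1) * input_dim / output_size
).astype(jnp.int32)
sizes = ends - starts                            # [3 4 3]: only two sizes occur
small, big = int(sizes.min()), int(sizes.max())  # 3 and 4
small_len = input_dim - small + 1                # 6 valid positions for the small pool
gather = jnp.where(sizes == big, starts + small_len, starts)
print(starts, ends, gather)                      # [0 2 5] [3 6 8] [0 8 5]
```

Index 8 points past the six small-pool entries into the big pool at offset 2, reproducing the adaptive windows [0:3), [2:6), [5:8) without any per-position dynamic slicing.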
@@ -355,7 +755,7 @@ def conv(
     feature_group_count = channels // kernel_in_channels
     kernel = convert_to_tensor(kernel)
     inputs = convert_to_tensor(inputs, dtype=kernel.dtype)
-    return jax.lax.conv_general_dilated(
+    result = jax.lax.conv_general_dilated(
         inputs,
         kernel,
         strides,
@@ -364,6 +764,14 @@ def conv(
         dimension_numbers=dimension_numbers,
         feature_group_count=feature_group_count,
     )
+    if result.size == 0:
+        raise ValueError(
+            "The convolution operation resulted in an empty output. "
+            "This can happen if the input is too small for the given "
+            "kernel size, strides, dilation rate, and padding mode. "
+            "Please check the input shape and convolution parameters."
+        )
+    return result


 def depthwise_conv(
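Note on the new guard in `conv`: with "valid"-style padding the spatial output length is `floor((in - dilation * (kernel - 1) - 1) / stride) + 1`, which reaches zero as soon as the dilated kernel no longer fits in the (padded) input, and the added check turns that empty result into an explicit error. A small arithmetic sketch with illustrative numbers:

```python
# Illustrative numbers only: a dilated 3-tap kernel has effective extent 5,
# which does not fit in an input of length 4, so the output is empty.
in_len, kernel, dilation, stride = 4, 3, 2, 1
effective = dilation * (kernel - 1) + 1        # 5
out_len = max(0, (in_len - effective) // stride + 1)
print(effective, out_len)                      # 5 0 -> result.size == 0
```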
@@ -396,6 +804,8 @@ def depthwise_conv(
     feature_group_count = (
         inputs.shape[-1] if data_format == "channels_last" else inputs.shape[1]
     )
+    kernel = convert_to_tensor(kernel)
+    inputs = convert_to_tensor(inputs)
     kernel = jnp.reshape(
         kernel,
         kernel.shape[:-2] + (1, feature_group_count * kernel.shape[-1]),
@@ -499,7 +909,7 @@ def one_hot(x, num_classes, axis=-1, dtype=None, sparse=False):
         values = jnp.greater_equal(jnp.ravel(x), 0).astype(dtype)
         values_count = values.shape[0]
         indices = [jnp.arange(dim) for dim in x.shape]
-        indices = jnp.meshgrid(*indices, indexing="ij")
+        indices = list(jnp.meshgrid(*indices, indexing="ij"))
         indices.insert(axis, jnp.maximum(x, 0))  # Deal with negative indices
         indices = [a.reshape(values_count, 1).astype("int32") for a in indices]
         indices = jnp.concatenate(indices, axis=1)
@@ -1330,25 +1740,32 @@ def dot_product_attention(
         if custom_mask is None and is_causal:
             custom_mask = jnp.tril(jnp.ones((q_len, q_len), dtype=jnp.bool_))

-        try:
-            output = wrap_flash_attention(
-                query_tpu_layout,
-                key_tpu_layout,
-                value_tpu_layout,
-                decoder_segment_ids=decoder_segment_ids,
-                custom_mask=custom_mask,
-                attn_logits_soft_cap=attn_logits_soft_cap,
-                head_shards=head_shards,
-                q_seq_shards=q_seq_shards,
-            )
-            # Transpose output back to Keras layout
-            return jnp.transpose(output, axes=(0, 2, 1, 3))
-        except Exception:
-            logging.exception(
-                "Failed to apply Splash kernel for flash attention. "
-                "Falling back to JAX native dot_product_attention."
-            )
+        # Splash attention kernel requires concrete mask values for hashing.
+        # If the mask is a tracer (e.g. inside a scan/loop), we must fall back.
+        if isinstance(mask, jax.core.Tracer) or isinstance(
+            custom_mask, jax.core.Tracer
+        ):
             flash_attention = False
+        else:
+            try:
+                output = wrap_flash_attention(
+                    query_tpu_layout,
+                    key_tpu_layout,
+                    value_tpu_layout,
+                    decoder_segment_ids=decoder_segment_ids,
+                    custom_mask=custom_mask,
+                    attn_logits_soft_cap=attn_logits_soft_cap,
+                    head_shards=head_shards,
+                    q_seq_shards=q_seq_shards,
+                )
+                # Transpose output back to Keras layout
+                return jnp.transpose(output, axes=(0, 2, 1, 3))
+            except Exception:
+                logging.exception(
+                    "Failed to apply Splash kernel for flash attention. "
+                    "Falling back to JAX native dot_product_attention."
+                )
+                flash_attention = False

     # JAX native dot_product_attention for GPU or fallback for TPU
     if hasattr(jax.nn, "dot_product_attention"):
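Note on the fallback above: per the added comments, the Splash attention kernel needs concrete mask values for hashing, and such values do not exist while JAX is tracing, so traced masks now route to the native path instead of failing inside the kernel. A minimal sketch (not Keras code) of how an argument shows up as a `Tracer` under `jax.jit`:

```python
import jax
import jax.numpy as jnp

def is_traced(mask):
    # Concrete arrays fail this check; values being traced by jit/scan pass it.
    return isinstance(mask, jax.core.Tracer)

mask = jnp.ones((2, 2), dtype=bool)
print(is_traced(mask))            # False: concrete array
print(jax.jit(is_traced)(mask))   # True: abstract tracer during tracing
```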
@@ -1394,6 +1811,11 @@ def dot_product_attention(

     def _reshape_to_grouped(t):
         if t is not None:
+            while t.ndim < 4:
+                if t.ndim == 3 and t.shape[1] == N:
+                    t = jnp.expand_dims(t, axis=2)
+                else:
+                    t = jnp.expand_dims(t, axis=1)
             tB, tN, tT, tS = t.shape
             if tN == 1:
                 t = jnp.broadcast_to(t[:, :, None, :, :], (tB, tN, G, tT, tS))
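Note on the `_reshape_to_grouped` change: the added loop pads a lower-rank bias or mask up to the 4-D `(batch, heads, q_len, kv_len)` layout before the grouped-query reshape, keeping an existing head axis and inserting broadcast axes otherwise. A short illustration with assumed shapes (8 heads, batch 2, sequence length 16; none of these numbers come from the diff):

```python
import jax.numpy as jnp

N = 8                            # assumed head count
per_head = jnp.ones((2, N, 16))  # (batch, heads, kv_len): head axis is kept
shared = jnp.ones((2, 16, 16))   # (batch, q_len, kv_len): broadcast over heads

per_head = jnp.expand_dims(per_head, axis=2)  # -> (2, 8, 1, 16)
shared = jnp.expand_dims(shared, axis=1)      # -> (2, 1, 16, 16)
print(per_head.shape, shared.shape)
```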
@@ -1411,3 +1833,46 @@ def dot_product_attention(
     )
     encoded = vmapped_fn(query, key, value, bias, mask, is_causal, scale)
     return jnp.reshape(encoded, output_shape)
+
+
+def unfold(input, kernel_size, dilation=1, padding=0, stride=1):
+    """JAX implementation of Unfold.
+    Extract sliding local blocks from a **NCHW** batched image tensor.
+
+    Args:
+        input: 4-D tensor, shape (N, C, H, W) **required**.
+        kernel_size: int or (kH, kW)
+        dilation: int or (dH, dW), default 1
+        padding: int or (pH, pW), default 0
+        stride: int or (sH, sW), default 1
+
+    Returns:
+        3-D tensor, shape (N, C*kH*kW, L)
+    """
+
+    def _pair(x):
+        return (x, x) if isinstance(x, int) else x
+
+    k = _pair(kernel_size)
+    d = _pair(dilation)
+    p = _pair(padding)
+    s = _pair(stride)
+
+    N, C, H, W = input.shape
+
+    # ---- padding ----
+    if any(_ > 0 for _ in p):
+        input = jnp.pad(input, ((0, 0), (0, 0), (p[0], p[0]), (p[1], p[1])))
+
+    patches = lax.conv_general_dilated_patches(
+        input,
+        filter_shape=k,
+        window_strides=s,
+        padding="VALID",  # has padded
+        rhs_dilation=d,
+        dimension_numbers=("NCHW", "OIHW", "NCHW"),  # only support 'NCHW'
+    )  # shape: (N, C*kH*kW, oH, oW)
+
+    # ---- reshape -> (N, C*kH*kW, L) ----
+    _, CKK, oH, oW = patches.shape
+    return patches.reshape(N, CKK, oH * oW)