tpu-inference 0.11.1.dev202512030818__py3-none-any.whl → 0.13.0rc2.post7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tpu-inference might be problematic.
- tests/__init__.py +13 -0
- tests/core/__init__.py +13 -0
- tests/core/test_disagg_utils.py +14 -0
- tests/core/test_dp_scheduler.py +650 -768
- tests/core/test_init.py +14 -0
- tests/distributed/__init__.py +13 -0
- tests/distributed/test_distributed_utils.py +120 -0
- tests/distributed/test_tpu_connector.py +478 -0
- tests/e2e/__init__.py +13 -0
- tests/e2e/test_async_scheduler.py +211 -0
- tests/e2e/test_data_parallel.py +289 -0
- tests/e2e/test_hybrid_kvcache.py +219 -0
- tests/e2e/test_local_disagg.py +257 -0
- tests/e2e/test_model_loader.py +268 -0
- tests/e2e/test_multi_modal_inference.py +111 -0
- tests/e2e/test_pipeline_parallel.py +265 -0
- tests/e2e/test_runai_model_streamer_loader.py +104 -0
- tests/e2e/test_sampling_params.py +269 -0
- tests/e2e/test_speculative_decoding.py +311 -0
- tests/e2e/test_structured_decoding.py +46 -0
- tests/executors/__init__.py +13 -0
- tests/executors/test_ray_distributed_executor.py +199 -0
- tests/experimental/__init__.py +13 -0
- tests/experimental/test_llama3_jax_stashed.py +208 -0
- tests/kernels/__init__.py +13 -0
- tests/kernels/collectives/__init__.py +13 -0
- tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
- tests/kernels/fused_moe_v1_test.py +14 -0
- tests/kernels/gmm_test.py +205 -0
- tests/kernels/mla_v1_test.py +143 -41
- tests/kernels/quantized_matmul_kernel_test.py +2 -34
- tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +17 -1
- tests/kernels/ragged_paged_attention_kernel_v3_test.py +17 -1
- tests/layers/__init__.py +13 -0
- tests/layers/common/__init__.py +13 -0
- tests/layers/common/test_attention_interface.py +156 -0
- tests/layers/common/test_quantization.py +149 -0
- tests/layers/jax/__init__.py +13 -0
- tests/layers/jax/attention/__init__.py +13 -0
- tests/layers/jax/attention/test_common_attention.py +103 -0
- tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
- tests/layers/jax/attention/test_llama4_attention.py +135 -0
- tests/layers/jax/moe/__init__.py +13 -0
- tests/layers/jax/moe/test_deepseek_moe.py +235 -0
- tests/layers/jax/sample/__init__.py +13 -0
- tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
- tests/layers/jax/sample/test_sampling.py +115 -0
- tests/layers/jax/sample/test_sampling_metadata.py +254 -0
- tests/layers/jax/test_layers.py +155 -0
- tests/{test_quantization.py → layers/jax/test_qwix.py} +183 -50
- tests/layers/jax/test_rope.py +93 -0
- tests/layers/jax/test_sharding.py +159 -0
- tests/layers/jax/test_transformer_block.py +152 -0
- tests/layers/vllm/__init__.py +13 -0
- tests/layers/vllm/test_attention.py +363 -0
- tests/layers/vllm/test_awq.py +405 -0
- tests/layers/vllm/test_compressed_tensors_moe.py +202 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +418 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +441 -0
- tests/layers/vllm/test_fp8.py +17 -0
- tests/layers/vllm/test_mxfp4.py +312 -0
- tests/layers/vllm/test_unquantized.py +651 -0
- tests/layers/vllm/utils.py +87 -0
- tests/lora/__init__.py +13 -0
- tests/lora/conftest.py +14 -0
- tests/lora/test_bgmv.py +14 -0
- tests/lora/test_layers.py +21 -3
- tests/lora/test_lora.py +15 -1
- tests/lora/test_lora_perf.py +67 -0
- tests/models/__init__.py +13 -0
- tests/models/common/__init__.py +13 -0
- tests/models/common/test_model_loader.py +455 -0
- tests/models/jax/__init__.py +13 -0
- tests/models/jax/test_deepseek_v3.py +401 -0
- tests/models/jax/test_llama3.py +184 -0
- tests/models/jax/test_llama4.py +298 -0
- tests/models/jax/test_llama_eagle3.py +197 -0
- tests/models/jax/test_llama_guard_4.py +242 -0
- tests/models/jax/test_qwen2.py +172 -0
- tests/models/jax/test_qwen2_5_vl.py +605 -0
- tests/models/jax/test_qwen3.py +169 -0
- tests/models/jax/test_weight_loading.py +180 -0
- tests/models/jax/utils/__init__.py +13 -0
- tests/models/jax/utils/test_multi_modal_utils.py +212 -0
- tests/platforms/__init__.py +13 -0
- tests/platforms/test_tpu_platform.py +54 -0
- tests/runner/__init__.py +13 -0
- tests/runner/test_block_table.py +395 -0
- tests/runner/test_input_batch.py +226 -0
- tests/runner/test_kv_cache.py +220 -0
- tests/runner/test_kv_cache_manager.py +498 -0
- tests/runner/test_multimodal_manager.py +429 -0
- tests/runner/test_persistent_batch_manager.py +84 -0
- tests/runner/test_speculative_decoding_manager.py +368 -0
- tests/runner/test_structured_decoding_manager.py +220 -0
- tests/runner/test_tpu_runner.py +261 -0
- tests/runner/test_tpu_runner_dp.py +1099 -0
- tests/runner/test_tpu_runner_mesh.py +200 -0
- tests/runner/test_utils.py +411 -0
- tests/spec_decode/__init__.py +13 -0
- tests/spec_decode/test_eagle3.py +311 -0
- tests/test_base.py +14 -0
- tests/test_envs.py +78 -1
- tests/test_tpu_info.py +14 -0
- tests/test_utils.py +1 -43
- tests/worker/__init__.py +13 -0
- tests/worker/tpu_worker_test.py +414 -0
- tpu_inference/__init__.py +14 -0
- tpu_inference/core/__init__.py +13 -0
- tpu_inference/core/sched/__init__.py +13 -0
- tpu_inference/core/sched/dp_scheduler.py +372 -56
- tpu_inference/distributed/__init__.py +13 -0
- tpu_inference/distributed/jax_parallel_state.py +14 -0
- tpu_inference/distributed/tpu_connector.py +14 -9
- tpu_inference/distributed/utils.py +56 -4
- tpu_inference/envs.py +38 -7
- tpu_inference/executors/__init__.py +13 -0
- tpu_inference/executors/ray_distributed_executor.py +17 -0
- tpu_inference/experimental/__init__.py +13 -0
- tpu_inference/experimental/llama3_jax_stashed.py +14 -0
- tpu_inference/kernels/__init__.py +13 -0
- tpu_inference/kernels/collectives/__init__.py +13 -0
- tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
- tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
- tpu_inference/kernels/flash_attention/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/kernel.py +370 -324
- tpu_inference/kernels/megablox/__init__.py +13 -0
- tpu_inference/kernels/megablox/common.py +54 -0
- tpu_inference/kernels/megablox/gmm.py +646 -0
- tpu_inference/kernels/mla/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/kernel.py +117 -145
- tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
- tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
- tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +194 -101
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +95 -78
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +376 -195
- tpu_inference/kernels/ragged_paged_attention/v3/util.py +15 -1
- tpu_inference/layers/__init__.py +13 -0
- tpu_inference/layers/common/__init__.py +13 -0
- tpu_inference/layers/common/attention_interface.py +26 -19
- tpu_inference/layers/common/attention_metadata.py +14 -0
- tpu_inference/layers/common/quant_methods.py +15 -0
- tpu_inference/layers/common/quantization.py +270 -0
- tpu_inference/layers/common/sharding.py +28 -5
- tpu_inference/layers/jax/__init__.py +13 -0
- tpu_inference/layers/jax/attention/__init__.py +13 -0
- tpu_inference/layers/jax/attention/attention.py +19 -6
- tpu_inference/layers/jax/attention/deepseek_v3_attention.py +270 -77
- tpu_inference/layers/jax/attention/gpt_oss_attention.py +24 -11
- tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
- tpu_inference/layers/jax/base.py +14 -0
- tpu_inference/layers/jax/constants.py +13 -0
- tpu_inference/layers/jax/layers.py +14 -0
- tpu_inference/layers/jax/misc.py +14 -0
- tpu_inference/layers/jax/moe/__init__.py +13 -0
- tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
- tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
- tpu_inference/layers/jax/moe/moe.py +43 -3
- tpu_inference/layers/jax/pp_utils.py +53 -0
- tpu_inference/layers/jax/rope.py +14 -0
- tpu_inference/layers/jax/rope_interface.py +14 -0
- tpu_inference/layers/jax/sample/__init__.py +13 -0
- tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
- tpu_inference/layers/jax/sample/sampling.py +15 -1
- tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
- tpu_inference/layers/jax/transformer_block.py +14 -0
- tpu_inference/layers/vllm/__init__.py +13 -0
- tpu_inference/layers/vllm/attention.py +4 -4
- tpu_inference/layers/vllm/fused_moe.py +210 -260
- tpu_inference/layers/vllm/linear_common.py +57 -22
- tpu_inference/layers/vllm/quantization/__init__.py +16 -0
- tpu_inference/layers/vllm/quantization/awq.py +15 -1
- tpu_inference/layers/vllm/quantization/common.py +33 -18
- tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +18 -3
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +211 -148
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +14 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +14 -0
- tpu_inference/layers/vllm/quantization/fp8.py +118 -0
- tpu_inference/layers/vllm/quantization/mxfp4.py +278 -209
- tpu_inference/layers/vllm/quantization/unquantized.py +134 -86
- tpu_inference/layers/vllm/sharding.py +21 -4
- tpu_inference/lora/__init__.py +13 -0
- tpu_inference/lora/torch_lora_ops.py +8 -13
- tpu_inference/models/__init__.py +13 -0
- tpu_inference/models/common/__init__.py +13 -0
- tpu_inference/models/common/model_loader.py +74 -35
- tpu_inference/models/jax/__init__.py +13 -0
- tpu_inference/models/jax/deepseek_v3.py +267 -157
- tpu_inference/models/jax/gpt_oss.py +26 -10
- tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
- tpu_inference/models/jax/llama3.py +99 -36
- tpu_inference/models/jax/llama4.py +14 -0
- tpu_inference/models/jax/llama_eagle3.py +14 -0
- tpu_inference/models/jax/llama_guard_4.py +15 -1
- tpu_inference/models/jax/qwen2.py +17 -2
- tpu_inference/models/jax/qwen2_5_vl.py +18 -4
- tpu_inference/models/jax/qwen3.py +17 -2
- tpu_inference/models/jax/utils/__init__.py +13 -0
- tpu_inference/models/jax/utils/file_utils.py +14 -0
- tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
- tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
- tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +88 -25
- tpu_inference/models/jax/utils/weight_utils.py +39 -2
- tpu_inference/models/vllm/__init__.py +13 -0
- tpu_inference/models/vllm/vllm_model_wrapper.py +20 -3
- tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
- tpu_inference/platforms/__init__.py +14 -0
- tpu_inference/platforms/tpu_platform.py +47 -64
- tpu_inference/runner/__init__.py +13 -0
- tpu_inference/runner/compilation_manager.py +72 -37
- tpu_inference/runner/kv_cache.py +54 -20
- tpu_inference/runner/kv_cache_manager.py +45 -15
- tpu_inference/runner/lora_utils.py +14 -0
- tpu_inference/runner/multimodal_manager.py +15 -1
- tpu_inference/runner/persistent_batch_manager.py +14 -0
- tpu_inference/runner/speculative_decoding_manager.py +14 -0
- tpu_inference/runner/structured_decoding_manager.py +14 -0
- tpu_inference/runner/tpu_runner.py +41 -16
- tpu_inference/spec_decode/__init__.py +13 -0
- tpu_inference/spec_decode/jax/__init__.py +13 -0
- tpu_inference/spec_decode/jax/eagle3.py +13 -0
- tpu_inference/tpu_info.py +14 -0
- tpu_inference/utils.py +42 -36
- tpu_inference/worker/__init__.py +13 -0
- tpu_inference/worker/tpu_worker.py +63 -50
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/METADATA +11 -9
- tpu_inference-0.13.0rc2.post7.dist-info/RECORD +261 -0
- tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
- tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
- tpu_inference-0.11.1.dev202512030818.dist-info/RECORD +0 -174
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/WHEEL +0 -0
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/top_level.txt +0 -0
tpu_inference/layers/jax/attention/deepseek_v3_attention.py
CHANGED

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import math
 from dataclasses import InitVar, dataclass
 from typing import Any, Tuple
@@ -6,14 +20,18 @@ import jax
 import jax.numpy as jnp
 from flax import nnx
 from flax.typing import Sharding
-from jax.experimental import shard_map
 from jax.sharding import Mesh
 from jax.sharding import PartitionSpec as P
 
 from tpu_inference import utils
+from tpu_inference.kernels.mla.v1.kernel import mla_ragged_paged_attention
 from tpu_inference.kernels.ragged_paged_attention.v3.kernel import \
     ragged_paged_attention
+from tpu_inference.kernels.ragged_paged_attention.v3.tuned_block_sizes import \
+    get_tuned_block_sizes
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.layers.common.quantization import quantize_kv
+from tpu_inference.layers.common.sharding import ShardingAxisName
 from tpu_inference.layers.jax.base import create_param
 from tpu_inference.layers.jax.layers import RMSNorm
 from tpu_inference.layers.jax.rope import DeepseekScalingRotaryEmbedding
@@ -48,8 +66,9 @@ class MLA(nnx.Module):
     rms_norm_eps: float
 
     # Sharding attributes
-
+    rd_sharding: Sharding = ()
     q_da_sharding: Sharding = ()
+    ap_sharding: Sharding = ()
     anh_sharding: Sharding = ()
     kv_da_sharding: Sharding = ()
 
@@ -66,6 +85,7 @@ class MLA(nnx.Module):
     rope_input_ordering: str = "split"
     quant: Any | None = None
     rope_mscale_all_dim: float = 1.0
+    use_mla_kernel: bool = False
 
     rngs: InitVar[nnx.Rngs]
 
@@ -77,10 +97,10 @@ class MLA(nnx.Module):
         self.N = self.num_attention_heads
         self.K = self.num_key_value_heads
         self.D = self.hidden_size
-
         self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
-
 
+        if not self.use_mla_kernel:
+            assert self.N == self.K, "N and K must be equal for MLA"
 
         if self.rope_scaling["factor"] <= 1.0:
             yarn_mscale = 1.0
@@ -108,10 +128,10 @@ class MLA(nnx.Module):
            self.q_da_sharding,
            self.dtype,
            random_init=self.random_init)
-        self.
+        self.kernel_q_up_proj_AP = create_param(
            rngs,
-           (self.q_lora_rank, self.N
-           self.
+           (self.q_lora_rank, self.N * self.qk_head_dim),
+           self.ap_sharding,
            self.dtype,
            random_init=self.random_init,
        )
@@ -122,17 +142,38 @@ class MLA(nnx.Module):
            self.dtype,
            random_init=self.random_init,
        )
-
-
-
-
-
-        self.
-
-
-
-
-
+        # NOTE (jacobplatin): we are keeping these variables as 3D because
+        # we would need to reshape them before the below projection,
+        # which caused issues as Qwix wasn't quantizing it correctly
+        # on the abstract pass
+        if self.use_mla_kernel:
+            self.kernel_k_up_proj_ANH = create_param(
+                rngs,
+                (self.kv_lora_rank, self.N, self.qk_nope_head_dim),
+                self.anh_sharding,
+                self.dtype,
+                random_init=self.random_init,
+            )
+            self.kernel_v_up_proj_ANH = create_param(
+                rngs,
+                (self.kv_lora_rank, self.N, self.v_head_dim),
+                self.anh_sharding,
+                self.dtype,
+                random_init=self.random_init,
+            )
+        else:
+            self.kernel_kv_up_proj_AL = create_param(
+                rngs,
+                (self.kv_lora_rank, self.N *
+                 (self.qk_nope_head_dim + self.v_head_dim)),
+                self.
+                ap_sharding,  # NOTE: we use the same sharding for kv_up_proj_AL and kernel_q_up_proj_AP
+                self.dtype,
+                random_init=self.random_init,
+            )
+        self.kernel_o_proj_RD = create_param(
+            rngs, (self.N * self.v_head_dim, self.D),
+            self.rd_sharding,
            self.dtype,
            random_init=self.random_init)
        self.q_rms_norm = RMSNorm(
@@ -188,17 +229,24 @@ class MLA(nnx.Module):
             q_TA = jnp.einsum("TD,DA -> TA", x_q_TD,
                               self.kernel_q_down_proj_DA.value)
             q_TA = self.q_rms_norm(q_TA)
-            # Query up projection.
-
-
+            # Query up projection, then reshape to TNH.
+            q_TP = jnp.einsum("TA,AP -> TP", q_TA,
+                              self.kernel_q_up_proj_AP.value)
+            q_TNH = q_TP.reshape(q_TA.shape[0], self.N, self.qk_head_dim)
             # Split the query into nope and rope.
             q_nope_TNH = q_TNH[..., :self.qk_nope_head_dim]
             q_rope_TNH = q_TNH[..., self.qk_nope_head_dim:]
             q_rope_TNH = self.rope.apply_rope(md.input_positions, q_rope_TNH)
-
-
-
-
+            if self.use_mla_kernel:
+                # Absorb the k up-projection matrix into q
+                q_TNA = jnp.einsum("TNH,ANH -> TNA", q_nope_TNH,
+                                   self.kernel_k_up_proj_ANH.value)
+                q_TNA = nnx.with_sharding_constraint(q_TNA, self.query_tnh)
+            else:
+                # Concatenate the nope and rope queries.
+                q_TNH = jnp.concatenate([q_nope_TNH, q_rope_TNH], axis=-1)
+                # Multiply the query by scaling factor
+                q_TNH = nnx.with_sharding_constraint(q_TNH, self.query_tnh)
 
         with jax.named_scope("kv_proj"):
             # KV down projection.
@@ -209,21 +257,30 @@ class MLA(nnx.Module):
             # Reshape k_rope_BSH to include head dimension for RoPE application
             k_rope_SNH = k_rope_SH[..., None, :]
             k_rope_SNH = self.rope.apply_rope(md.input_positions, k_rope_SNH)
-            k_rope_SNH
-
-
+            assert k_rope_SNH.shape[1] == 1
+            k_rope_SH = k_rope_SNH[:, 0, :]
+
             kv_SA = kv_SA[..., :self.kv_lora_rank]
             kv_SA = self.kv_rms_norm(kv_SA)
-
-
-
-
-
-
-
-
-
-
+            kv_SA = nnx.with_sharding_constraint(kv_SA, self.keyvalue_skh)
+
+            if not self.use_mla_kernel:
+                k_rope_SNH = jnp.broadcast_to(
+                    k_rope_SNH,
+                    (k_rope_SNH.shape[0], self.N, self.qk_rope_head_dim))
+                # KV up projection, then reshape to SN(Hk+Hv).
+                kv_SL = jnp.einsum("SA,AL -> SL", kv_SA,
+                                   self.kernel_kv_up_proj_AL.value)
+                kv_nope_SNH = kv_SL.reshape(
+                    kv_SA.shape[0], self.N,
+                    self.qk_nope_head_dim + self.v_head_dim)
+                # Split the latent kv vector into k nope vector and v vector.
+                k_nope_SNH = kv_nope_SNH[..., :self.qk_nope_head_dim]
+                v_SNH = kv_nope_SNH[..., self.qk_nope_head_dim:]
+                # Concatenate the key vector.
+                k_SNH = jnp.concatenate([k_nope_SNH, k_rope_SNH], axis=-1)
+                k_SNH = nnx.with_sharding_constraint(k_SNH, self.keyvalue_skh)
+                v_SNH = nnx.with_sharding_constraint(v_SNH, self.keyvalue_skh)
 
         with jax.named_scope("attn_op"):
             # TODO(wenxindongwork): K and V have different head dimension,
@@ -234,44 +291,67 @@ class MLA(nnx.Module):
             # q, k, v head dimension to be multiple of 128. For now, we will
             # pad the q, k, v dimension to multiple of 128.
             # We should update the MLA kv cache implementation in the future.
-
-
-
-
-
-
-
+            if not self.use_mla_kernel:  # MLA kernel handles padding
+                multiple_of_128 = ((self.qk_head_dim - 1) // 128 + 1) * 128
+                q_TNH = jnp.pad(q_TNH,
+                                ((0, 0), (0, 0),
+                                 (0, multiple_of_128 - self.qk_head_dim)))
+                k_SNH = jnp.pad(k_SNH,
+                                ((0, 0), (0, 0),
+                                 (0, multiple_of_128 - self.qk_head_dim)))
+                v_SNH = jnp.pad(v_SNH,
+                                ((0, 0), (0, 0),
+                                 (0, multiple_of_128 - self.v_head_dim)))
+
             q_scale = k_scale = v_scale = None
-            if self.kv_cache_quantized_dtype:
-                # TODO(kyuyeunk/jacobplatin): Enable w8a8 when VREG spill issue is resolved.
-                # q_scale = self._q_scale
-                k_scale = self._k_scale
-                v_scale = self._v_scale
-                k_SNH, v_SNH = utils.quantize_kv(k_SNH, v_SNH,
-                                                 self.kv_cache_quantized_dtype,
-                                                 k_scale, v_scale)
-            new_kv_cache, outputs_TNH = self.attention(
-                is_prefill,
-                kv_cache,
-                q_TNH,
-                k_SNH,
-                v_SNH,
-                attention_metadata,
-                self.mesh,
-                q_scale,
-                k_scale,
-                v_scale,
-            )
-            # TODO(wenxindongwork): For now, unpad the outputs_TNH to match the v_head_dim.
-            # We shall add the MLA kv cache implementation in the future.
-            outputs_TNH = outputs_TNH[..., :self.v_head_dim]
 
-
-
-
-
-
-
+            # TODO(gpolovets): MLA does not currently support quantized KV!
+            if not self.use_mla_kernel:
+                if self.kv_cache_quantized_dtype:
+                    # TODO(kyuyeunk/jacobplatin): Enable w8a8 when VREG spill issue is resolved.
+                    k_scale = self._k_scale
+                    v_scale = self._v_scale
+                    k_SNH, v_SNH = quantize_kv(self.kv_cache_quantized_dtype,
+                                               k_SNH, v_SNH, k_scale, v_scale)
+
+                new_kv_cache, outputs_TNH = self.attention(
+                    is_prefill,
+                    kv_cache,
+                    q_TNH,
+                    k_SNH,
+                    v_SNH,
+                    attention_metadata,
+                    self.mesh,
+                    q_scale,
+                    k_scale,
+                    v_scale,
+                )
+                # TODO(wenxindongwork): For now, unpad the outputs_TNH to match the v_head_dim.
+                # We shall add the MLA kv cache implementation in the future.
+                outputs_TNH = outputs_TNH[..., :self.v_head_dim]
+
+            else:
+                new_kv_cache, outputs_TNA = self.mla_attention(
+                    kv_cache,
+                    q_TNA,
+                    q_rope_TNH,
+                    kv_SA,
+                    k_rope_SH,
+                    attention_metadata,
+                    self.mesh,
+                )
+                outputs_TNH = jnp.einsum("TNA,ANH -> TNH", outputs_TNA,
+                                         self.kernel_v_up_proj_ANH.value)
+
+        with jax.named_scope("o_proj"):
+            outputs_TNH = nnx.with_sharding_constraint(
+                outputs_TNH, self.activation_attention_out_td)
+            outputs_TR = outputs_TNH.reshape(outputs_TNH.shape[0],
+                                             self.N * self.v_head_dim)
+            o_TD = jnp.einsum("TR,RD -> TD", outputs_TR,
+                              self.kernel_o_proj_RD.value)
+
+            return new_kv_cache, o_TD
 
     def attention(
         self,
@@ -326,21 +406,22 @@ class MLA(nnx.Module):
         out_specs = (self.attn_o_tnh, P(None, None, "model"))
 
         def _ragged_paged_attention(*args):
-
+            outputs = ragged_paged_attention(
                *args,
                sm_scale=self.scale,
                q_scale=q_scale,
                k_scale=k_scale,
                v_scale=v_scale,
            )
+            return outputs
 
        output_TNH, kv_cache = jax.jit(
-
+            jax.shard_map(
                _ragged_paged_attention,
                mesh=mesh,
                in_specs=in_specs,
                out_specs=out_specs,
-
+                check_vma=False,
            ))(
                q_TNH,
                k_SKH,
@@ -352,3 +433,115 @@ class MLA(nnx.Module):
            md.request_distribution,
        )
        return kv_cache, output_TNH
+
+    def mla_attention(
+        self,
+        kv_cache: KVCache,
+        q_TNA: jax.Array,
+        q_rope_TNH: jax.Array,
+        k_SA: jax.Array,
+        k_rope_SH: jax.Array,
+        attention_metadata: AttentionMetadata,
+        mesh: Mesh,
+    ) -> Tuple[KVCache, jax.Array]:
+        """Performs scaled dot-product attention and updates the KV cache.
+
+        This function handles the core attention logic, which varies between
+        prefill and generation modes. In prefill, it computes self-attention
+        over the input sequence with a causal mask. In generation, it attends
+        to the full history of keys and values stored in the cache.
+
+        Args:
+            kv_cache: The key-value cache to be updated and used.
+            q_TNA: Query tensor of shape `(query_seq, num_attention_heads, lkv_dim)`.
+            q_rope_TNH: Query rope tensor of shape `(query_seq, num_attention_heads, rope_dim)`.
+            k_SA: Key tensor of shape `(kv_seq, lkv_dim)`.
+            k_rope_SH: Key rope tensor of shape `(kv_seq, rope_dim)`.
+            attention_metadata: Metadata containing sequence lengths.
+            mesh: The JAX device mesh (unused in this specific function but
+                kept for potential future use or API consistency).
+            q_scale: Quantization scale for q.
+            k_scale: Quantization scale for k.
+            v_scale: Quantization scale for v.
+
+        Returns:
+            A tuple containing:
+            - The updated KV cache.
+            - The attention output tensor of shape
+              `(seq, num_q_heads, head_dim)`.
+        """
+        md = attention_metadata
+        in_specs = (
+            self.query_tnh,  # q
+            self.query_tnh,  # q_rope
+            self.keyvalue_skh,  # k
+            self.keyvalue_skh,  # k_rope
+            P(ShardingAxisName.MLP_TENSOR),  # kv_cache
+            P(ShardingAxisName.ATTN_DATA),  # md.seq_lens: Replicated
+            P(ShardingAxisName.ATTN_DATA),  # page_indices_flat: Replicated
+            P(ShardingAxisName.ATTN_DATA),  # query_start_loc: Replicated
+            P(ShardingAxisName.ATTN_DATA),  # distribution: Replicated
+        )
+
+        out_specs = (self.attn_o_tnh, P(ShardingAxisName.MLP_TENSOR))
+
+        def _mla_ragged_paged_attention(q, q_rope, k, k_rope, kv_cache, *args):
+
+            def _initialize_block_sizes():
+                # Set reasonable starting estimates for block sizes. (TODO(gpolovets): update this to use tuned sizes)
+                # Referring to get_tuned_block_sizes() in kernels/ragged_paged_attention/v3/tuned_block_sizes.py: 'TPU v7'/128/'q_bfloat16_kv_bfloat16/q_head-128_kv_head-1_head-128'/4096
+                max_num_tokens = q.shape[0]
+                max_num_seqs = md.seq_lens.shape[0]
+                num_page_indices = md.block_tables.shape[0]
+                assert num_page_indices % max_num_seqs == 0
+                pages_per_seq = num_page_indices // max_num_seqs
+                # num_kv_pages_per_block = min(pages_per_seq, 16)
+                bkv_p, bq_sz = get_tuned_block_sizes(
+                    q.dtype,
+                    kv_cache.dtype,
+                    self.num_attention_heads,
+                    1,
+                    self.qk_nope_head_dim,
+                    kv_cache.shape[1],  # page size
+                    max_num_tokens,
+                    pages_per_seq,
+                )
+                num_kv_pages_per_block = min(min(pages_per_seq, bkv_p), 4)
+                num_queries_per_block = min(min(max_num_tokens, bq_sz),
+                                            4)  # OOMS at 8
+                return num_kv_pages_per_block, num_queries_per_block
+
+            num_kv_pages_per_block, num_queries_per_block = _initialize_block_sizes(
+            )
+            output, kv_cache = mla_ragged_paged_attention(
+                q,
+                q_rope,
+                k,
+                k_rope,
+                kv_cache,
+                *args,
+                sm_scale=self.scale,
+                num_kv_pages_per_block=num_kv_pages_per_block,
+                num_queries_per_block=num_queries_per_block)
+
+            return kv_cache, output
+
+        kv_cache, output_TNH = jax.jit(
+            jax.shard_map(
+                _mla_ragged_paged_attention,
+                mesh=mesh,
+                in_specs=in_specs,
+                out_specs=out_specs,
+                check_vma=False,
+            ), )(
+                q_TNA,
+                q_rope_TNH,
+                k_SA,
+                k_rope_SH,
+                kv_cache,
+                md.seq_lens,
+                md.block_tables,
+                md.query_start_loc,
+                md.request_distribution,
+            )
+        return kv_cache, output_TNH
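The `use_mla_kernel` branch above leans on the MLA weight-absorption identity: attention scores are bilinear in q and k, so the k up-projection can be folded into the query once per step, letting the kernel score queries directly against the compressed latent cache (`kv_SA`) and defer the v up-projection until after attention. A minimal self-contained sketch of that equivalence, with toy shapes and the file's T/S/N/A/H dimension letters (illustrative code, not part of the package):

import jax
import jax.numpy as jnp

T, S, N, A, H = 4, 6, 2, 8, 3  # query tokens, kv positions, heads, latent rank, nope head dim
k0, k1, k2 = jax.random.split(jax.random.PRNGKey(0), 3)
q_TNH = jax.random.normal(k0, (T, N, H))    # nope part of the query
w_k_ANH = jax.random.normal(k1, (A, N, H))  # k up-projection (kernel_k_up_proj_ANH)
kv_SA = jax.random.normal(k2, (S, A))       # latent KV-cache entries

# Explicit path: materialize full per-head keys, then score q against k.
k_SNH = jnp.einsum("SA,ANH -> SNH", kv_SA, w_k_ANH)
scores_explicit = jnp.einsum("TNH,SNH -> TNS", q_TNH, k_SNH)

# Absorbed path (what the kernel branch uses): fold W_k into q once,
# then score directly against the small latent cache.
q_TNA = jnp.einsum("TNH,ANH -> TNA", q_TNH, w_k_ANH)
scores_absorbed = jnp.einsum("TNA,SA -> TNS", q_TNA, kv_SA)

assert jnp.allclose(scores_explicit, scores_absorbed, atol=1e-5)

Absorption replaces materializing (S, N, H) keys for every cached position with a single (T, N, A) projection of the query, which is what makes caching only the A-dimensional latents worthwhile.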
tpu_inference/layers/jax/attention/gpt_oss_attention.py
CHANGED

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from dataclasses import InitVar, dataclass
 from typing import Tuple
 
@@ -5,7 +19,6 @@ import jax
 import jax.numpy as jnp
 from flax import nnx
 from flax.typing import Sharding
-from jax.experimental import shard_map
 from jax.sharding import Mesh
 from jax.sharding import PartitionSpec as P
 
@@ -13,6 +26,7 @@ from tpu_inference import utils
 from tpu_inference.kernels.ragged_paged_attention.v3.kernel_hd64 import \
     ragged_paged_attention_hd64
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.layers.common.quantization import quantize_kv
 from tpu_inference.layers.jax.base import create_param
 from tpu_inference.layers.jax.rope import GptOssRotaryEmbedding
 
@@ -158,17 +172,17 @@ class GptOssAttention(nnx.Module):
     ) -> Tuple[KVCache, jax.Array]:
         """Performs scaled dot-product attention by calling the ragged_paged_attention kernel."""
         md = attention_metadata
-        kv_cache_spec = P(
+        kv_cache_spec = P("data", None, "model")
 
         in_specs = (
             self.query_tnh,  # q
             self.keyvalue_skh,  # k
             self.keyvalue_skh,  # v
             kv_cache_spec,  # kv_cache
-            P(),  # md.seq_lens
-            P(),  # page_indices_flat
-            P(),  # query_start_loc
-            P(),  # distribution
+            P("data"),  # md.seq_lens
+            P("data"),  # page_indices_flat
+            P("data"),  # query_start_loc
+            P("data"),  # distribution
             P(('model')),  # sinks
         )
         out_specs = (self.attn_o_tnh, kv_cache_spec)
@@ -185,12 +199,12 @@ class GptOssAttention(nnx.Module):
         )
 
         output_TNH, kv_cache = jax.jit(
-
+            jax.shard_map(
                _ragged_paged_attention_wrapper,
                mesh=mesh,
                in_specs=in_specs,
                out_specs=out_specs,
-
+                check_vma=False,
            ))(
                q_TNH,
                k_SKH,
@@ -235,9 +249,8 @@ class GptOssAttention(nnx.Module):
             # q_scale = self._q_scale
             k_scale = self._k_scale
             v_scale = self._v_scale
-            k_TKH, v_TKH =
-
-                k_scale, v_scale)
+            k_TKH, v_TKH = quantize_kv(self.kv_cache_quantized_dtype, k_TKH,
+                                       v_TKH, k_scale, v_scale)
 
         with jax.named_scope("attn_op"):
             new_kv_cache, attn_out_TNH = self.attention(
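Two mechanical changes recur across these attention files: the shard-mapped kernel wrappers now go through the public `jax.shard_map` entry point instead of `jax.experimental.shard_map`, and they pass `check_vma=False`. The removed flag at the same position did not survive extraction here; in recent JAX releases `check_vma` supersedes the older `check_rep` argument, so that is the likely predecessor. A minimal self-contained sketch of the pattern under that assumption (not code from the package):

import jax
import jax.numpy as jnp
from jax.sharding import Mesh
from jax.sharding import PartitionSpec as P

mesh = Mesh(jax.devices(), ("model", ))

def per_shard_double(x):
    # Runs on each "model" shard's local block of x.
    return x * 2.0

doubled = jax.jit(
    jax.shard_map(
        per_shard_double,
        mesh=mesh,
        in_specs=P("model"),
        out_specs=P("model"),
        check_vma=False,  # assumed successor of the old check_rep flag
    ))(jnp.arange(4.0 * len(jax.devices())))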
tpu_inference/layers/jax/attention/llama4_attention.py
CHANGED

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from dataclasses import dataclass
 
 import jax
@@ -5,8 +19,8 @@ import jax.numpy as jnp
 from flax import nnx
 from jax.sharding import Sharding
 
-from tpu_inference import utils
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.layers.common.quantization import quantize_kv
 from tpu_inference.layers.jax.attention.attention import Attention, KVCache
 from tpu_inference.layers.jax.rope_interface import apply_rope
 from tpu_inference.logger import init_logger
@@ -114,9 +128,8 @@ class Llama4Attention(Attention):
             # q_scale = self._q_scale
             k_scale = self._k_scale
             v_scale = self._v_scale
-            k_SKH, v_SKH =
-
-                k_scale, v_scale)
+            k_SKH, v_SKH = quantize_kv(self.kv_cache_quantized_dtype, k_SKH,
+                                       v_SKH, k_scale, v_scale)
 
         with jax.named_scope("attn_op"):
             new_kv_cache, outputs_TNH = self.attention(
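All three attention files also migrate from `utils.quantize_kv(k, v, dtype, k_scale, v_scale)` to the new `quantize_kv(dtype, k, v, k_scale, v_scale)` helper in `tpu_inference.layers.common.quantization`, a module added in this release (+270 lines) whose implementation is not shown in this diff. A hypothetical sketch of what such a helper plausibly does, matching only the new call sites; the body below is illustrative, not the package's code:

import jax.numpy as jnp

def quantize_kv(quantized_dtype, k, v, k_scale=None, v_scale=None):
    """Scale k/v and cast them to the KV-cache storage dtype (hypothetical)."""

    def _quantize(x, scale):
        if scale is not None:
            x = x / scale  # per-tensor scale keeps values inside the target range
        if jnp.issubdtype(quantized_dtype, jnp.integer):
            info = jnp.iinfo(quantized_dtype)
            x = jnp.round(x)
        else:
            info = jnp.finfo(quantized_dtype)
        return jnp.clip(x, info.min, info.max).astype(quantized_dtype)

    return _quantize(k, k_scale), _quantize(v, v_scale)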
tpu_inference/layers/jax/base.py
CHANGED

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import dataclasses
 from dataclasses import dataclass, fields
 from typing import Any, Callable, Mapping
tpu_inference/layers/jax/constants.py
CHANGED

@@ -1,3 +1,16 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 Current Used Abbreviation for Tensor Dimensions:
 B: Batch size
tpu_inference/layers/jax/layers.py
CHANGED

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from dataclasses import InitVar, dataclass
 from typing import Any
 
tpu_inference/layers/jax/misc.py
CHANGED

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import math
 from typing import Tuple
 
tpu_inference/layers/jax/moe/__init__.py
ADDED

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.