tpu-inference 0.12.0.dev20251222__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/__init__.py +13 -0
- tests/core/__init__.py +13 -0
- tests/core/test_core_tpu.py +513 -0
- tests/core/test_disagg_executor.py +60 -0
- tests/core/test_disagg_utils.py +67 -0
- tests/core/test_dp_scheduler.py +724 -0
- tests/core/test_init.py +63 -0
- tests/distributed/__init__.py +13 -0
- tests/distributed/test_distributed_utils.py +120 -0
- tests/distributed/test_tpu_connector.py +478 -0
- tests/e2e/__init__.py +13 -0
- tests/e2e/test_async_scheduler.py +211 -0
- tests/e2e/test_data_parallel.py +393 -0
- tests/e2e/test_local_disagg.py +257 -0
- tests/e2e/test_model_loader.py +268 -0
- tests/e2e/test_multi_modal_inference.py +111 -0
- tests/e2e/test_pipeline_parallel.py +265 -0
- tests/e2e/test_runai_model_streamer_loader.py +104 -0
- tests/e2e/test_sampling_params.py +269 -0
- tests/e2e/test_speculative_decoding.py +291 -0
- tests/e2e/test_structured_decoding.py +46 -0
- tests/executors/__init__.py +13 -0
- tests/executors/test_ray_distributed_executor.py +199 -0
- tests/experimental/__init__.py +13 -0
- tests/experimental/test_llama3_jax_stashed.py +208 -0
- tests/kernels/__init__.py +13 -0
- tests/kernels/collectives/__init__.py +13 -0
- tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
- tests/kernels/fused_moe_v1_test.py +388 -0
- tests/kernels/gmm_test.py +205 -0
- tests/kernels/mla_v1_test.py +498 -0
- tests/kernels/quantized_matmul_kernel_test.py +159 -0
- tests/kernels/ragged_kv_cache_update_v2_test.py +248 -0
- tests/kernels/ragged_paged_attention_kernel_v2_test.py +414 -0
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +565 -0
- tests/kernels/ragged_paged_attention_kernel_v3_test.py +520 -0
- tests/layers/__init__.py +13 -0
- tests/layers/common/__init__.py +13 -0
- tests/layers/common/test_attention_interface.py +156 -0
- tests/layers/common/test_quantization.py +149 -0
- tests/layers/jax/__init__.py +13 -0
- tests/layers/jax/attention/__init__.py +13 -0
- tests/layers/jax/attention/test_common_attention.py +103 -0
- tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
- tests/layers/jax/attention/test_llama4_attention.py +135 -0
- tests/layers/jax/moe/__init__.py +13 -0
- tests/layers/jax/moe/test_deepseek_moe.py +235 -0
- tests/layers/jax/sample/__init__.py +13 -0
- tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
- tests/layers/jax/sample/test_sampling.py +115 -0
- tests/layers/jax/sample/test_sampling_metadata.py +254 -0
- tests/layers/jax/test_layers.py +155 -0
- tests/layers/jax/test_qwix.py +969 -0
- tests/layers/jax/test_rope.py +93 -0
- tests/layers/jax/test_sharding.py +159 -0
- tests/layers/jax/test_transformer_block.py +152 -0
- tests/layers/vllm/__init__.py +13 -0
- tests/layers/vllm/test_attention.py +363 -0
- tests/layers/vllm/test_awq.py +405 -0
- tests/layers/vllm/test_compressed_tensors_moe.py +202 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +403 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +426 -0
- tests/layers/vllm/test_fp8.py +17 -0
- tests/layers/vllm/test_mxfp4.py +297 -0
- tests/layers/vllm/test_unquantized.py +621 -0
- tests/layers/vllm/utils.py +72 -0
- tests/lora/__init__.py +13 -0
- tests/lora/conftest.py +46 -0
- tests/lora/test_bgmv.py +57 -0
- tests/lora/test_layers.py +666 -0
- tests/lora/test_lora.py +147 -0
- tests/lora/test_lora_perf.py +67 -0
- tests/lora/utils.py +88 -0
- tests/models/__init__.py +13 -0
- tests/models/common/__init__.py +13 -0
- tests/models/common/test_model_loader.py +455 -0
- tests/models/jax/__init__.py +13 -0
- tests/models/jax/test_deepseek_v3.py +401 -0
- tests/models/jax/test_llama3.py +184 -0
- tests/models/jax/test_llama4.py +298 -0
- tests/models/jax/test_llama_eagle3.py +197 -0
- tests/models/jax/test_llama_guard_4.py +242 -0
- tests/models/jax/test_qwen2.py +172 -0
- tests/models/jax/test_qwen2_5_vl.py +606 -0
- tests/models/jax/test_qwen3.py +169 -0
- tests/models/jax/test_weight_loading.py +180 -0
- tests/models/jax/utils/__init__.py +13 -0
- tests/models/jax/utils/test_multi_modal_utils.py +212 -0
- tests/platforms/__init__.py +13 -0
- tests/platforms/test_tpu_platform.py +54 -0
- tests/runner/__init__.py +13 -0
- tests/runner/test_block_table.py +395 -0
- tests/runner/test_input_batch.py +226 -0
- tests/runner/test_kv_cache.py +220 -0
- tests/runner/test_kv_cache_manager.py +498 -0
- tests/runner/test_multimodal_manager.py +429 -0
- tests/runner/test_persistent_batch_manager.py +84 -0
- tests/runner/test_speculative_decoding_manager.py +368 -0
- tests/runner/test_structured_decoding_manager.py +220 -0
- tests/runner/test_tpu_runner.py +202 -0
- tests/runner/test_tpu_runner_dp.py +1033 -0
- tests/runner/test_tpu_runner_mesh.py +200 -0
- tests/runner/test_utils.py +411 -0
- tests/spec_decode/__init__.py +13 -0
- tests/spec_decode/test_eagle3.py +311 -0
- tests/test_base.py +215 -0
- tests/test_envs.py +280 -0
- tests/test_tpu_info.py +134 -0
- tests/test_utils.py +193 -0
- tests/worker/__init__.py +13 -0
- tests/worker/tpu_worker_test.py +414 -0
- tpu_inference/__init__.py +67 -0
- tpu_inference/core/__init__.py +13 -0
- tpu_inference/core/core_tpu.py +786 -0
- tpu_inference/core/disagg_executor.py +118 -0
- tpu_inference/core/disagg_utils.py +49 -0
- tpu_inference/core/sched/__init__.py +13 -0
- tpu_inference/core/sched/dp_scheduler.py +814 -0
- tpu_inference/distributed/__init__.py +13 -0
- tpu_inference/distributed/jax_parallel_state.py +81 -0
- tpu_inference/distributed/tpu_connector.py +732 -0
- tpu_inference/distributed/utils.py +112 -0
- tpu_inference/env_override.py +9 -0
- tpu_inference/envs.py +191 -0
- tpu_inference/executors/__init__.py +13 -0
- tpu_inference/executors/ray_distributed_executor.py +399 -0
- tpu_inference/experimental/__init__.py +13 -0
- tpu_inference/experimental/llama3_jax_stashed.py +272 -0
- tpu_inference/kernels/__init__.py +13 -0
- tpu_inference/kernels/collectives/__init__.py +13 -0
- tpu_inference/kernels/collectives/all_gather_matmul.py +741 -0
- tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +65 -0
- tpu_inference/kernels/collectives/util.py +47 -0
- tpu_inference/kernels/flash_attention/__init__.py +13 -0
- tpu_inference/kernels/flash_attention/kernel.py +772 -0
- tpu_inference/kernels/fused_moe/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/kernel.py +1612 -0
- tpu_inference/kernels/megablox/__init__.py +13 -0
- tpu_inference/kernels/megablox/common.py +54 -0
- tpu_inference/kernels/megablox/gmm.py +646 -0
- tpu_inference/kernels/mla/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/kernel.py +1340 -0
- tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
- tpu_inference/kernels/quantized_matmul/kernel.py +456 -0
- tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
- tpu_inference/kernels/quantized_matmul/util.py +58 -0
- tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +876 -0
- tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +288 -0
- tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
- tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1594 -0
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +1586 -0
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +4460 -0
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +548 -0
- tpu_inference/kernels/ragged_paged_attention/v3/util.py +65 -0
- tpu_inference/layers/__init__.py +13 -0
- tpu_inference/layers/common/__init__.py +13 -0
- tpu_inference/layers/common/attention_interface.py +403 -0
- tpu_inference/layers/common/attention_metadata.py +48 -0
- tpu_inference/layers/common/binary_search.py +295 -0
- tpu_inference/layers/common/quant_methods.py +23 -0
- tpu_inference/layers/common/quantization.py +270 -0
- tpu_inference/layers/common/sharding.py +600 -0
- tpu_inference/layers/jax/__init__.py +13 -0
- tpu_inference/layers/jax/attention/__init__.py +13 -0
- tpu_inference/layers/jax/attention/attention.py +268 -0
- tpu_inference/layers/jax/attention/deepseek_v3_attention.py +547 -0
- tpu_inference/layers/jax/attention/gpt_oss_attention.py +275 -0
- tpu_inference/layers/jax/attention/llama4_attention.py +167 -0
- tpu_inference/layers/jax/base.py +165 -0
- tpu_inference/layers/jax/constants.py +101 -0
- tpu_inference/layers/jax/layers.py +315 -0
- tpu_inference/layers/jax/misc.py +30 -0
- tpu_inference/layers/jax/moe/__init__.py +13 -0
- tpu_inference/layers/jax/moe/deepseek_v3_moe.py +615 -0
- tpu_inference/layers/jax/moe/gpt_oss_moe.py +199 -0
- tpu_inference/layers/jax/moe/moe.py +249 -0
- tpu_inference/layers/jax/pp_utils.py +53 -0
- tpu_inference/layers/jax/rope.py +294 -0
- tpu_inference/layers/jax/rope_interface.py +228 -0
- tpu_inference/layers/jax/sample/__init__.py +13 -0
- tpu_inference/layers/jax/sample/rejection_sampler.py +528 -0
- tpu_inference/layers/jax/sample/sampling.py +110 -0
- tpu_inference/layers/jax/sample/sampling_metadata.py +90 -0
- tpu_inference/layers/jax/transformer_block.py +121 -0
- tpu_inference/layers/vllm/__init__.py +13 -0
- tpu_inference/layers/vllm/attention.py +221 -0
- tpu_inference/layers/vllm/fused_moe.py +502 -0
- tpu_inference/layers/vllm/linear_common.py +221 -0
- tpu_inference/layers/vllm/quantization/__init__.py +55 -0
- tpu_inference/layers/vllm/quantization/awq.py +221 -0
- tpu_inference/layers/vllm/quantization/common.py +124 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +135 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +266 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +222 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +150 -0
- tpu_inference/layers/vllm/quantization/fp8.py +118 -0
- tpu_inference/layers/vllm/quantization/mxfp4.py +396 -0
- tpu_inference/layers/vllm/quantization/unquantized.py +416 -0
- tpu_inference/layers/vllm/sharding.py +244 -0
- tpu_inference/logger.py +10 -0
- tpu_inference/lora/__init__.py +13 -0
- tpu_inference/lora/torch_lora_ops.py +98 -0
- tpu_inference/lora/torch_punica_tpu.py +310 -0
- tpu_inference/models/__init__.py +13 -0
- tpu_inference/models/common/__init__.py +13 -0
- tpu_inference/models/common/model_loader.py +520 -0
- tpu_inference/models/jax/__init__.py +13 -0
- tpu_inference/models/jax/deepseek_v3.py +978 -0
- tpu_inference/models/jax/gpt_oss.py +508 -0
- tpu_inference/models/jax/jax_intermediate_tensor.py +93 -0
- tpu_inference/models/jax/llama3.py +436 -0
- tpu_inference/models/jax/llama4.py +643 -0
- tpu_inference/models/jax/llama_eagle3.py +350 -0
- tpu_inference/models/jax/llama_guard_4.py +375 -0
- tpu_inference/models/jax/qwen2.py +390 -0
- tpu_inference/models/jax/qwen2_5_vl.py +1232 -0
- tpu_inference/models/jax/qwen3.py +318 -0
- tpu_inference/models/jax/utils/__init__.py +13 -0
- tpu_inference/models/jax/utils/file_utils.py +110 -0
- tpu_inference/models/jax/utils/multi_modal_utils.py +177 -0
- tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
- tpu_inference/models/jax/utils/qwix/qwix_utils.py +713 -0
- tpu_inference/models/jax/utils/weight_utils.py +621 -0
- tpu_inference/models/vllm/__init__.py +13 -0
- tpu_inference/models/vllm/vllm_model_wrapper.py +307 -0
- tpu_inference/models/vllm/vllm_model_wrapper_context.py +59 -0
- tpu_inference/platforms/__init__.py +16 -0
- tpu_inference/platforms/tpu_platform.py +258 -0
- tpu_inference/runner/__init__.py +13 -0
- tpu_inference/runner/block_table.py +122 -0
- tpu_inference/runner/compilation_manager.py +890 -0
- tpu_inference/runner/input_batch.py +435 -0
- tpu_inference/runner/kv_cache.py +166 -0
- tpu_inference/runner/kv_cache_manager.py +508 -0
- tpu_inference/runner/lora_utils.py +106 -0
- tpu_inference/runner/multimodal_manager.py +231 -0
- tpu_inference/runner/persistent_batch_manager.py +296 -0
- tpu_inference/runner/speculative_decoding_manager.py +262 -0
- tpu_inference/runner/structured_decoding_manager.py +101 -0
- tpu_inference/runner/tpu_runner.py +1768 -0
- tpu_inference/runner/utils.py +426 -0
- tpu_inference/spec_decode/__init__.py +13 -0
- tpu_inference/spec_decode/jax/__init__.py +13 -0
- tpu_inference/spec_decode/jax/eagle3.py +430 -0
- tpu_inference/tpu_info.py +92 -0
- tpu_inference/utils.py +345 -0
- tpu_inference/worker/__init__.py +13 -0
- tpu_inference/worker/tpu_worker.py +468 -0
- tpu_inference-0.12.0.dev20251222.dist-info/METADATA +106 -0
- tpu_inference-0.12.0.dev20251222.dist-info/RECORD +260 -0
- tpu_inference-0.12.0.dev20251222.dist-info/WHEEL +5 -0
- tpu_inference-0.12.0.dev20251222.dist-info/licenses/LICENSE +201 -0
- tpu_inference-0.12.0.dev20251222.dist-info/top_level.txt +2 -0
tpu_inference/layers/jax/sample/sampling_metadata.py
@@ -0,0 +1,90 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+from dataclasses import dataclass
+from typing import Optional
+
+import jax
+import jax.numpy as jnp
+import torch
+from jax.sharding import Mesh
+
+from tpu_inference.runner.input_batch import InputBatch
+from tpu_inference.utils import device_array
+
+DEFAULT_SAMPLING_PARAMS = dict(
+    temperature=-1.0,
+    top_k=0,
+    top_p=1.0,
+)
+
+
+@functools.partial(
+    jax.tree_util.register_dataclass,
+    data_fields=[
+        "temperature",
+        "top_k",
+        "top_p",
+    ],
+    meta_fields=["do_sampling", "logprobs"],
+)
+@dataclass
+class TPUSupportedSamplingMetadata:
+    temperature: Optional[jnp.ndarray] = None
+    top_k: Optional[jnp.ndarray] = None
+    top_p: Optional[jnp.ndarray] = None
+    do_sampling: bool = False
+    logprobs: bool = False
+
+    @classmethod
+    def from_input_batch(
+        cls,
+        mesh: Mesh,
+        input_batch: InputBatch,
+        padded_num_reqs: int,
+        sharding: Optional[jax.sharding.Sharding] = None,
+    ) -> "TPUSupportedSamplingMetadata":
+        needs_logprobs = input_batch.max_num_logprobs > 0 if input_batch.max_num_logprobs else False
+        if input_batch.all_greedy:
+            return cls(do_sampling=False, logprobs=needs_logprobs)
+        num_reqs = input_batch.num_reqs
+
+        def fill_slice(cpu_torch_tensor: torch.Tensor,
+                       fill_val: float) -> torch.Tensor:
+            # Pad value is the default one.
+            cpu_torch_tensor[num_reqs:padded_num_reqs] = fill_val
+            return cpu_torch_tensor
+
+        temp_tensor = fill_slice(input_batch.temperature_cpu,
+                                 DEFAULT_SAMPLING_PARAMS["temperature"])
+        top_k_tensor = fill_slice(input_batch.top_k_cpu,
+                                  DEFAULT_SAMPLING_PARAMS["top_k"])
+        top_p_tensor = fill_slice(input_batch.top_p_cpu,
+                                  DEFAULT_SAMPLING_PARAMS["top_p"])
+
+        # Slice persistent device tensors to a fixed pre-compiled padded shape.
+        return cls(
+            temperature=device_array(mesh,
+                                     temp_tensor[:padded_num_reqs],
+                                     sharding=sharding),
+            top_p=device_array(mesh,
+                               top_p_tensor[:padded_num_reqs],
+                               sharding=sharding),
+            top_k=device_array(mesh,
+                               top_k_tensor[:padded_num_reqs],
+                               sharding=sharding),
+            do_sampling=not input_batch.all_greedy,
+            logprobs=needs_logprobs,
+        )
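To make the padding behaviour above concrete, here is a minimal standalone sketch of the pattern `from_input_batch` follows: live requests keep their sampling parameters, the tail up to the pre-compiled padded batch size is filled with the defaults from `DEFAULT_SAMPLING_PARAMS`, and only the padded prefix is moved to the device so jitted code always sees static shapes. The buffer sizes and values below are hypothetical, and plain `jnp.asarray` stands in for the package's `device_array` helper.

import torch
import jax.numpy as jnp

max_num_reqs = 8      # capacity of the persistent CPU buffer (illustrative)
num_reqs = 3          # live requests in this step
padded_num_reqs = 4   # next pre-compiled padded batch size

temperature_cpu = torch.zeros(max_num_reqs)
temperature_cpu[:num_reqs] = torch.tensor([0.7, 1.0, 0.2])

# Fill the padding slots with the default value (temperature=-1.0).
temperature_cpu[num_reqs:padded_num_reqs] = -1.0

# Only the padded prefix crosses to the device, keeping the shape static
# across steps that share the same padded batch size.
temperature = jnp.asarray(temperature_cpu[:padded_num_reqs].numpy())
print(temperature)  # approximately [ 0.7  1.   0.2 -1. ]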
tpu_inference/layers/jax/transformer_block.py
@@ -0,0 +1,121 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Any, Optional, Tuple
+
+# Flax and JAX sharding imports
+import jax
+from flax import nnx
+
+from tpu_inference.layers.jax.attention.attention import (AttentionMetadata,
+                                                          KVCache)
+from tpu_inference.layers.jax.layers import DenseFFW
+from tpu_inference.layers.jax.moe.moe import MoE
+
+
+@dataclass(kw_only=True)
+class TransformerBlock(nnx.Module):
+    """
+    A heavyweight module which serves as the stateful live block in serving.
+
+    custom_module can be either a dense module (i.e., DenseFFW) or MoE.
+    """
+    pre_attention_norm: nnx.Module
+    pre_mlp_norm: nnx.Module
+    custom_module: Optional[nnx.Module] = None
+    attn: nnx.Module
+    use_attention_rope: bool = True
+    quant: Any | None = None
+
+    def __call__(
+        self, x_TD: jax.Array, is_prefill: bool, kv_cache: KVCache,
+        attention_metadata: AttentionMetadata
+    ) -> Tuple[KVCache, jax.Array]:
+        # Attn Block
+        attn_residual_TD = x_TD
+        x_TD = self.pre_attention_norm(x_TD)
+        new_cache, attn_output_TD = self.attn(x_TD, is_prefill, kv_cache,
+                                              attention_metadata,
+                                              self.use_attention_rope)
+        attn_output_TD += attn_residual_TD
+
+        # FFW Block
+        ffw_residual_TD = attn_output_TD
+        normed_ffw_input_TD = self.pre_mlp_norm(attn_output_TD)
+        logits_TD = self.custom_module(normed_ffw_input_TD)
+        logits_TD += ffw_residual_TD
+        return new_cache, logits_TD
+
+
+@dataclass(kw_only=True)
+class SharedExpertsTransformerBlock(TransformerBlock):
+    """A modified TransformerBlock that sums the MoE layer output with the shared expert output.
+
+    Users can provide the FFW layer in two ways:
+    1. Pass the module (either `MoE` or `DenseFFW`) to the `custom_module`
+       attribute.
+    2. Specify the `moe_ffw` or `dense_ffw` attributes
+       (e.g., for passing quantized modules).
+
+    Attributes:
+        moe_ffw: Optional MoE layer.
+        dense_ffw: Optional DenseFFW layer.
+        shared_experts: Optional shared experts module, used if MoE is enabled.
+
+    If an `MoE` layer is used (from either path), its output is summed
+    with the output of the `shared_experts` module.
+    """
+
+    moe_ffw: Optional[MoE] = None
+    dense_ffw: Optional[DenseFFW] = None
+    shared_experts: Optional[DenseFFW] = None
+
+    def __call__(self, x_TD, is_prefill, kv_cache, attention_metadata):
+        # Attn Block
+        attn_residual_TD = x_TD
+        x_TD = self.pre_attention_norm(x_TD)
+        new_cache, attn_output_TD = self.attn(x_TD, is_prefill, kv_cache,
+                                              attention_metadata,
+                                              self.use_attention_rope)
+        attn_output_TD += attn_residual_TD
+
+        # FFW Block
+        ffw_residual_TD = attn_output_TD
+        normed_ffw_input_TD = self.pre_mlp_norm(attn_output_TD)
+
+        if isinstance(self.custom_module, MoE):
+            moe_layer = self.custom_module
+        else:
+            moe_layer = self.moe_ffw
+
+        if isinstance(self.custom_module, DenseFFW):
+            dense_layer = self.custom_module
+        else:
+            dense_layer = self.dense_ffw
+
+        if moe_layer is not None:
+            logits_TD = moe_layer(normed_ffw_input_TD)
+            # Add the shared expert outputs to the MoE outputs.
+            shared_expert_output_TD = self.shared_experts(normed_ffw_input_TD)
+            logits_TD += shared_expert_output_TD
+        elif dense_layer is not None:
+            logits_TD = dense_layer(normed_ffw_input_TD)
+        else:
+            raise ValueError(
+                "Neither custom_module, moe_ffw nor dense_ffw attribute is set for this SharedExpertsTransformerBlock!"
+            )
+
+        logits_TD += ffw_residual_TD
+        return new_cache, logits_TD
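Both `__call__` implementations follow the same pre-norm residual wiring: normalize, run the sub-layer, add the input back, once for attention and once for the FFW (dense or MoE) path. The sketch below is a self-contained illustration of that wiring with plain JAX functions standing in for the nnx modules; it is not the package's API.

import jax
import jax.numpy as jnp

def rms_norm(x, eps=1e-6):
    # Simple RMS normalization as a stand-in for the pre-norm modules.
    return x * jax.lax.rsqrt(jnp.mean(x * x, axis=-1, keepdims=True) + eps)

def block(x_TD, attn_fn, ffw_fn):
    # Attn block: pre-norm, attention, residual add.
    attn_out_TD = attn_fn(rms_norm(x_TD)) + x_TD
    # FFW block: pre-norm, feed-forward (dense, MoE, or MoE plus shared
    # experts), residual add.
    return ffw_fn(rms_norm(attn_out_TD)) + attn_out_TD

x = jnp.ones((4, 16))                       # [tokens T, hidden D]
identity = lambda h: h                      # stand-ins for attn / FFW modules
print(block(x, identity, identity).shape)   # (4, 16)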
@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
tpu_inference/layers/vllm/attention.py
@@ -0,0 +1,221 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import functools
+from typing import Optional, Tuple
+
+import jax
+import jax.numpy as jnp
+import torch
+from jax.sharding import Mesh
+from torchax.interop import jax_view, torch_view
+from torchax.ops.mappings import t2j
+from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
+                                              AttentionLayer, AttentionType)
+
+from tpu_inference import utils
+from tpu_inference.layers.common.attention_interface import attention
+from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.layers.common.quantization import quantize_kv
+from tpu_inference.logger import init_logger
+from tpu_inference.models.vllm.vllm_model_wrapper_context import \
+    get_vllm_model_wrapper_context
+
+logger = init_logger(__name__)
+
+
+class PallasAttentionBackend(AttentionBackend):
+
+    @staticmethod
+    def get_name() -> str:
+        return "PALLAS"
+
+    @staticmethod
+    def get_impl_cls() -> type["PallasAttentionBackendImpl"]:
+        return PallasAttentionBackendImpl
+
+
+class PallasAttentionBackendImpl(AttentionImpl):
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
+        kv_cache_dtype: str,
+        logits_soft_cap: float | None = None,
+        attn_type: AttentionType = AttentionType.DECODER,
+        kv_sharing_target_layer_name: str | None = None,
+        sinks: torch.Tensor | None = None,
+    ) -> None:
+        self.num_heads = num_heads
+        self.head_size = head_size
+        self.scale = float(scale)
+        self.num_kv_heads = num_kv_heads
+        self.sliding_window = sliding_window
+        self.logits_soft_cap = logits_soft_cap
+        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
+
+        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
+        if alibi_slopes is not None:
+            raise NotImplementedError("Alibi slopes is not supported.")
+        self.kv_cache_quantized_dtype = None
+        if kv_cache_dtype != "auto":
+            self.kv_cache_quantized_dtype = utils.get_jax_dtype_from_str_dtype(
+                kv_cache_dtype)
+
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "PallasAttentionBackendImpl")
+
+        self.sinks = sinks
+        if self.sinks is not None:
+            assert self.sinks.shape[0] == num_heads, (
+                "Sinks must have the same number of heads as the number of "
+                "heads in the layer")
+
+    def process_weights_after_loading(self, act_dtype: torch.dtype):
+        # TODO(kyuyeunk): Shard the sinks along the num_heads dim.
+        if self.sinks is not None:
+            sinks = t2j(self.sinks, use_dlpack=False)
+            sinks = torch_view(sinks.astype(jnp.float32))
+            self.sinks = torch.nn.Parameter(sinks, requires_grad=False)
+
+    def forward(
+        self,
+        layer: AttentionLayer,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+        output: Optional[torch.Tensor] = None,
+        output_scale: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if output_scale is not None:
+            raise NotImplementedError(
+                "fused output quantization is not yet supported for "
+                "PallasAttentionBackendImpl")
+
+        if kv_cache.numel():
+            raise RuntimeError(
+                "KV cache from vLLM Attention layer should be empty but has "
+                f"the size of {kv_cache.numel()}.")
+
+        del kv_cache  # Use kv_cache from vllm wrapper context values instead.
+
+        vllm_model_wrapper_context = get_vllm_model_wrapper_context()
+        kv_cache_index = vllm_model_wrapper_context.layer_name_to_kvcache_index[
+            layer.layer_name]
+        kv_cache = vllm_model_wrapper_context.kv_caches[kv_cache_index]
+
+        mesh = vllm_model_wrapper_context.mesh
+
+        query, key, value = jax_view(query), jax_view(key), jax_view(value)
+        q_scale = k_scale = v_scale = None
+        if self.kv_cache_quantized_dtype:
+            key, value = quantize_kv(self.kv_cache_quantized_dtype, key, value,
+                                     layer._k_scale_float,
+                                     layer._v_scale_float)
+            # TODO(kyuyeunk): Enable w8a8 when VREG spill issue is resolved.
+            # q_scale = layer._q_scale_float
+            k_scale = layer._k_scale_float
+            v_scale = layer._v_scale_float
+
+        sinks = jax_view(self.sinks)
+
+        new_kv_cache, outputs = _jax_attn_func(
+            kv_cache,
+            query,
+            key,
+            value,
+            sinks,
+            attn_metadata,
+            mesh,
+            self.scale,
+            self.head_size,
+            self.num_heads,
+            self.num_kv_heads,
+            q_scale,
+            k_scale,
+            v_scale,
+            self.sliding_window,
+        )
+        vllm_model_wrapper_context.kv_caches[kv_cache_index] = new_kv_cache
+
+        return torch_view(outputs)
+
+
+@functools.partial(
+    jax.jit,
+    static_argnames=(
+        "mesh",
+        "scale",
+        "head_size",
+        "num_heads",
+        "num_kv_heads",
+        "q_scale",
+        "k_scale",
+        "v_scale",
+        "sliding_window",
+    ),
+    donate_argnames=("kv_cache", ),
+)
+def _jax_attn_func(
+    kv_cache: jax.Array,
+    q: jax.Array,
+    k: jax.Array,
+    v: jax.Array,
+    sinks: jax.Array | None,
+    attention_metadata: AttentionMetadata,
+    mesh: Mesh,
+    scale: float,
+    head_size: int,
+    num_heads: int,
+    num_kv_heads: int,
+    q_scale: float | None = None,
+    k_scale: float | None = None,
+    v_scale: float | None = None,
+    sliding_window: int | None = None,
+) -> Tuple[jax.Array, jax.Array]:
+    del scale  # Unused for now, as the attention function applies a default scale.
+
+    # Get shapes from vLLM.
+    q_len, q_compute_dim = q.shape
+    k_len, k_compute_dim = k.shape
+    assert k.shape == v.shape
+    assert q_compute_dim == head_size * num_heads
+    assert k_compute_dim == head_size * num_kv_heads
+
+    # Convert the shapes from vLLM's convention to what the attention function expects.
+    # q: (q_len, num_heads, head_size)
+    q = q.reshape(q_len, num_heads, head_size)
+    # k, v: (k_len, num_kv_heads, head_size)
+    k = k.reshape(k_len, num_kv_heads, head_size)
+    v = v.reshape(k_len, num_kv_heads, head_size)
+
+    new_kv_cache, outputs = attention(
+        kv_cache,
+        q,
+        k,
+        v,
+        attention_metadata,
+        mesh,
+        q_scale=q_scale,
+        k_scale=k_scale,
+        v_scale=v_scale,
+        sinks=sinks,
+        attention_chunk_size=sliding_window,
+    )
+
+    # Convert the shape back to vLLM's convention.
+    assert outputs.shape[0] == q_len
+    assert outputs.shape[1] == num_heads
+    assert outputs.shape[2] == head_size
+    outputs = outputs.reshape(q_len, q_compute_dim)
+
+    return new_kv_cache, outputs
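`_jax_attn_func` is jitted with the scalar attention configuration passed as static arguments and the KV cache donated, and it reshapes between vLLM's flattened `[num_tokens, num_heads * head_size]` layout and the `[num_tokens, num_heads, head_size]` layout the shared `attention` helper expects. The toy function below sketches that jit/donation/reshape pattern with made-up shapes and a trivial cache write in place of the real kernel; it is not the package's implementation.

import functools

import jax
import jax.numpy as jnp

@functools.partial(jax.jit,
                   static_argnames=("num_heads", "head_size"),
                   donate_argnames=("kv_cache", ))
def toy_attn(kv_cache, q_flat, num_heads, head_size):
    num_tokens = q_flat.shape[0]
    # Unflatten from vLLM's [num_tokens, num_heads * head_size] convention.
    q = q_flat.reshape(num_tokens, num_heads, head_size)
    # Stand-in for the attention kernel: write q into the donated cache and
    # echo it back as the "output".
    new_kv_cache = kv_cache.at[:num_tokens].set(q)
    # Flatten back to vLLM's convention before returning.
    return new_kv_cache, q.reshape(num_tokens, num_heads * head_size)

cache = jnp.zeros((16, 4, 8))
new_cache, out = toy_attn(cache, jnp.ones((6, 32)), num_heads=4, head_size=8)
print(out.shape)  # (6, 32)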