tpu-inference 0.12.0.dev20251222__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (260)
  1. tests/__init__.py +13 -0
  2. tests/core/__init__.py +13 -0
  3. tests/core/test_core_tpu.py +513 -0
  4. tests/core/test_disagg_executor.py +60 -0
  5. tests/core/test_disagg_utils.py +67 -0
  6. tests/core/test_dp_scheduler.py +724 -0
  7. tests/core/test_init.py +63 -0
  8. tests/distributed/__init__.py +13 -0
  9. tests/distributed/test_distributed_utils.py +120 -0
  10. tests/distributed/test_tpu_connector.py +478 -0
  11. tests/e2e/__init__.py +13 -0
  12. tests/e2e/test_async_scheduler.py +211 -0
  13. tests/e2e/test_data_parallel.py +393 -0
  14. tests/e2e/test_local_disagg.py +257 -0
  15. tests/e2e/test_model_loader.py +268 -0
  16. tests/e2e/test_multi_modal_inference.py +111 -0
  17. tests/e2e/test_pipeline_parallel.py +265 -0
  18. tests/e2e/test_runai_model_streamer_loader.py +104 -0
  19. tests/e2e/test_sampling_params.py +269 -0
  20. tests/e2e/test_speculative_decoding.py +291 -0
  21. tests/e2e/test_structured_decoding.py +46 -0
  22. tests/executors/__init__.py +13 -0
  23. tests/executors/test_ray_distributed_executor.py +199 -0
  24. tests/experimental/__init__.py +13 -0
  25. tests/experimental/test_llama3_jax_stashed.py +208 -0
  26. tests/kernels/__init__.py +13 -0
  27. tests/kernels/collectives/__init__.py +13 -0
  28. tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
  29. tests/kernels/fused_moe_v1_test.py +388 -0
  30. tests/kernels/gmm_test.py +205 -0
  31. tests/kernels/mla_v1_test.py +498 -0
  32. tests/kernels/quantized_matmul_kernel_test.py +159 -0
  33. tests/kernels/ragged_kv_cache_update_v2_test.py +248 -0
  34. tests/kernels/ragged_paged_attention_kernel_v2_test.py +414 -0
  35. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +565 -0
  36. tests/kernels/ragged_paged_attention_kernel_v3_test.py +520 -0
  37. tests/layers/__init__.py +13 -0
  38. tests/layers/common/__init__.py +13 -0
  39. tests/layers/common/test_attention_interface.py +156 -0
  40. tests/layers/common/test_quantization.py +149 -0
  41. tests/layers/jax/__init__.py +13 -0
  42. tests/layers/jax/attention/__init__.py +13 -0
  43. tests/layers/jax/attention/test_common_attention.py +103 -0
  44. tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
  45. tests/layers/jax/attention/test_llama4_attention.py +135 -0
  46. tests/layers/jax/moe/__init__.py +13 -0
  47. tests/layers/jax/moe/test_deepseek_moe.py +235 -0
  48. tests/layers/jax/sample/__init__.py +13 -0
  49. tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
  50. tests/layers/jax/sample/test_sampling.py +115 -0
  51. tests/layers/jax/sample/test_sampling_metadata.py +254 -0
  52. tests/layers/jax/test_layers.py +155 -0
  53. tests/layers/jax/test_qwix.py +969 -0
  54. tests/layers/jax/test_rope.py +93 -0
  55. tests/layers/jax/test_sharding.py +159 -0
  56. tests/layers/jax/test_transformer_block.py +152 -0
  57. tests/layers/vllm/__init__.py +13 -0
  58. tests/layers/vllm/test_attention.py +363 -0
  59. tests/layers/vllm/test_awq.py +405 -0
  60. tests/layers/vllm/test_compressed_tensors_moe.py +202 -0
  61. tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +403 -0
  62. tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +426 -0
  63. tests/layers/vllm/test_fp8.py +17 -0
  64. tests/layers/vllm/test_mxfp4.py +297 -0
  65. tests/layers/vllm/test_unquantized.py +621 -0
  66. tests/layers/vllm/utils.py +72 -0
  67. tests/lora/__init__.py +13 -0
  68. tests/lora/conftest.py +46 -0
  69. tests/lora/test_bgmv.py +57 -0
  70. tests/lora/test_layers.py +666 -0
  71. tests/lora/test_lora.py +147 -0
  72. tests/lora/test_lora_perf.py +67 -0
  73. tests/lora/utils.py +88 -0
  74. tests/models/__init__.py +13 -0
  75. tests/models/common/__init__.py +13 -0
  76. tests/models/common/test_model_loader.py +455 -0
  77. tests/models/jax/__init__.py +13 -0
  78. tests/models/jax/test_deepseek_v3.py +401 -0
  79. tests/models/jax/test_llama3.py +184 -0
  80. tests/models/jax/test_llama4.py +298 -0
  81. tests/models/jax/test_llama_eagle3.py +197 -0
  82. tests/models/jax/test_llama_guard_4.py +242 -0
  83. tests/models/jax/test_qwen2.py +172 -0
  84. tests/models/jax/test_qwen2_5_vl.py +606 -0
  85. tests/models/jax/test_qwen3.py +169 -0
  86. tests/models/jax/test_weight_loading.py +180 -0
  87. tests/models/jax/utils/__init__.py +13 -0
  88. tests/models/jax/utils/test_multi_modal_utils.py +212 -0
  89. tests/platforms/__init__.py +13 -0
  90. tests/platforms/test_tpu_platform.py +54 -0
  91. tests/runner/__init__.py +13 -0
  92. tests/runner/test_block_table.py +395 -0
  93. tests/runner/test_input_batch.py +226 -0
  94. tests/runner/test_kv_cache.py +220 -0
  95. tests/runner/test_kv_cache_manager.py +498 -0
  96. tests/runner/test_multimodal_manager.py +429 -0
  97. tests/runner/test_persistent_batch_manager.py +84 -0
  98. tests/runner/test_speculative_decoding_manager.py +368 -0
  99. tests/runner/test_structured_decoding_manager.py +220 -0
  100. tests/runner/test_tpu_runner.py +202 -0
  101. tests/runner/test_tpu_runner_dp.py +1033 -0
  102. tests/runner/test_tpu_runner_mesh.py +200 -0
  103. tests/runner/test_utils.py +411 -0
  104. tests/spec_decode/__init__.py +13 -0
  105. tests/spec_decode/test_eagle3.py +311 -0
  106. tests/test_base.py +215 -0
  107. tests/test_envs.py +280 -0
  108. tests/test_tpu_info.py +134 -0
  109. tests/test_utils.py +193 -0
  110. tests/worker/__init__.py +13 -0
  111. tests/worker/tpu_worker_test.py +414 -0
  112. tpu_inference/__init__.py +67 -0
  113. tpu_inference/core/__init__.py +13 -0
  114. tpu_inference/core/core_tpu.py +786 -0
  115. tpu_inference/core/disagg_executor.py +118 -0
  116. tpu_inference/core/disagg_utils.py +49 -0
  117. tpu_inference/core/sched/__init__.py +13 -0
  118. tpu_inference/core/sched/dp_scheduler.py +814 -0
  119. tpu_inference/distributed/__init__.py +13 -0
  120. tpu_inference/distributed/jax_parallel_state.py +81 -0
  121. tpu_inference/distributed/tpu_connector.py +732 -0
  122. tpu_inference/distributed/utils.py +112 -0
  123. tpu_inference/env_override.py +9 -0
  124. tpu_inference/envs.py +191 -0
  125. tpu_inference/executors/__init__.py +13 -0
  126. tpu_inference/executors/ray_distributed_executor.py +399 -0
  127. tpu_inference/experimental/__init__.py +13 -0
  128. tpu_inference/experimental/llama3_jax_stashed.py +272 -0
  129. tpu_inference/kernels/__init__.py +13 -0
  130. tpu_inference/kernels/collectives/__init__.py +13 -0
  131. tpu_inference/kernels/collectives/all_gather_matmul.py +741 -0
  132. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +65 -0
  133. tpu_inference/kernels/collectives/util.py +47 -0
  134. tpu_inference/kernels/flash_attention/__init__.py +13 -0
  135. tpu_inference/kernels/flash_attention/kernel.py +772 -0
  136. tpu_inference/kernels/fused_moe/__init__.py +13 -0
  137. tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
  138. tpu_inference/kernels/fused_moe/v1/kernel.py +1612 -0
  139. tpu_inference/kernels/megablox/__init__.py +13 -0
  140. tpu_inference/kernels/megablox/common.py +54 -0
  141. tpu_inference/kernels/megablox/gmm.py +646 -0
  142. tpu_inference/kernels/mla/__init__.py +13 -0
  143. tpu_inference/kernels/mla/v1/__init__.py +13 -0
  144. tpu_inference/kernels/mla/v1/kernel.py +1340 -0
  145. tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
  146. tpu_inference/kernels/quantized_matmul/kernel.py +456 -0
  147. tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
  148. tpu_inference/kernels/quantized_matmul/util.py +58 -0
  149. tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
  150. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
  151. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +876 -0
  152. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +288 -0
  153. tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
  154. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
  155. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1594 -0
  156. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +1586 -0
  157. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +4460 -0
  158. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +548 -0
  159. tpu_inference/kernels/ragged_paged_attention/v3/util.py +65 -0
  160. tpu_inference/layers/__init__.py +13 -0
  161. tpu_inference/layers/common/__init__.py +13 -0
  162. tpu_inference/layers/common/attention_interface.py +403 -0
  163. tpu_inference/layers/common/attention_metadata.py +48 -0
  164. tpu_inference/layers/common/binary_search.py +295 -0
  165. tpu_inference/layers/common/quant_methods.py +23 -0
  166. tpu_inference/layers/common/quantization.py +270 -0
  167. tpu_inference/layers/common/sharding.py +600 -0
  168. tpu_inference/layers/jax/__init__.py +13 -0
  169. tpu_inference/layers/jax/attention/__init__.py +13 -0
  170. tpu_inference/layers/jax/attention/attention.py +268 -0
  171. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +547 -0
  172. tpu_inference/layers/jax/attention/gpt_oss_attention.py +275 -0
  173. tpu_inference/layers/jax/attention/llama4_attention.py +167 -0
  174. tpu_inference/layers/jax/base.py +165 -0
  175. tpu_inference/layers/jax/constants.py +101 -0
  176. tpu_inference/layers/jax/layers.py +315 -0
  177. tpu_inference/layers/jax/misc.py +30 -0
  178. tpu_inference/layers/jax/moe/__init__.py +13 -0
  179. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +615 -0
  180. tpu_inference/layers/jax/moe/gpt_oss_moe.py +199 -0
  181. tpu_inference/layers/jax/moe/moe.py +249 -0
  182. tpu_inference/layers/jax/pp_utils.py +53 -0
  183. tpu_inference/layers/jax/rope.py +294 -0
  184. tpu_inference/layers/jax/rope_interface.py +228 -0
  185. tpu_inference/layers/jax/sample/__init__.py +13 -0
  186. tpu_inference/layers/jax/sample/rejection_sampler.py +528 -0
  187. tpu_inference/layers/jax/sample/sampling.py +110 -0
  188. tpu_inference/layers/jax/sample/sampling_metadata.py +90 -0
  189. tpu_inference/layers/jax/transformer_block.py +121 -0
  190. tpu_inference/layers/vllm/__init__.py +13 -0
  191. tpu_inference/layers/vllm/attention.py +221 -0
  192. tpu_inference/layers/vllm/fused_moe.py +502 -0
  193. tpu_inference/layers/vllm/linear_common.py +221 -0
  194. tpu_inference/layers/vllm/quantization/__init__.py +55 -0
  195. tpu_inference/layers/vllm/quantization/awq.py +221 -0
  196. tpu_inference/layers/vllm/quantization/common.py +124 -0
  197. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
  198. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +135 -0
  199. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +266 -0
  200. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
  201. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +222 -0
  202. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +150 -0
  203. tpu_inference/layers/vllm/quantization/fp8.py +118 -0
  204. tpu_inference/layers/vllm/quantization/mxfp4.py +396 -0
  205. tpu_inference/layers/vllm/quantization/unquantized.py +416 -0
  206. tpu_inference/layers/vllm/sharding.py +244 -0
  207. tpu_inference/logger.py +10 -0
  208. tpu_inference/lora/__init__.py +13 -0
  209. tpu_inference/lora/torch_lora_ops.py +98 -0
  210. tpu_inference/lora/torch_punica_tpu.py +310 -0
  211. tpu_inference/models/__init__.py +13 -0
  212. tpu_inference/models/common/__init__.py +13 -0
  213. tpu_inference/models/common/model_loader.py +520 -0
  214. tpu_inference/models/jax/__init__.py +13 -0
  215. tpu_inference/models/jax/deepseek_v3.py +978 -0
  216. tpu_inference/models/jax/gpt_oss.py +508 -0
  217. tpu_inference/models/jax/jax_intermediate_tensor.py +93 -0
  218. tpu_inference/models/jax/llama3.py +436 -0
  219. tpu_inference/models/jax/llama4.py +643 -0
  220. tpu_inference/models/jax/llama_eagle3.py +350 -0
  221. tpu_inference/models/jax/llama_guard_4.py +375 -0
  222. tpu_inference/models/jax/qwen2.py +390 -0
  223. tpu_inference/models/jax/qwen2_5_vl.py +1232 -0
  224. tpu_inference/models/jax/qwen3.py +318 -0
  225. tpu_inference/models/jax/utils/__init__.py +13 -0
  226. tpu_inference/models/jax/utils/file_utils.py +110 -0
  227. tpu_inference/models/jax/utils/multi_modal_utils.py +177 -0
  228. tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
  229. tpu_inference/models/jax/utils/qwix/qwix_utils.py +713 -0
  230. tpu_inference/models/jax/utils/weight_utils.py +621 -0
  231. tpu_inference/models/vllm/__init__.py +13 -0
  232. tpu_inference/models/vllm/vllm_model_wrapper.py +307 -0
  233. tpu_inference/models/vllm/vllm_model_wrapper_context.py +59 -0
  234. tpu_inference/platforms/__init__.py +16 -0
  235. tpu_inference/platforms/tpu_platform.py +258 -0
  236. tpu_inference/runner/__init__.py +13 -0
  237. tpu_inference/runner/block_table.py +122 -0
  238. tpu_inference/runner/compilation_manager.py +890 -0
  239. tpu_inference/runner/input_batch.py +435 -0
  240. tpu_inference/runner/kv_cache.py +166 -0
  241. tpu_inference/runner/kv_cache_manager.py +508 -0
  242. tpu_inference/runner/lora_utils.py +106 -0
  243. tpu_inference/runner/multimodal_manager.py +231 -0
  244. tpu_inference/runner/persistent_batch_manager.py +296 -0
  245. tpu_inference/runner/speculative_decoding_manager.py +262 -0
  246. tpu_inference/runner/structured_decoding_manager.py +101 -0
  247. tpu_inference/runner/tpu_runner.py +1768 -0
  248. tpu_inference/runner/utils.py +426 -0
  249. tpu_inference/spec_decode/__init__.py +13 -0
  250. tpu_inference/spec_decode/jax/__init__.py +13 -0
  251. tpu_inference/spec_decode/jax/eagle3.py +430 -0
  252. tpu_inference/tpu_info.py +92 -0
  253. tpu_inference/utils.py +345 -0
  254. tpu_inference/worker/__init__.py +13 -0
  255. tpu_inference/worker/tpu_worker.py +468 -0
  256. tpu_inference-0.12.0.dev20251222.dist-info/METADATA +106 -0
  257. tpu_inference-0.12.0.dev20251222.dist-info/RECORD +260 -0
  258. tpu_inference-0.12.0.dev20251222.dist-info/WHEEL +5 -0
  259. tpu_inference-0.12.0.dev20251222.dist-info/licenses/LICENSE +201 -0
  260. tpu_inference-0.12.0.dev20251222.dist-info/top_level.txt +2 -0
tpu_inference/spec_decode/jax/eagle3.py
@@ -0,0 +1,430 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Implements the Eagle3 proposer for speculative decoding on JAX/TPU."""
+ import functools
+ from dataclasses import replace
+ from typing import Any, Optional
+
+ import jax
+ import jax.numpy as jnp
+ import numpy as np
+ from flax import nnx
+ from jax import lax
+ from jax.sharding import NamedSharding, PartitionSpec
+ from vllm.config import VllmConfig
+
+ from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+ from tpu_inference.logger import init_logger
+ from tpu_inference.models.common.model_loader import get_model
+ from tpu_inference.runner import utils as runner_utils
+ from tpu_inference.utils import device_array
+
+ logger = init_logger(__name__)
+
+
+ class Eagle3Proposer:
+     """A proposer for speculative decoding using the Eagle3 method.
+
+     This class is responsible for loading the draft model and generating
+     draft tokens based on the target model's outputs.
+     """
+
+     def __init__(
+         self,
+         vllm_config: VllmConfig,
+         runner: Any,  # TPUModelRunner
+     ):
+         """Initializes the Eagle3Proposer.
+
+         Args:
+             vllm_config: The vLLM configuration.
+             runner: The TPUModelRunner instance.
+         """
+         self.vllm_config = vllm_config
+         self.speculative_config = vllm_config.speculative_config
+         assert self.speculative_config is not None
+         self.draft_model_config = self.speculative_config.draft_model_config
+         self.method = self.speculative_config.method
+
+         self.runner = runner
+         self.mesh = runner.mesh
+         self.num_speculative_tokens = (
+             self.speculative_config.num_speculative_tokens)
+         self.block_size = vllm_config.cache_config.block_size
+         self.rng_key = jax.random.key(self.vllm_config.model_config.seed)
+         self.max_num_tokens = runner.max_num_tokens
+         self.token_arange = jnp.arange(self.max_num_tokens)
+
+     def load_model(self, target_model: Any) -> None:
+         """Loads the draft model and resolves embedding sharing."""
+         self.model_fn, self.compute_logits_fn, self.combine_hidden_states_fn, _, self.state, _, _ = get_model(
+             self.vllm_config, self.rng_key, self.mesh, is_draft_model=True)
+
+         draft_embed_tokens = getattr(self.state.model, 'embed_tokens', None)
+         if draft_embed_tokens is None or not jnp.any(
+                 draft_embed_tokens.embedding):
+             logger.info(
+                 "Draft model has no embedding weights. Setting the draft "
+                 "model's embed_tokens to the target model's embed.")
+             self.state.model.embed_tokens = target_model.model.embed
+         elif jnp.array_equal(draft_embed_tokens.embedding,
+                              target_model.model.embed.embedding):
+             logger.info(
+                 "Draft model's embed_tokens is identical to the target "
+                 "model's embed. Sharing the embedding.")
+             self.state.model.embed_tokens = target_model.model.embed
+         else:
+             logger.info("Draft model has its own embed_tokens.")
+
+     @functools.partial(jax.jit, static_argnums=(0, ))
+     def _prepare_input_ids(
+             self, query_start_loc: jax.Array, target_token_ids: jax.Array,
+             next_token_ids: jax.Array,
+             num_reqs: jax.Array) -> tuple[jnp.ndarray, jnp.ndarray]:
+         """JIT-compiled helper for preparing the input IDs for the draft model."""
+
+         last_token_indices = query_start_loc[1:] - 1
+         # Shift the input ids left by one token.
+         rolled_input_ids = jnp.roll(target_token_ids, -1, axis=0)
+
+         # To make the update JIT-compatible with a dynamic `num_reqs`, we
+         # perform a scatter update of a static size, using a mask to handle
+         # the dynamic part.
+         max_num_reqs = last_token_indices.shape[0]
+         mask = jnp.arange(max_num_reqs) < num_reqs
+
+         # For padded requests (where mask is False), we use the original
+         # value from the rolled array, making the update a no-op for them.
+         original_values_at_indices = rolled_input_ids[last_token_indices]
+         values_to_set = jnp.where(mask, next_token_ids,
+                                   original_values_at_indices)
+
+         input_ids = rolled_input_ids.at[last_token_indices].set(values_to_set)
+
+         return input_ids, last_token_indices
+
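To illustrate the JIT-friendly masked scatter above, here is a minimal standalone sketch with made-up shapes (two active requests padded to four slots); it is an editorial restatement of the same trick, not part of the package:

    import jax.numpy as jnp

    # Hypothetical padded batch: 4 request slots, only the first 2 are active.
    query_start_loc = jnp.array([0, 3, 5, 6, 7])
    target_token_ids = jnp.array([11, 12, 13, 21, 22, 0, 0])
    next_token_ids = jnp.array([100, 200, 0, 0])
    num_reqs = jnp.array(2)

    last_token_indices = query_start_loc[1:] - 1                # [2, 4, 5, 6]
    rolled = jnp.roll(target_token_ids, -1, axis=0)
    mask = jnp.arange(last_token_indices.shape[0]) < num_reqs   # [T, T, F, F]
    # Padded slots write back their current values, so the scatter keeps a
    # static shape but is a no-op for inactive requests.
    values = jnp.where(mask, next_token_ids, rolled[last_token_indices])
    input_ids = rolled.at[last_token_indices].set(values)
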
+     @functools.partial(jax.jit, static_argnums=(0, ))
+     def _update_inputs_for_loop_speculation(
+         self, positions: jax.Array, seq_lens: jax.Array,
+         block_tables: jax.Array
+     ) -> tuple[jax.Array, jax.Array, jax.Array, jax.Array, jax.Array]:
+         """JIT-compiled helper for updating inputs inside the speculation loop."""
+
+         positions += 1
+         exceeds_max_model_len = positions >= self.runner.max_model_len
+         clamped_positions = jnp.where(exceeds_max_model_len, 0, positions)
+
+         new_seq_lens = seq_lens + 1
+         new_seq_lens = jnp.minimum(new_seq_lens, self.runner.max_model_len)
+         new_seq_lens = jnp.where(exceeds_max_model_len, 1, new_seq_lens)
+
+         num_reqs = seq_lens.shape[0]
+         query_start_loc = jnp.arange(num_reqs + 1)
+
+         # Compute the slot mapping.
+         # NOTE(woosuk): We should handle the case where the draft model
+         # generates tokens beyond the max model length. Since it is complex
+         # to remove such requests from the batch, we keep them in the batch
+         # but adjust the position ids and slot mappings to avoid the
+         # out-of-range access during the model execution. The draft tokens
+         # generated with this adjustment should be ignored.
+         max_num_blocks_per_req = block_tables.shape[0] // num_reqs
+         expanded_exceeds_mask = jnp.repeat(exceeds_max_model_len,
+                                            max_num_blocks_per_req)
+         new_block_tables = jnp.where(expanded_exceeds_mask, -1, block_tables)
+
+         positions = lax.with_sharding_constraint(
+             positions, NamedSharding(self.mesh, PartitionSpec(None, )))
+         clamped_positions = lax.with_sharding_constraint(
+             clamped_positions, NamedSharding(self.mesh, PartitionSpec(None, )))
+         new_seq_lens = lax.with_sharding_constraint(
+             new_seq_lens, NamedSharding(self.mesh, PartitionSpec(None, )))
+         query_start_loc = lax.with_sharding_constraint(
+             query_start_loc, NamedSharding(self.mesh, PartitionSpec()))
+         new_block_tables = lax.with_sharding_constraint(
+             new_block_tables, NamedSharding(self.mesh, PartitionSpec(None, )))
+
+         return positions, clamped_positions, new_seq_lens, query_start_loc, new_block_tables
+
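A small numeric sketch of the clamping described in the NOTE above (invented values, not package code): a request that has hit the max model length keeps its batch slot, but its position is clamped to 0, its sequence length reset to 1, and its block-table entries set to -1, so no out-of-range access occurs; the draft tokens it produces are discarded later.

    import jax.numpy as jnp

    max_model_len = 8
    positions = jnp.array([3, 7]) + 1                     # -> [4, 8]
    exceeds = positions >= max_model_len                  # [False, True]
    clamped_positions = jnp.where(exceeds, 0, positions)  # [4, 0]
    new_seq_lens = jnp.minimum(jnp.array([4, 8]) + 1, max_model_len)
    new_seq_lens = jnp.where(exceeds, 1, new_seq_lens)    # [5, 1]
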
+     @functools.partial(jax.jit, static_argnums=(0, ))
+     def _stack_draft_token_ids(
+             self, draft_token_ids_list: list[jax.Array]) -> jnp.ndarray:
+         """JIT-compiled helper for stacking draft token IDs."""
+         return jnp.stack(draft_token_ids_list, axis=1)
+
+     @functools.partial(jax.jit, static_argnums=(0, ))
+     def _prepare_hidden_states_and_input_ids(
+         self,
+         state: nnx.State,
+         aux_hidden_states: tuple[jax.Array, ...],
+         query_start_loc: jax.Array,
+         target_token_ids: jax.Array,
+         next_token_ids: jax.Array,
+         num_reqs: jax.Array,
+     ) -> tuple[jax.Array, jax.Array, jax.Array]:
+         target_hidden_states = jnp.concatenate(aux_hidden_states, axis=-1)
+         target_hidden_states = self.combine_hidden_states_fn(
+             state, target_hidden_states)
+
+         input_ids, last_token_indices = self._prepare_input_ids(
+             query_start_loc, target_token_ids, next_token_ids, num_reqs)
+         # NOTE(pooyam): For now, we don't support multimodal.
+
+         return target_hidden_states, input_ids, last_token_indices
+
+     def prepare_inputs(
+         self,
+         attn_metadata: AttentionMetadata,
+         input_ids: jax.Array,
+         aux_hidden_states: tuple[jax.Array, ...],
+         next_token_ids: jax.Array,
+         num_rejected_tokens: Optional[jax.Array] = None,
+     ) -> tuple[jax.Array, jax.Array, jax.Array, AttentionMetadata]:
+         """Prepares drafter inputs based on the target model's forward outputs.
+
+         Mirrors the GPU reference logic, adapted to TPU/JAX types:
+         - When no rejection happened, select the first N scheduled tokens.
+         - When rejections happened, trim the per-request tail tokens and
+           update the attention metadata accordingly.
+         - Build the EAGLE3 hidden input by concatenating auxiliary hidden
+           states along the last dimension.
+
+         Returns updated AttentionMetadata (positions, query_start_loc,
+         seq_lens) along with the selected `target_token_ids` and
+         `target_hidden_states`.
+         """
+         assert aux_hidden_states is not None and len(aux_hidden_states) > 0, (
+             "EAGLE3 requires auxiliary hidden states from the target model.")
+
+         # The last KV cache group is for the draft model.
+         num_kv_cache_groups = len(self.runner.kv_cache_config.kv_cache_groups)
+         draft_kv_cache_group_id = num_kv_cache_groups - 1
+         block_tables = self.runner.input_batch.block_table[
+             draft_kv_cache_group_id].get_cpu_tensor().reshape(-1)
+         # Number of active requests in this step (un-padded count).
+         num_reqs = self.runner.input_batch.num_reqs
+
+         if num_rejected_tokens is None:
+             num_reqs = device_array(self.mesh,
+                                     np.asarray([num_reqs], dtype=jnp.int32))
+             attn_metadata = replace(attn_metadata,
+                                     block_tables=device_array(
+                                         self.mesh, block_tables))
+             target_hidden_states, input_ids, last_token_indices = self._prepare_hidden_states_and_input_ids(
+                 self.state, aux_hidden_states, attn_metadata.query_start_loc,
+                 input_ids, next_token_ids, num_reqs)
+             return target_hidden_states, input_ids, last_token_indices, attn_metadata
+
+         # Host copies from the metadata prepared by the runner.
+         query_start_loc_cpu = attn_metadata.query_start_loc_cpu
+         seq_lens_cpu = attn_metadata.seq_lens_cpu
+         assert query_start_loc_cpu is not None and seq_lens_cpu is not None
+
+         # Rejection-aware path: compute new per-request lengths and token
+         # indices. Convert to host numpy for efficient prefix-sum and
+         # repeat ops.
+         nrt_cpu = jax.device_get(num_rejected_tokens).astype("int32")
+
+         # query_len_per_req = [q1, q2, ...]
+         query_len_per_req = (query_start_loc_cpu[1:] -
+                              query_start_loc_cpu[:-1])
+
+         # query_start_loc_cpu, and consequently query_len_per_req, are
+         # padded; padded requests are assigned a query length of 1.
+         query_len_per_req[num_reqs:] = 1
+         # num_tokens_per_req = [q1 - n1, q2 - n2, ...]
+         num_tokens_per_req = (query_len_per_req - nrt_cpu)
+         assert (num_tokens_per_req >= 0).all(), (
+             "num_tokens_per_req must be non-negative")
+
+         # new_query_start_loc = [0, q1 - n1, q1 + q2 - n1 - n2, ...]
+         # Use numpy for cumsum and then convert back.
+         new_query_start_loc_cpu = np.zeros_like(query_start_loc_cpu)
+         np.cumsum(num_tokens_per_req, out=new_query_start_loc_cpu[1:])
+
+         # Build token indices selecting the kept tokens from each request.
+         total_num_tokens = int(new_query_start_loc_cpu[-1])
+
+         # Pad the total token count up to the next padding bucket.
+         padded_total_num_tokens = runner_utils.get_padded_token_len(
+             self.runner.num_tokens_paddings, total_num_tokens)
+         pad_width = padded_total_num_tokens - total_num_tokens
+         assert pad_width >= 0, (
+             f"total_num_tokens {total_num_tokens} exceeds "
+             f"num_tokens_paddings {self.runner.num_tokens_paddings}")
+
+         # Expand request starts: [0, 0, q1 - n1, ...]
+         expanded_new_query_start_loc = np.repeat(new_query_start_loc_cpu[:-1],
+                                                  num_tokens_per_req)
+         # Offsets within each request window: [0, 1, 2, 0, 1, 2, 3, ...]
+         token_offsets = np.arange(total_num_tokens, dtype=np.int32)
+         token_offsets -= expanded_new_query_start_loc
+         # Map into old flat indices by adding original request starts.
+         old_query_start_loc_expanded = np.repeat(query_start_loc_cpu[:-1],
+                                                  num_tokens_per_req)
+
+         token_indices_cpu = token_offsets + old_query_start_loc_expanded
+         token_indices_cpu = np.pad(token_indices_cpu, (0, pad_width),
+                                    "constant",
+                                    constant_values=0)
+         # Update seq_lens for active requests only: new_seq_lens = s - n.
+         new_seq_lens_cpu = seq_lens_cpu - nrt_cpu
+
+         query_start_loc, seq_lens, token_indices, num_reqs, block_tables = device_array(
+             self.mesh,
+             (new_query_start_loc_cpu, new_seq_lens_cpu, token_indices_cpu,
+              np.asarray([num_reqs], dtype=jnp.int32), block_tables))
+
+         attn_metadata = replace(attn_metadata, block_tables=block_tables)
+         return self._filter_token_and_prepare_initial_inputs(
+             self.state, token_indices, query_start_loc, seq_lens, input_ids,
+             aux_hidden_states, attn_metadata, next_token_ids, num_reqs)
+
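The cumsum/repeat construction of `token_indices_cpu` above is easiest to see with concrete numbers. A hedged NumPy walkthrough, assuming two requests with 3 and 4 scheduled tokens of which 1 and 2 are rejected (so the kept tokens are the first 2 of each request):

    import numpy as np

    query_start_loc = np.array([0, 3, 7])            # query lengths q = [3, 4]
    num_rejected = np.array([1, 2], dtype=np.int32)

    query_len = query_start_loc[1:] - query_start_loc[:-1]    # [3, 4]
    num_tokens = query_len - num_rejected                     # [2, 2] kept
    new_start = np.zeros_like(query_start_loc)
    np.cumsum(num_tokens, out=new_start[1:])                  # [0, 2, 4]

    total = int(new_start[-1])
    # Offsets within each new request window, then shift by old starts.
    offsets = np.arange(total) - np.repeat(new_start[:-1], num_tokens)
    token_indices = offsets + np.repeat(query_start_loc[:-1], num_tokens)
    # token_indices == [0, 1, 3, 4]: the surviving prefix of each request.
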
+     @functools.partial(jax.jit, static_argnums=(0, ))
+     def _filter_token_and_prepare_initial_inputs(
+         self,
+         state: nnx.State,
+         token_indices: jax.Array,
+         query_start_loc: jax.Array,
+         seq_lens: jax.Array,
+         input_ids: jax.Array,
+         aux_hidden_states: tuple[jax.Array, ...],
+         attn_metadata: AttentionMetadata,
+         next_token_ids: jax.Array,
+         num_reqs: jax.Array,
+     ) -> tuple[jax.Array, jax.Array, jax.Array, AttentionMetadata]:
+
+         # Select tokens and hidden states.
+         target_token_ids = input_ids[token_indices]
+         # Update positions to match the selected tokens.
+         if attn_metadata.input_positions.ndim == 2:
+             input_positions = attn_metadata.input_positions[:, token_indices]
+         else:
+             input_positions = attn_metadata.input_positions[token_indices]
+
+         attn_metadata = AttentionMetadata(
+             input_positions=input_positions,
+             block_tables=attn_metadata.block_tables,
+             seq_lens=seq_lens,
+             query_start_loc=query_start_loc,
+             request_distribution=attn_metadata.request_distribution,
+         )
+
+         target_hidden_states, input_ids, last_token_indices = self._prepare_hidden_states_and_input_ids(
+             state, [h[token_indices] for h in aux_hidden_states],
+             query_start_loc, target_token_ids, next_token_ids, num_reqs)
+
+         return target_hidden_states, input_ids, last_token_indices, attn_metadata
+
+     @functools.partial(jax.jit, static_argnums=(0, ))
+     def _select_draft_token_ids(
+         self,
+         state: nnx.State,
+         hidden_states: jax.Array,
+         last_token_indices: jax.Array,
+     ) -> jax.Array:
+         sample_hidden_states = hidden_states[last_token_indices]
+         sample_hidden_states = lax.with_sharding_constraint(
+             sample_hidden_states,
+             NamedSharding(self.mesh, PartitionSpec(None, None)))
+         return self._get_draft_token_ids(state, sample_hidden_states)
+
+     @functools.partial(jax.jit, static_argnums=(0, ))
+     def _get_draft_token_ids(self, state: nnx.State,
+                              hidden_states: jax.Array) -> jax.Array:
+         lora_metadata = None
+         logits = self.compute_logits_fn(state, hidden_states, lora_metadata)
+         draft_token_ids = jnp.argmax(logits, axis=-1)
+         return lax.with_sharding_constraint(
+             draft_token_ids, NamedSharding(self.mesh, PartitionSpec()))
+
+     @functools.partial(jax.jit, static_argnums=(0, ))
+     def _select_inputs_for_loop_speculation(
+         self, state: nnx.State, positions: jax.Array, residual: jax.Array,
+         hidden_states: jax.Array, last_token_indices: jax.Array
+     ) -> tuple[jax.Array, jax.Array, jax.Array]:
+         positions = positions[last_token_indices]
+         residual = residual[last_token_indices]
+         draft_token_ids = self._select_draft_token_ids(state, hidden_states,
+                                                        last_token_indices)
+
+         positions = lax.with_sharding_constraint(
+             positions, NamedSharding(self.mesh, PartitionSpec(None, )))
+         residual = lax.with_sharding_constraint(
+             residual, NamedSharding(self.mesh, PartitionSpec(None, None)))
+         draft_token_ids = lax.with_sharding_constraint(
+             draft_token_ids, NamedSharding(self.mesh, PartitionSpec()))
+
+         return positions, residual, draft_token_ids
+
+     def propose(
+         self,
+         kv_caches: list[jax.Array],
+         input_ids: jax.Array,
+         attn_metadata: AttentionMetadata,
+         last_token_indices: jax.Array,
+         target_hidden_states: jax.Array,
+     ) -> tuple[list[jax.Array], jnp.ndarray]:
+         """Proposes draft tokens using the draft model.
+
+         Returns:
+             A tuple containing the updated KV caches and a tensor of proposed
+             draft token IDs.
+         """
+
+         # input_ids and target_hidden_states for the first speculation step
+         # were already prepared in prepare_inputs() to improve performance.
+         kv_caches, hidden_states, residual = self.model_fn(
+             self.state,
+             kv_caches,
+             input_ids,
+             target_hidden_states,
+             attn_metadata,
+         )
+
+         if self.num_speculative_tokens == 1:
+             return kv_caches, self._select_draft_token_ids(
+                 self.state, hidden_states, last_token_indices)
+
+         positions, hidden_states, draft_token_ids = self._select_inputs_for_loop_speculation(
+             self.state, attn_metadata.input_positions, residual[0],
+             hidden_states, last_token_indices)
+
+         draft_token_ids_list = [draft_token_ids]
+
+         for _ in range(self.num_speculative_tokens - 1):
+             input_ids_loop = draft_token_ids_list[-1]
+
+             positions, clamped_positions, new_seq_lens, query_start_loc, new_block_tables = self._update_inputs_for_loop_speculation(
+                 positions, attn_metadata.seq_lens, attn_metadata.block_tables)
+
+             attn_metadata = replace(
+                 attn_metadata,
+                 input_positions=clamped_positions,
+                 seq_lens=new_seq_lens,
+                 query_start_loc=query_start_loc,
+                 block_tables=new_block_tables,
+             )
+             kv_caches, new_hidden_states, residual = self.model_fn(
+                 self.state,
+                 kv_caches,
+                 input_ids_loop,
+                 hidden_states,  # The hidden states from the previous step.
+                 attn_metadata,
+             )
+             hidden_states = residual[0]
+             draft_token_ids = self._get_draft_token_ids(
+                 self.state, new_hidden_states)
+             draft_token_ids_list.append(draft_token_ids)
+
+         # [batch_size, num_speculative_tokens]
+         draft_token_ids = self._stack_draft_token_ids(draft_token_ids_list)
+
+         return kv_caches, draft_token_ids
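
End to end, `propose()` runs the draft model once on the inputs built by `prepare_inputs()`, then `num_speculative_tokens - 1` more times, feeding each step's greedy pick back in as the next input. The stacked result has one row of draft tokens per request. A toy sketch of the final stacking, with invented values:

    import jax.numpy as jnp

    # Greedy picks from k = 3 draft steps for 2 requests.
    steps = [jnp.array([7, 8]), jnp.array([4, 5]), jnp.array([1, 2])]
    draft_token_ids = jnp.stack(steps, axis=1)
    # shape [2, 3]: row i holds request i's speculative tokens in order.
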
tpu_inference/tpu_info.py
@@ -0,0 +1,92 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import glob
+ import os
+ from typing import Optional
+
+ import requests
+
+ from tpu_inference import envs
+ from tpu_inference.logger import init_logger
+
+ logger = init_logger(__name__)
+
+ GCE_TPU_ACCELERATOR_ENDPOINT = (
+     "http://metadata.google.internal/computeMetadata/v1/instance/attributes/")
+ GCE_TPU_HEADERS = {"Metadata-Flavor": "Google"}
+
+
+ def get_tpu_metadata(key: str = "") -> Optional[str]:
+     try:
+         accelerator_type_request = requests.get(
+             os.path.join(GCE_TPU_ACCELERATOR_ENDPOINT, key),
+             headers=GCE_TPU_HEADERS,
+         )
+         if (accelerator_type_request.status_code == 200
+                 and accelerator_type_request.text):
+             return accelerator_type_request.text
+         else:
+             logger.error(
+                 "Unable to poll TPU GCE Metadata. Got "
+                 f"status code: {accelerator_type_request.status_code} and "
+                 f"content: {accelerator_type_request.text}")
+     except requests.RequestException as e:
+         logger.error("Unable to poll the TPU GCE Metadata: %s", e)
+     return None
+
+
+ def get_tpu_type() -> Optional[str]:
+     tpu_type = envs.TPU_ACCELERATOR_TYPE
+     if tpu_type is None:
+         tpu_type = get_tpu_metadata(key="accelerator-type")
+     return tpu_type
+
+
+ def get_node_name() -> Optional[str]:
+     tpu_name = envs.TPU_NAME
+     if not tpu_name:
+         tpu_name = get_tpu_metadata(key="instance-id")
+     return tpu_name
+
+
+ def get_node_worker_id() -> int:
+     """For a multi-host TPU VM, returns the worker id of the current node."""
+     worker_id = envs.TPU_WORKER_ID
+     if worker_id is None:
+         worker_id = get_tpu_metadata(key="agent-worker-number")
+     if worker_id is None:
+         return 0
+     return int(worker_id)
+
+
+ def get_num_cores_per_chip() -> int:
+     tpu_type = get_tpu_type()
+     if tpu_type and tpu_type.startswith(("v5litepod", "v6e")):
+         return 1
+     return 2
+
+
+ def get_num_chips() -> int:
+     accel_files = glob.glob("/dev/accel*")
+     if accel_files:
+         return len(accel_files)
+     try:
+         vfio_entries = os.listdir("/dev/vfio")
+         numeric_entries = [
+             int(entry) for entry in vfio_entries if entry.isdigit()
+         ]
+         return len(numeric_entries)
+     except FileNotFoundError as e:
+         logger.error("Failed to detect number of TPUs: %s", e)
+         return 0
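
A hedged usage sketch of these helpers. It assumes a GCE TPU VM, where `metadata.google.internal` is reachable; off GCE, the metadata calls log an error and the getters fall back to environment variables or `None`:

    from tpu_inference import tpu_info

    tpu_type = tpu_info.get_tpu_type()     # e.g. "v6e-8", from env or metadata
    num_chips = tpu_info.get_num_chips()   # counts /dev/accel* (or /dev/vfio) entries
    if tpu_type is not None:
        total_cores = num_chips * tpu_info.get_num_cores_per_chip()
        print(f"{tpu_type}: {num_chips} chips, {total_cores} cores on this host")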