tpu-inference 0.12.0.dev20251222__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (260)
  1. tests/__init__.py +13 -0
  2. tests/core/__init__.py +13 -0
  3. tests/core/test_core_tpu.py +513 -0
  4. tests/core/test_disagg_executor.py +60 -0
  5. tests/core/test_disagg_utils.py +67 -0
  6. tests/core/test_dp_scheduler.py +724 -0
  7. tests/core/test_init.py +63 -0
  8. tests/distributed/__init__.py +13 -0
  9. tests/distributed/test_distributed_utils.py +120 -0
  10. tests/distributed/test_tpu_connector.py +478 -0
  11. tests/e2e/__init__.py +13 -0
  12. tests/e2e/test_async_scheduler.py +211 -0
  13. tests/e2e/test_data_parallel.py +393 -0
  14. tests/e2e/test_local_disagg.py +257 -0
  15. tests/e2e/test_model_loader.py +268 -0
  16. tests/e2e/test_multi_modal_inference.py +111 -0
  17. tests/e2e/test_pipeline_parallel.py +265 -0
  18. tests/e2e/test_runai_model_streamer_loader.py +104 -0
  19. tests/e2e/test_sampling_params.py +269 -0
  20. tests/e2e/test_speculative_decoding.py +291 -0
  21. tests/e2e/test_structured_decoding.py +46 -0
  22. tests/executors/__init__.py +13 -0
  23. tests/executors/test_ray_distributed_executor.py +199 -0
  24. tests/experimental/__init__.py +13 -0
  25. tests/experimental/test_llama3_jax_stashed.py +208 -0
  26. tests/kernels/__init__.py +13 -0
  27. tests/kernels/collectives/__init__.py +13 -0
  28. tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
  29. tests/kernels/fused_moe_v1_test.py +388 -0
  30. tests/kernels/gmm_test.py +205 -0
  31. tests/kernels/mla_v1_test.py +498 -0
  32. tests/kernels/quantized_matmul_kernel_test.py +159 -0
  33. tests/kernels/ragged_kv_cache_update_v2_test.py +248 -0
  34. tests/kernels/ragged_paged_attention_kernel_v2_test.py +414 -0
  35. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +565 -0
  36. tests/kernels/ragged_paged_attention_kernel_v3_test.py +520 -0
  37. tests/layers/__init__.py +13 -0
  38. tests/layers/common/__init__.py +13 -0
  39. tests/layers/common/test_attention_interface.py +156 -0
  40. tests/layers/common/test_quantization.py +149 -0
  41. tests/layers/jax/__init__.py +13 -0
  42. tests/layers/jax/attention/__init__.py +13 -0
  43. tests/layers/jax/attention/test_common_attention.py +103 -0
  44. tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
  45. tests/layers/jax/attention/test_llama4_attention.py +135 -0
  46. tests/layers/jax/moe/__init__.py +13 -0
  47. tests/layers/jax/moe/test_deepseek_moe.py +235 -0
  48. tests/layers/jax/sample/__init__.py +13 -0
  49. tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
  50. tests/layers/jax/sample/test_sampling.py +115 -0
  51. tests/layers/jax/sample/test_sampling_metadata.py +254 -0
  52. tests/layers/jax/test_layers.py +155 -0
  53. tests/layers/jax/test_qwix.py +969 -0
  54. tests/layers/jax/test_rope.py +93 -0
  55. tests/layers/jax/test_sharding.py +159 -0
  56. tests/layers/jax/test_transformer_block.py +152 -0
  57. tests/layers/vllm/__init__.py +13 -0
  58. tests/layers/vllm/test_attention.py +363 -0
  59. tests/layers/vllm/test_awq.py +405 -0
  60. tests/layers/vllm/test_compressed_tensors_moe.py +202 -0
  61. tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +403 -0
  62. tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +426 -0
  63. tests/layers/vllm/test_fp8.py +17 -0
  64. tests/layers/vllm/test_mxfp4.py +297 -0
  65. tests/layers/vllm/test_unquantized.py +621 -0
  66. tests/layers/vllm/utils.py +72 -0
  67. tests/lora/__init__.py +13 -0
  68. tests/lora/conftest.py +46 -0
  69. tests/lora/test_bgmv.py +57 -0
  70. tests/lora/test_layers.py +666 -0
  71. tests/lora/test_lora.py +147 -0
  72. tests/lora/test_lora_perf.py +67 -0
  73. tests/lora/utils.py +88 -0
  74. tests/models/__init__.py +13 -0
  75. tests/models/common/__init__.py +13 -0
  76. tests/models/common/test_model_loader.py +455 -0
  77. tests/models/jax/__init__.py +13 -0
  78. tests/models/jax/test_deepseek_v3.py +401 -0
  79. tests/models/jax/test_llama3.py +184 -0
  80. tests/models/jax/test_llama4.py +298 -0
  81. tests/models/jax/test_llama_eagle3.py +197 -0
  82. tests/models/jax/test_llama_guard_4.py +242 -0
  83. tests/models/jax/test_qwen2.py +172 -0
  84. tests/models/jax/test_qwen2_5_vl.py +606 -0
  85. tests/models/jax/test_qwen3.py +169 -0
  86. tests/models/jax/test_weight_loading.py +180 -0
  87. tests/models/jax/utils/__init__.py +13 -0
  88. tests/models/jax/utils/test_multi_modal_utils.py +212 -0
  89. tests/platforms/__init__.py +13 -0
  90. tests/platforms/test_tpu_platform.py +54 -0
  91. tests/runner/__init__.py +13 -0
  92. tests/runner/test_block_table.py +395 -0
  93. tests/runner/test_input_batch.py +226 -0
  94. tests/runner/test_kv_cache.py +220 -0
  95. tests/runner/test_kv_cache_manager.py +498 -0
  96. tests/runner/test_multimodal_manager.py +429 -0
  97. tests/runner/test_persistent_batch_manager.py +84 -0
  98. tests/runner/test_speculative_decoding_manager.py +368 -0
  99. tests/runner/test_structured_decoding_manager.py +220 -0
  100. tests/runner/test_tpu_runner.py +202 -0
  101. tests/runner/test_tpu_runner_dp.py +1033 -0
  102. tests/runner/test_tpu_runner_mesh.py +200 -0
  103. tests/runner/test_utils.py +411 -0
  104. tests/spec_decode/__init__.py +13 -0
  105. tests/spec_decode/test_eagle3.py +311 -0
  106. tests/test_base.py +215 -0
  107. tests/test_envs.py +280 -0
  108. tests/test_tpu_info.py +134 -0
  109. tests/test_utils.py +193 -0
  110. tests/worker/__init__.py +13 -0
  111. tests/worker/tpu_worker_test.py +414 -0
  112. tpu_inference/__init__.py +67 -0
  113. tpu_inference/core/__init__.py +13 -0
  114. tpu_inference/core/core_tpu.py +786 -0
  115. tpu_inference/core/disagg_executor.py +118 -0
  116. tpu_inference/core/disagg_utils.py +49 -0
  117. tpu_inference/core/sched/__init__.py +13 -0
  118. tpu_inference/core/sched/dp_scheduler.py +814 -0
  119. tpu_inference/distributed/__init__.py +13 -0
  120. tpu_inference/distributed/jax_parallel_state.py +81 -0
  121. tpu_inference/distributed/tpu_connector.py +732 -0
  122. tpu_inference/distributed/utils.py +112 -0
  123. tpu_inference/env_override.py +9 -0
  124. tpu_inference/envs.py +191 -0
  125. tpu_inference/executors/__init__.py +13 -0
  126. tpu_inference/executors/ray_distributed_executor.py +399 -0
  127. tpu_inference/experimental/__init__.py +13 -0
  128. tpu_inference/experimental/llama3_jax_stashed.py +272 -0
  129. tpu_inference/kernels/__init__.py +13 -0
  130. tpu_inference/kernels/collectives/__init__.py +13 -0
  131. tpu_inference/kernels/collectives/all_gather_matmul.py +741 -0
  132. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +65 -0
  133. tpu_inference/kernels/collectives/util.py +47 -0
  134. tpu_inference/kernels/flash_attention/__init__.py +13 -0
  135. tpu_inference/kernels/flash_attention/kernel.py +772 -0
  136. tpu_inference/kernels/fused_moe/__init__.py +13 -0
  137. tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
  138. tpu_inference/kernels/fused_moe/v1/kernel.py +1612 -0
  139. tpu_inference/kernels/megablox/__init__.py +13 -0
  140. tpu_inference/kernels/megablox/common.py +54 -0
  141. tpu_inference/kernels/megablox/gmm.py +646 -0
  142. tpu_inference/kernels/mla/__init__.py +13 -0
  143. tpu_inference/kernels/mla/v1/__init__.py +13 -0
  144. tpu_inference/kernels/mla/v1/kernel.py +1340 -0
  145. tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
  146. tpu_inference/kernels/quantized_matmul/kernel.py +456 -0
  147. tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
  148. tpu_inference/kernels/quantized_matmul/util.py +58 -0
  149. tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
  150. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
  151. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +876 -0
  152. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +288 -0
  153. tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
  154. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
  155. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1594 -0
  156. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +1586 -0
  157. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +4460 -0
  158. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +548 -0
  159. tpu_inference/kernels/ragged_paged_attention/v3/util.py +65 -0
  160. tpu_inference/layers/__init__.py +13 -0
  161. tpu_inference/layers/common/__init__.py +13 -0
  162. tpu_inference/layers/common/attention_interface.py +403 -0
  163. tpu_inference/layers/common/attention_metadata.py +48 -0
  164. tpu_inference/layers/common/binary_search.py +295 -0
  165. tpu_inference/layers/common/quant_methods.py +23 -0
  166. tpu_inference/layers/common/quantization.py +270 -0
  167. tpu_inference/layers/common/sharding.py +600 -0
  168. tpu_inference/layers/jax/__init__.py +13 -0
  169. tpu_inference/layers/jax/attention/__init__.py +13 -0
  170. tpu_inference/layers/jax/attention/attention.py +268 -0
  171. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +547 -0
  172. tpu_inference/layers/jax/attention/gpt_oss_attention.py +275 -0
  173. tpu_inference/layers/jax/attention/llama4_attention.py +167 -0
  174. tpu_inference/layers/jax/base.py +165 -0
  175. tpu_inference/layers/jax/constants.py +101 -0
  176. tpu_inference/layers/jax/layers.py +315 -0
  177. tpu_inference/layers/jax/misc.py +30 -0
  178. tpu_inference/layers/jax/moe/__init__.py +13 -0
  179. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +615 -0
  180. tpu_inference/layers/jax/moe/gpt_oss_moe.py +199 -0
  181. tpu_inference/layers/jax/moe/moe.py +249 -0
  182. tpu_inference/layers/jax/pp_utils.py +53 -0
  183. tpu_inference/layers/jax/rope.py +294 -0
  184. tpu_inference/layers/jax/rope_interface.py +228 -0
  185. tpu_inference/layers/jax/sample/__init__.py +13 -0
  186. tpu_inference/layers/jax/sample/rejection_sampler.py +528 -0
  187. tpu_inference/layers/jax/sample/sampling.py +110 -0
  188. tpu_inference/layers/jax/sample/sampling_metadata.py +90 -0
  189. tpu_inference/layers/jax/transformer_block.py +121 -0
  190. tpu_inference/layers/vllm/__init__.py +13 -0
  191. tpu_inference/layers/vllm/attention.py +221 -0
  192. tpu_inference/layers/vllm/fused_moe.py +502 -0
  193. tpu_inference/layers/vllm/linear_common.py +221 -0
  194. tpu_inference/layers/vllm/quantization/__init__.py +55 -0
  195. tpu_inference/layers/vllm/quantization/awq.py +221 -0
  196. tpu_inference/layers/vllm/quantization/common.py +124 -0
  197. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
  198. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +135 -0
  199. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +266 -0
  200. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
  201. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +222 -0
  202. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +150 -0
  203. tpu_inference/layers/vllm/quantization/fp8.py +118 -0
  204. tpu_inference/layers/vllm/quantization/mxfp4.py +396 -0
  205. tpu_inference/layers/vllm/quantization/unquantized.py +416 -0
  206. tpu_inference/layers/vllm/sharding.py +244 -0
  207. tpu_inference/logger.py +10 -0
  208. tpu_inference/lora/__init__.py +13 -0
  209. tpu_inference/lora/torch_lora_ops.py +98 -0
  210. tpu_inference/lora/torch_punica_tpu.py +310 -0
  211. tpu_inference/models/__init__.py +13 -0
  212. tpu_inference/models/common/__init__.py +13 -0
  213. tpu_inference/models/common/model_loader.py +520 -0
  214. tpu_inference/models/jax/__init__.py +13 -0
  215. tpu_inference/models/jax/deepseek_v3.py +978 -0
  216. tpu_inference/models/jax/gpt_oss.py +508 -0
  217. tpu_inference/models/jax/jax_intermediate_tensor.py +93 -0
  218. tpu_inference/models/jax/llama3.py +436 -0
  219. tpu_inference/models/jax/llama4.py +643 -0
  220. tpu_inference/models/jax/llama_eagle3.py +350 -0
  221. tpu_inference/models/jax/llama_guard_4.py +375 -0
  222. tpu_inference/models/jax/qwen2.py +390 -0
  223. tpu_inference/models/jax/qwen2_5_vl.py +1232 -0
  224. tpu_inference/models/jax/qwen3.py +318 -0
  225. tpu_inference/models/jax/utils/__init__.py +13 -0
  226. tpu_inference/models/jax/utils/file_utils.py +110 -0
  227. tpu_inference/models/jax/utils/multi_modal_utils.py +177 -0
  228. tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
  229. tpu_inference/models/jax/utils/qwix/qwix_utils.py +713 -0
  230. tpu_inference/models/jax/utils/weight_utils.py +621 -0
  231. tpu_inference/models/vllm/__init__.py +13 -0
  232. tpu_inference/models/vllm/vllm_model_wrapper.py +307 -0
  233. tpu_inference/models/vllm/vllm_model_wrapper_context.py +59 -0
  234. tpu_inference/platforms/__init__.py +16 -0
  235. tpu_inference/platforms/tpu_platform.py +258 -0
  236. tpu_inference/runner/__init__.py +13 -0
  237. tpu_inference/runner/block_table.py +122 -0
  238. tpu_inference/runner/compilation_manager.py +890 -0
  239. tpu_inference/runner/input_batch.py +435 -0
  240. tpu_inference/runner/kv_cache.py +166 -0
  241. tpu_inference/runner/kv_cache_manager.py +508 -0
  242. tpu_inference/runner/lora_utils.py +106 -0
  243. tpu_inference/runner/multimodal_manager.py +231 -0
  244. tpu_inference/runner/persistent_batch_manager.py +296 -0
  245. tpu_inference/runner/speculative_decoding_manager.py +262 -0
  246. tpu_inference/runner/structured_decoding_manager.py +101 -0
  247. tpu_inference/runner/tpu_runner.py +1768 -0
  248. tpu_inference/runner/utils.py +426 -0
  249. tpu_inference/spec_decode/__init__.py +13 -0
  250. tpu_inference/spec_decode/jax/__init__.py +13 -0
  251. tpu_inference/spec_decode/jax/eagle3.py +430 -0
  252. tpu_inference/tpu_info.py +92 -0
  253. tpu_inference/utils.py +345 -0
  254. tpu_inference/worker/__init__.py +13 -0
  255. tpu_inference/worker/tpu_worker.py +468 -0
  256. tpu_inference-0.12.0.dev20251222.dist-info/METADATA +106 -0
  257. tpu_inference-0.12.0.dev20251222.dist-info/RECORD +260 -0
  258. tpu_inference-0.12.0.dev20251222.dist-info/WHEEL +5 -0
  259. tpu_inference-0.12.0.dev20251222.dist-info/licenses/LICENSE +201 -0
  260. tpu_inference-0.12.0.dev20251222.dist-info/top_level.txt +2 -0
tpu_inference/runner/multimodal_manager.py
@@ -0,0 +1,231 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import TYPE_CHECKING
+
+ import jax
+ import jax.numpy as jnp
+ from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
+ from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange
+ from vllm.multimodal.utils import group_mm_kwargs_by_modality
+ from vllm.v1.core.sched.output import SchedulerOutput as VllmSchedulerOutput
+ from vllm.v1.worker.utils import (gather_mm_placeholders,
+                                   scatter_mm_placeholders)
+
+ from tpu_inference.models.jax.utils.multi_modal_utils import (
+     flatten_embeddings, sanity_check_mm_encoder_outputs)
+
+ if TYPE_CHECKING:
+     from tpu_inference.runner.tpu_runner import TPUModelRunner
+
+
+ class MultiModalManager:
+
+     def __init__(self, runner: "TPUModelRunner"):
+         self.runner = runner
+
+     def calc_mrope_positions(self, scheduler_output: "VllmSchedulerOutput"):
+         mrope_pos_ptr = 0
+         for index, req_id in enumerate(self.runner.input_batch.req_ids):
+             req = self.runner.requests[req_id]
+             assert req.mrope_positions is not None
+
+             num_computed_tokens = \
+                 self.runner.input_batch.num_computed_tokens_cpu[index]
+             num_scheduled_tokens = \
+                 scheduler_output.num_scheduled_tokens[req_id]
+             num_prompt_tokens = len(req.prompt_token_ids)
+
+             if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens:
+                 prompt_part_len = max(0,
+                                       num_prompt_tokens - num_computed_tokens)
+                 completion_part_len = max(
+                     0, num_scheduled_tokens - prompt_part_len)
+             else:
+                 prompt_part_len = num_scheduled_tokens
+                 completion_part_len = 0
+
+             assert num_scheduled_tokens == prompt_part_len + completion_part_len
+
+             if prompt_part_len > 0:
+                 # prompt's mrope_positions are pre-computed
+                 dst_start = mrope_pos_ptr
+                 dst_end = mrope_pos_ptr + prompt_part_len
+                 src_start = num_computed_tokens
+                 src_end = num_computed_tokens + prompt_part_len
+
+                 self.runner.mrope_positions_cpu[:, dst_start:dst_end] = \
+                     req.mrope_positions[:, src_start:src_end]
+
+                 mrope_pos_ptr += prompt_part_len
+
+             if completion_part_len > 0:
+                 # compute completion's mrope_positions on-the-fly
+                 dst_start = mrope_pos_ptr
+                 dst_end = mrope_pos_ptr + completion_part_len
+
+                 MRotaryEmbedding.get_next_input_positions_tensor(
+                     out=self.runner.mrope_positions_cpu,
+                     out_offset=dst_start,
+                     mrope_position_delta=req.mrope_position_delta,
+                     context_len=num_computed_tokens + prompt_part_len,
+                     num_new_tokens=completion_part_len,
+                 )
+
+                 mrope_pos_ptr += completion_part_len
+
+     def execute_mm_encoder(self, scheduler_output: "VllmSchedulerOutput"):
+         import torch
+         scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
+         if not scheduled_encoder_inputs:
+             return
+
+         # Batch the multi-modal inputs.
+         mm_kwargs = list[MultiModalKwargsItem]()
+         # List of tuple (mm_hash, pos_info)
+         mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
+         for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
+             req_state = self.runner.requests[req_id]
+             for mm_input_id in encoder_input_ids:
+                 mm_feature = req_state.mm_features[mm_input_id]
+                 mm_hash = mm_feature.identifier
+                 mm_kwargs.append(mm_feature.data)
+                 mm_hashes_pos.append((mm_hash, mm_feature.mm_position))
+
+         # Batch mm inputs as much as we can: if a request in the batch has
+         # multiple modalities or a different modality than the previous one,
+         # we process it separately to preserve item order.
+         # FIXME(ywang96): This is a hacky way to deal with multiple modalities
+         # in the same batch while still being able to benefit from batching
+         # multimodal inputs. The proper solution should be reordering the
+         # encoder outputs.
+         encoder_outputs = []
+         for _, num_items, mm_kwargs_group in group_mm_kwargs_by_modality(
+                 mm_kwargs, merge_by_field_config=False):
+             batched_mm_inputs = mm_kwargs_group
+             # Convert torch tensors to numpy arrays that JAX can handle.
+             if "pixel_values" in batched_mm_inputs and isinstance(
+                     batched_mm_inputs["pixel_values"], list):
+                 batched_mm_inputs["pixel_values"] = torch.cat(
+                     batched_mm_inputs["pixel_values"], dim=0)
+
+             image_grid_thw = ()
+             for key, value in batched_mm_inputs.items():
+                 if isinstance(value, torch.Tensor):
+                     if key == 'image_grid_thw':
+                         # change it to tuple of tuples to make it hashable for JIT
+
+                         # Shape: (B, N, 3) -> (B*N, 3) -> tuple of tuples
+                         grid_thw_tensor = batched_mm_inputs[key]
+                         grid_thw_reshaped = grid_thw_tensor.reshape(-1, 3)
+                         image_grid_thw = tuple(
+                             tuple(row) for row in grid_thw_reshaped.tolist())
+
+                         continue
+
+                     if value.dtype == torch.bfloat16:
+                         batched_mm_inputs[key] = value.to(
+                             torch.float32).numpy().astype(jnp.bfloat16)
+                     else:
+                         batched_mm_inputs[key] = value.numpy()
+             batched_mm_inputs.pop('image_grid_thw')
+
+             # Run the encoder.
+             # `curr_group_outputs` is either of the following:
+             # 1. A tensor of shape (num_items, feature_size, hidden_size)
+             #    in case feature_size is fixed across all multimodal items.
+             # 2. A list or tuple (length: num_items) of tensors, each of shape
+             #    (feature_size, hidden_size) in case the feature size is dynamic
+             #    depending on the input multimodal items.
+             curr_group_outputs = self.runner.get_multimodal_embeddings_fn(
+                 self.runner.state, image_grid_thw, **batched_mm_inputs)
+
+             sanity_check_mm_encoder_outputs(
+                 curr_group_outputs,
+                 expected_num_items=num_items,
+             )
+
+             for output in curr_group_outputs:
+                 encoder_outputs.append(output)
+
+         # Cache the encoder outputs.
+         for (mm_hash, pos_info), output in zip(
+                 mm_hashes_pos,
+                 encoder_outputs,
+         ):
+             if req_id not in self.runner.encoder_cache:
+                 self.runner.encoder_cache[req_id] = {}
+
+             self.runner.encoder_cache[mm_hash] = scatter_mm_placeholders(
+                 output,
+                 is_embed=pos_info.is_embed,
+             )
+
+     def gather_mm_embeddings(self, scheduler_output: "VllmSchedulerOutput",
+                              target_pad_len: int) -> list[jax.Array]:
+         mm_embeds: list[jax.Array] = []
+         for req_id in self.runner.input_batch.req_ids:
+             num_scheduled_tokens = scheduler_output.num_scheduled_tokens[
+                 req_id]
+             req_state = self.runner.requests[req_id]
+             num_computed_tokens = req_state.num_computed_tokens
+             mm_features = req_state.mm_features
+             for _, mm_feature in enumerate(mm_features):
+                 pos_info = mm_feature.mm_position
+                 start_pos = pos_info.offset
+                 num_encoder_tokens = pos_info.length
+
+                 # The encoder output is needed if the two ranges overlap:
+                 # [num_computed_tokens,
+                 #  num_computed_tokens + num_scheduled_tokens) and
+                 # [start_pos, start_pos + num_encoder_tokens)
+                 if start_pos >= num_computed_tokens + num_scheduled_tokens:
+                     # The encoder output is not needed in this step.
+                     break
+                 if start_pos + num_encoder_tokens <= num_computed_tokens:
+                     # The encoder output is already processed and stored
+                     # in the decoder's KV cache.
+                     continue
+
+                 start_idx = max(num_computed_tokens - start_pos, 0)
+                 end_idx = min(
+                     num_computed_tokens - start_pos + num_scheduled_tokens,
+                     num_encoder_tokens)
+                 assert start_idx < end_idx
+                 mm_hash = mm_feature.identifier
+                 encoder_output = self.runner.encoder_cache.get(mm_hash, None)
+                 assert encoder_output is not None, \
+                     f"Encoder cache miss for {mm_hash}."
+                 encoder_output = self.runner.encoder_cache[mm_hash]
+
+                 if (is_embed := pos_info.is_embed) is not None:
+                     is_embed = is_embed[start_idx:end_idx]
+
+                 mm_embeds_item = gather_mm_placeholders(
+                     encoder_output[start_idx:end_idx],
+                     is_embed=is_embed,
+                 )
+                 mm_embeds.append(mm_embeds_item)
+         if not mm_embeds:
+             return None
+         flattened_embeds = flatten_embeddings(mm_embeds)
+         if flattened_embeds.shape[0] == 0:
+             return None
+
+         padding = jnp.zeros((target_pad_len - flattened_embeds.shape[0],
+                              flattened_embeds.shape[1]),
+                             dtype=flattened_embeds.dtype)
+         flattened_embeds = jnp.concatenate([flattened_embeds, padding], axis=0)
+
+         return flattened_embeds
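The window arithmetic in gather_mm_embeddings above is the core of the multimodal path: a cached encoder output is consumed only where its placeholder range overlaps the tokens scheduled in the current step. The standalone Python sketch below is not part of the wheel (the function name encoder_slice_for_step is illustrative); it reproduces that arithmetic so the overlap logic can be checked in isolation.

# Minimal, self-contained sketch of the overlap computation used in
# gather_mm_embeddings. Illustrative only; not package code.
from typing import Optional, Tuple


def encoder_slice_for_step(num_computed_tokens: int,
                           num_scheduled_tokens: int,
                           start_pos: int,
                           num_encoder_tokens: int) -> Optional[Tuple[int, int]]:
    """Return the [start_idx, end_idx) slice of the encoder output needed now.

    The slice is the overlap of the step window
    [num_computed_tokens, num_computed_tokens + num_scheduled_tokens)
    with the placeholder range [start_pos, start_pos + num_encoder_tokens),
    expressed relative to the encoder output. Returns None when the ranges
    do not overlap (placeholder not reached yet, or already in the KV cache).
    """
    step_end = num_computed_tokens + num_scheduled_tokens
    if start_pos >= step_end:
        return None  # placeholder not reached in this step
    if start_pos + num_encoder_tokens <= num_computed_tokens:
        return None  # already processed and stored in the KV cache
    start_idx = max(num_computed_tokens - start_pos, 0)
    end_idx = min(step_end - start_pos, num_encoder_tokens)
    return start_idx, end_idx


# Example: 100 prompt tokens already computed, 64 scheduled this step, and an
# image placeholder spanning tokens [90, 154) -> slice [10, 64) of the
# 64-token encoder output is gathered.
assert encoder_slice_for_step(100, 64, 90, 64) == (10, 64)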
tpu_inference/runner/persistent_batch_manager.py
@@ -0,0 +1,296 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Dict
+
+ import jax
+ from vllm.v1.core.sched.output import SchedulerOutput as VllmSchedulerOutput
+
+ from tpu_inference.logger import init_logger
+ from tpu_inference.runner.input_batch import CachedRequestState, InputBatch
+
+ logger = init_logger(__name__)
+
+
+ class PersistentBatchManager:
+
+     def __init__(self, requests: Dict[str, CachedRequestState],
+                  input_batch: InputBatch,
+                  encoder_cache: Dict[str, 'jax.Array'],
+                  uses_mrope: bool, model_config, is_last_rank: bool):
+         self.requests = requests
+         self.input_batch = input_batch
+         self.encoder_cache = encoder_cache
+         self.uses_mrope = uses_mrope
+         self.model_config = model_config
+         self.is_last_rank = is_last_rank
+
+     def _reorder_batch(self, scheduler_output: "VllmSchedulerOutput") -> int:
+         """Reorder the scheduled requests to an RPA-kernel-friendly distribution
+         (decode_only, fixed_chunked_prefill_only, mixed) and set the request
+         distribution accordingly.
+
+         Returns:
+             The number of swaps in requests.
+         """
+         # Note(jevinjiang): currently we only consider decode_only.
+         num_reqs = self.input_batch.num_reqs
+         swap_cnt = 0
+         if num_reqs <= 0:
+             return swap_cnt
+         # Use two-pointer approach to reorder the decode requests to front.
+         i, j = 0, num_reqs - 1
+         while i < j:
+             i_req_id = self.input_batch.req_ids[i]
+             j_req_id = self.input_batch.req_ids[j]
+
+             if scheduler_output.num_scheduled_tokens[i_req_id] == 1:
+                 # i is a decode request, move to the next one.
+                 i += 1
+             elif scheduler_output.num_scheduled_tokens[j_req_id] > 1:
+                 # j is a prefill request, move to the previous one.
+                 j -= 1
+             else:
+                 # Swap i and j.
+                 self.input_batch.swap_states(i, j)
+                 i += 1
+                 j -= 1
+                 swap_cnt += 1
+
+         num_decode = i + int(scheduler_output.num_scheduled_tokens[
+             self.input_batch.req_ids[i]] == 1)
+
+         self.input_batch.request_distribution = [
+             num_decode, num_decode, num_reqs
+         ]
+
+         return swap_cnt
+
+     def update_states(self, scheduler_output: "VllmSchedulerOutput",
+                       get_mrope_input_positions_fn) -> bool:
+         """Update the cached states and the persistent batch with the scheduler
+         output.
+
+         The updated states are used by the `_prepare_inputs` function to create
+         the input TPU tensors for the model.
+
+         Returns:
+             True if there is a new/resumed/paused/finished request.
+             If False, we can skip copying SamplingMetadata to the TPU.
+         """
+         # Remove finished requests from the cached states.
+         for req_id in scheduler_output.finished_req_ids:
+             self.requests.pop(req_id, None)
+
+         # Remove the finished requests from the persistent batch.
+         # NOTE(woosuk): There could be an edge case where finished_req_ids and
+         # scheduled_req_ids overlap. This happens when a request is aborted and
+         # then resubmitted with the same ID. In this case, we treat them as two
+         # distinct requests - clearing the cached states for the first request
+         # and handling the second as a new request.
+         removed_req_indices: list[int] = []
+         for req_id in scheduler_output.finished_req_ids:
+             req_index = self.input_batch.remove_request(req_id)
+             if req_index is not None:
+                 removed_req_indices.append(req_index)
+
+         # Free the cached encoder outputs.
+         for mm_hash in scheduler_output.free_encoder_mm_hashes:
+             self.encoder_cache.pop(mm_hash, None)
+
+         # Remove the unscheduled requests from the persistent batch.
+         # NOTE(woosuk): The unscheduled requests are either preempted requests
+         # or running requests that are not scheduled in this step. We remove
+         # them from the persistent batch but keep their cached states since
+         # they will be scheduled again sometime in the future.
+         scheduled_req_ids = scheduler_output.num_scheduled_tokens.keys()
+         cached_req_ids = self.input_batch.req_id_to_index.keys()
+         unscheduled_req_ids = cached_req_ids - scheduled_req_ids
+         # NOTE(woosuk): The persistent batch optimization assumes that
+         # consecutive batches contain mostly the same requests. If batches
+         # have low request overlap (e.g., alternating between two distinct
+         # sets of requests), this optimization becomes very inefficient.
+         for req_id in unscheduled_req_ids:
+             req_index = self.input_batch.remove_request(req_id)
+             assert req_index is not None
+             removed_req_indices.append(req_index)
+
+         req_ids_to_add: list[str] = []
+         # Add new requests to the cached states.
+         for new_req_data in scheduler_output.scheduled_new_reqs:
+             req_id = new_req_data.req_id
+             sampling_params = new_req_data.sampling_params
+
+             self.requests[req_id] = CachedRequestState(
+                 req_id=req_id,
+                 prompt_token_ids=new_req_data.prompt_token_ids,
+                 mm_features=new_req_data.mm_features,
+                 sampling_params=sampling_params,
+                 pooling_params=None,
+                 generator=None,
+                 block_ids=new_req_data.block_ids,
+                 num_computed_tokens=new_req_data.num_computed_tokens,
+                 output_token_ids=[],
+                 lora_request=new_req_data.lora_request,
+             )
+
+             req_ids_to_add.append(req_id)
+
+             # Only relevant for models using M-RoPE (e.g., Qwen2-VL)
+             if self.uses_mrope:
+                 image_grid_thw = []
+                 video_grid_thw = []
+                 second_per_grid_ts = []
+                 audio_feature_lengths = []
+                 use_audio_in_video = False
+                 for mm_feature in self.requests[req_id].mm_features:
+                     item = mm_feature.data
+                     if item is None:
+                         continue
+                     mm_input = item.get_data()
+                     if mm_input.get("image_grid_thw") is not None:
+                         image_grid_thw.append(
+                             mm_input["image_grid_thw"].tolist())
+                     if mm_input.get("video_grid_thw") is not None:
+                         video_grid_thw.append(
+                             mm_input["video_grid_thw"].tolist())
+                     if mm_input.get("second_per_grid_ts") is not None:
+                         second_per_grid_ts.append(
+                             mm_input["second_per_grid_ts"])
+                     if mm_input.get("audio_feature_lengths") is not None:
+                         audio_feature_lengths.append(
+                             mm_input["audio_feature_lengths"])
+                     if mm_input.get("use_audio_in_video") is True:
+                         use_audio_in_video = True
+
+                 hf_config = self.model_config.hf_config
+
+                 self.requests[req_id].mrope_positions, self.requests[
+                     req_id].mrope_position_delta = get_mrope_input_positions_fn(
+                         self.requests[req_id].prompt_token_ids,
+                         hf_config=hf_config,
+                         image_grid_thw=image_grid_thw,
+                         video_grid_thw=video_grid_thw,
+                         second_per_grid_ts=second_per_grid_ts,
+                         audio_feature_lengths=audio_feature_lengths,
+                         use_audio_in_video=use_audio_in_video,
+                     )
+
+         # Update the states of the running/resumed requests.
+         req_data = scheduler_output.scheduled_cached_reqs
+         for i, req_id in enumerate(req_data.req_ids):
+             req_state = self.requests[req_id]
+             num_computed_tokens = req_data.num_computed_tokens[i]
+             new_block_ids = req_data.new_block_ids[i]
+             resumed_from_preemption = req_data.resumed_from_preemption[i]
+             num_output_tokens = req_data.num_output_tokens[i]
+
+             # Update the cached states.
+             req_state.num_computed_tokens = num_computed_tokens
+             req_index = self.input_batch.req_id_to_index.get(req_id)
+
+             if not self.is_last_rank:
+                 # When using PP, the scheduler sends the sampled tokens back,
+                 # because there's no direct communication between the first-
+                 # stage worker and the last-stage worker.
+                 new_token_ids = req_data.new_token_ids[i]
+                 # Add the sampled token(s) from the previous step (if any).
+                 # This doesn't include "unverified" tokens like spec tokens.
+                 num_new_tokens = (num_computed_tokens + len(new_token_ids) -
+                                   req_state.num_tokens)
+                 if num_new_tokens == 1:
+                     req_state.output_token_ids.append(new_token_ids[-1])
+                 elif num_new_tokens > 0:
+                     req_state.output_token_ids.extend(
+                         new_token_ids[-num_new_tokens:])
+                 elif num_output_tokens < len(req_state.output_token_ids):
+                     del req_state.output_token_ids[num_output_tokens:]
+                     if req_index is not None:
+                         end_idx = (self.input_batch.num_prompt_tokens[req_index]
+                                    + num_output_tokens)
+                         self.input_batch.num_tokens[req_index] = end_idx
+                         self.input_batch.num_tokens_no_spec[req_index] = end_idx
+
+             # Update the block IDs.
+             if not resumed_from_preemption:
+                 if new_block_ids is not None:
+                     # Append the new blocks to the existing block IDs.
+                     for block_ids, new_ids in zip(req_state.block_ids,
+                                                   new_block_ids):
+                         block_ids.extend(new_ids)
+             else:
+                 assert new_block_ids is not None
+                 # The request is resumed from preemption.
+                 # Replace the existing block IDs with the new ones.
+                 req_state.block_ids = new_block_ids
+
+             if req_index is None:
+                 # The request is not in the persistent batch.
+                 # The request was either preempted and resumed later, or was not
+                 # scheduled in the previous step and needs to be added again.
+                 req_ids_to_add.append(req_id)
+                 continue
+
+             # Update the persistent batch.
+             self.input_batch.num_computed_tokens_cpu[
+                 req_index] = num_computed_tokens
+             if new_block_ids is not None:
+                 self.input_batch.block_table.append_row(
+                     new_block_ids, req_index)
+
+             # For the last rank, we don't need to update the token_ids_cpu
+             # because the sampled tokens are already cached.
+             if not self.is_last_rank:
+                 start_token_index = num_computed_tokens
+                 end_token_index = num_computed_tokens + len(new_token_ids)
+                 self.input_batch.token_ids_cpu[
+                     req_index,
+                     start_token_index:end_token_index] = new_token_ids
+                 self.input_batch.num_tokens_no_spec[
+                     req_index] = end_token_index
+                 self.input_batch.num_tokens[req_index] = end_token_index
+
+             # Add spec_token_ids to token_ids_cpu.
+             spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(
+                 req_id, ())
+             if spec_token_ids:
+                 num_spec_tokens = len(spec_token_ids)
+                 start_index = self.input_batch.num_tokens_no_spec[req_index]
+                 end_token_index = start_index + num_spec_tokens
+                 self.input_batch.token_ids_cpu[
+                     req_index, start_index:end_token_index] = spec_token_ids
+                 # NOTE(woosuk): `num_tokens` here may include spec tokens.
+                 self.input_batch.num_tokens[req_index] += num_spec_tokens
+
+         # Add the new or resumed requests to the persistent batch.
+         # The smaller empty indices are filled first.
+         removed_req_indices = sorted(removed_req_indices, reverse=True)
+         for req_id in req_ids_to_add:
+             req_state = self.requests[req_id]
+             if removed_req_indices:
+                 # Fill the empty index.
+                 req_index = removed_req_indices.pop()
+             else:
+                 # Append to the end.
+                 req_index = None
+             self.input_batch.add_request(req_state, req_index)
+
+         # Condense the batched states if there are empty indices.
+         if removed_req_indices:
+             self.input_batch.condense(removed_req_indices)
+
+         batch_changed = len(unscheduled_req_ids) > 0 or len(req_ids_to_add) > 0
+         # TODO(jevinjiang): I assume we do not need to set batch_changed to
+         # true if just swapping requests.
+         self._reorder_batch(scheduler_output)
+         return batch_changed
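The _reorder_batch method above partitions the scheduled requests so that decode requests (exactly one scheduled token) come first, which is the layout the ragged-paged-attention kernel expects. The standalone Python sketch below is not part of the wheel (reorder_decodes_first is an illustrative name, and it shuffles a plain list rather than InputBatch rows); it shows the same two-pointer pass on toy data.

# Minimal, self-contained sketch of the two-pointer partition used by
# _reorder_batch. Illustrative only; not package code.
from typing import Dict, List, Tuple


def reorder_decodes_first(req_ids: List[str],
                          num_scheduled_tokens: Dict[str, int]) -> Tuple[int, int]:
    """Partition req_ids in place so decodes come first.

    Returns (num_decode, swap_cnt), mirroring the counters the real method
    derives for the request distribution and its return value.
    """
    if not req_ids:
        return 0, 0
    i, j = 0, len(req_ids) - 1
    swap_cnt = 0
    while i < j:
        if num_scheduled_tokens[req_ids[i]] == 1:
            i += 1      # already a decode, keep it in front
        elif num_scheduled_tokens[req_ids[j]] > 1:
            j -= 1      # already a prefill, keep it at the back
        else:
            # Front element is a prefill and back element is a decode: swap.
            req_ids[i], req_ids[j] = req_ids[j], req_ids[i]
            i += 1
            j -= 1
            swap_cnt += 1
    num_decode = i + int(num_scheduled_tokens[req_ids[i]] == 1)
    return num_decode, swap_cnt


reqs = ["p0", "d0", "p1", "d1"]
tokens = {"p0": 17, "d0": 1, "p1": 33, "d1": 1}
print(reorder_decodes_first(reqs, tokens))  # (2, 1); reqs is now ["d1", "d0", "p1", "p0"]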