tpu-inference 0.11.1.dev202511220812__py3-none-any.whl → 0.12.0.dev20251213__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tpu-inference might be problematic.

Files changed (59)
  1. tests/kernels/fused_moe_v1_test.py +303 -34
  2. tests/kernels/mla_v1_test.py +129 -41
  3. tests/kernels/quantized_matmul_kernel_test.py +2 -34
  4. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +3 -1
  5. tests/kernels/ragged_paged_attention_kernel_v3_test.py +3 -1
  6. tests/lora/test_layers.py +4 -1
  7. tests/lora/test_lora_perf.py +53 -0
  8. tests/test_envs.py +110 -12
  9. tests/test_quantization.py +3 -0
  10. tests/test_utils.py +1 -2
  11. tpu_inference/distributed/tpu_connector.py +1 -1
  12. tpu_inference/envs.py +92 -8
  13. tpu_inference/executors/ray_distributed_executor.py +5 -1
  14. tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
  15. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
  16. tpu_inference/kernels/fused_moe/v1/kernel.py +712 -143
  17. tpu_inference/kernels/mla/v1/kernel.py +98 -120
  18. tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
  19. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
  20. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
  21. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +82 -32
  22. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +146 -85
  23. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +2 -1
  24. tpu_inference/kernels/ragged_paged_attention/v3/util.py +2 -1
  25. tpu_inference/layers/common/attention_interface.py +7 -1
  26. tpu_inference/layers/common/sharding.py +11 -7
  27. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +232 -64
  28. tpu_inference/layers/jax/attention/gpt_oss_attention.py +5 -5
  29. tpu_inference/layers/vllm/fused_moe.py +170 -208
  30. tpu_inference/layers/vllm/linear_common.py +43 -21
  31. tpu_inference/layers/vllm/quantization/common.py +11 -6
  32. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +4 -3
  33. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +74 -65
  34. tpu_inference/layers/vllm/quantization/mxfp4.py +140 -94
  35. tpu_inference/layers/vllm/quantization/unquantized.py +103 -80
  36. tpu_inference/models/common/model_loader.py +78 -22
  37. tpu_inference/models/jax/deepseek_v3.py +185 -64
  38. tpu_inference/models/jax/gpt_oss.py +3 -3
  39. tpu_inference/models/jax/llama_eagle3.py +4 -5
  40. tpu_inference/models/jax/qwen2_5_vl.py +161 -47
  41. tpu_inference/models/jax/utils/quantization/quantization_utils.py +7 -8
  42. tpu_inference/models/jax/utils/weight_utils.py +203 -155
  43. tpu_inference/models/vllm/vllm_model_wrapper.py +11 -5
  44. tpu_inference/platforms/tpu_platform.py +29 -48
  45. tpu_inference/runner/compilation_manager.py +112 -46
  46. tpu_inference/runner/kv_cache.py +40 -20
  47. tpu_inference/runner/kv_cache_manager.py +40 -31
  48. tpu_inference/runner/persistent_batch_manager.py +40 -2
  49. tpu_inference/runner/structured_decoding_manager.py +2 -3
  50. tpu_inference/runner/tpu_runner.py +94 -51
  51. tpu_inference/runner/utils.py +2 -2
  52. tpu_inference/spec_decode/jax/eagle3.py +71 -22
  53. tpu_inference/utils.py +41 -14
  54. tpu_inference/worker/tpu_worker.py +43 -45
  55. {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/METADATA +8 -9
  56. {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/RECORD +59 -58
  57. {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/WHEEL +0 -0
  58. {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/licenses/LICENSE +0 -0
  59. {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/top_level.txt +0 -0
tpu_inference/runner/kv_cache_manager.py

@@ -7,8 +7,8 @@ import numpy as np
 import vllm.envs as envs
 from jax.sharding import NamedSharding, PartitionSpec
 from torchax.ops.mappings import t2j_dtype
-from vllm.attention import Attention
 from vllm.attention.backends.abstract import AttentionType
+from vllm.attention.layer import Attention
 from vllm.config import get_layers_from_vllm_config
 from vllm.utils.math_utils import cdiv
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
@@ -39,20 +39,30 @@ class KVCacheManager:
         # means this layer will perform attention using the keys and values
         # from the KV cache of `shared_kv_cache_layers[layer_name]`.
         self.shared_kv_cache_layers: dict[str, str] = {}
+        self.use_mla = self.runner.model_config.use_mla
 
     def get_kv_cache_spec(self):
         # TODO(xiang): this hack tricks engine core to init successfully
         block_size = self.runner.cache_config.block_size
-        use_mla = self.runner.model_config.use_mla
         kv_cache_spec: dict[str, KVCacheSpec] = {}
 
         # If use pure jax (MODEL_IMPL_TYPE=flax_nnx), we don't register
         # attention into compilation config.
         # Use FullAttentionSpec for each layer
         # TODO(pooyam): Is it possible to merge the logic for vllm and non-vllm models?
+        model_config = self.runner.model_config
+        if self.use_mla:
+            # Individually pad the RopE and latents
+            qk_rope_head_dim = getattr(model_config.hf_text_config,
+                                       "qk_rope_head_dim", 0)
+            padded_kv_lora_rank = common_utils.align_to(
+                model_config.hf_text_config.kv_lora_rank, 128)
+            padded_qk_rope_head_dim = common_utils.align_to(
+                qk_rope_head_dim, 128)
+            mla_head_size = padded_kv_lora_rank + padded_qk_rope_head_dim
+
         if len(self.runner.vllm_config.compilation_config.
                static_forward_context) == 0:
-            model_config = self.runner.model_config
             parallel_config = self.runner.parallel_config
             # Pad num_kv_heads to multiple of TP size.
             num_kv_heads = common_utils.get_padded_num_heads(
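For context on the padding arithmetic introduced above: each MLA component is rounded up to a TPU-friendly multiple of 128 with `common_utils.align_to` before the two parts are summed into the cache head size. A minimal sketch, assuming `align_to` rounds up to the next multiple; the concrete dimensions below are DeepSeek-V3-style illustrations, not values taken from this diff:

def align_to(x: int, multiple: int) -> int:
    # Round x up to the next multiple (assumed behaviour of common_utils.align_to).
    return ((x + multiple - 1) // multiple) * multiple

kv_lora_rank = 512       # illustrative latent width
qk_rope_head_dim = 64    # illustrative RoPE sub-head width
mla_head_size = align_to(kv_lora_rank, 128) + align_to(qk_rope_head_dim, 128)
assert mla_head_size == 512 + 128  # latent and RoPE parts are padded separately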
@@ -61,11 +71,11 @@ class KVCacheManager:
             head_size = common_utils.get_padded_head_dim(
                 model_config.get_head_size())
             for i in range(model_config.get_num_layers(parallel_config)):
-                if use_mla:
+                if self.use_mla:
                     kv_cache_spec[f"layer.{i}"] = MLAAttentionSpec(
                         block_size=block_size,
-                        num_kv_heads=num_kv_heads,
-                        head_size=head_size,
+                        num_kv_heads=1,
+                        head_size=mla_head_size,
                         dtype=self.runner.kv_cache_dtype,
                         cache_dtype_str=self.runner.vllm_config.cache_config.
                         cache_dtype)
@@ -83,14 +93,13 @@ class KVCacheManager:
                 self.runner.mesh.shape["model"])
             head_size = common_utils.get_padded_head_dim(
                 hf_config.hidden_size // hf_config.num_attention_heads)
-
             # Eagle3 has only 1 layer
             for i in range(1):
-                if use_mla:
-                    kv_cache_spec[f"layer.{i}"] = MLAAttentionSpec(
+                if self.use_mla:
+                    kv_cache_spec[f"draft_layer.{i}"] = MLAAttentionSpec(
                         block_size=block_size,
-                        num_kv_heads=num_kv_heads,
-                        head_size=head_size,
+                        num_kv_heads=1,
+                        head_size=mla_head_size,
                         dtype=self.runner.kv_cache_dtype,
                         cache_dtype_str=self.runner.vllm_config.
                         cache_config.cache_dtype)
@@ -104,6 +113,7 @@ class KVCacheManager:
         # Else propagate attention modules from compilation config.
         layers = get_layers_from_vllm_config(self.runner.vllm_config,
                                              Attention)
+        logger.warning(f"Compilation num_layers = {len(layers.items())}")
         for layer_name, attn_module in layers.items():
             if (kv_tgt_layer :=
                     attn_module.kv_sharing_target_layer_name) is not None:
@@ -127,11 +137,11 @@ class KVCacheManager:
                         attn_module.head_size),
                     dtype=self.runner.kv_cache_dtype,
                     sliding_window=attn_module.sliding_window)
-            elif use_mla:
-                kv_cache_spec[f"layer.{i}"] = MLAAttentionSpec(
+            elif self.use_mla:
+                kv_cache_spec[layer_name] = MLAAttentionSpec(
                     block_size=block_size,
-                    num_kv_heads=attn_module.num_kv_heads,
-                    head_size=attn_module.head_size,
+                    num_kv_heads=1,
+                    head_size=mla_head_size,
                     dtype=self.runner.kv_cache_dtype,
                     cache_dtype_str=self.runner.vllm_config.
                     cache_config.cache_dtype)
@@ -198,14 +208,20 @@ class KVCacheManager:
             # num_blocks must be a multiple of dp_size
             num_blocks = (num_blocks // dp_size) * dp_size
             # NOTE: we'll multiply the num_kv_heads by 2 in the function
+            if self.use_mla:
+                head_size = self.runner.model_config.hf_config.kv_lora_rank + \
+                    self.runner.model_config.hf_config.qk_rope_head_dim
+            else:
+                head_size = representative_spec.head_size
             kv_cache = create_kv_caches(
                 num_blocks=num_blocks,
                 block_size=representative_spec.block_size,
                 num_kv_heads=representative_spec.num_kv_heads,
-                head_size=representative_spec.head_size,
+                head_size=head_size,
                 mesh=self.runner.mesh,
                 layer_names=[f'kv_cache_tensor.{i}'],
                 cache_dtype=t2j_dtype(representative_spec.dtype),
+                use_mla=self.use_mla,
             )[0]
             kv_caches.append(kv_cache)
             num_blocks_list.append(num_blocks)
@@ -289,13 +305,8 @@ class KVCacheManager:
 
         def _update_layer(cache, slices):
             """The function to apply to each layer's cache and slices."""
-            reshaped_slices = slices.reshape(-1, 1, block_size,
-                                             *slices.shape[1:])
-            for (i, block_idx) in enumerate(block_numbers):
-                cache = jax.lax.dynamic_update_slice_in_dim(cache,
-                                                            reshaped_slices[i],
-                                                            block_idx,
-                                                            axis=0)
+            reshaped_slices = slices.reshape(-1, block_size, *slices.shape[1:])
+            cache.at[block_numbers].set(reshaped_slices)
             return cache
 
         return jax.tree.map(_update_layer, kv_caches, kv_cache_slices)
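The rewritten `_update_layer` above replaces a per-block `jax.lax.dynamic_update_slice_in_dim` loop with a single batched indexed update. A minimal sketch of that idiom with illustrative shapes; note that `.at[...].set(...)` is functional, i.e. it returns a new array rather than mutating `cache`, so the result has to be rebound for the update to take effect:

import jax.numpy as jnp

# (num_blocks, block_size, num_kv_heads, head_dim) -- shapes are illustrative.
cache = jnp.zeros((8, 4, 2, 16))
slices = jnp.ones((2 * 4, 2, 16))          # two blocks' worth of token slices
block_numbers = jnp.array([3, 5])          # destination block indices
reshaped = slices.reshape(-1, 4, *slices.shape[1:])
cache = cache.at[block_numbers].set(reshaped)   # rebind: .at[].set() is out-of-place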
@@ -348,16 +359,12 @@ class KVCacheManager:
         """
         if block_ids == list(range(block_ids[0],
                                    block_ids[0] + len(block_ids))):
-            with runner_utils.LatencyTracker(
-                    "BatchedGatherKVSlices-for-blocks"):
-                batched_kv_cache_per_layer = self._jitted_gather_continuous_kv_cache(
-                    self.runner.kv_caches, block_ids[0], len(block_ids))
+            batched_kv_cache_per_layer = self._jitted_gather_continuous_kv_cache(
+                self.runner.kv_caches, block_ids[0], len(block_ids))
 
         else:
-            with runner_utils.LatencyTracker(
-                    "BatchedGatherKVSlices-for-blocks"):
-                batched_kv_cache_per_layer = self._jitted_gather_kv_cache(
-                    self.runner.kv_caches, jnp.array(block_ids))
+            batched_kv_cache_per_layer = self._jitted_gather_kv_cache(
+                self.runner.kv_caches, jnp.array(block_ids))
         return batched_kv_cache_per_layer
 
     def transfer_kv_cache(self,
@@ -446,6 +453,7 @@ class KVCacheManager:
                     kv_cache_slices,
                     start_block,
                 )
+            jax.block_until_ready(self.runner.kv_caches)
         else:
             with runner_utils.LatencyTracker(
                     f"JittedInsertKVCache-b{len(block_numbers)}"):
@@ -457,6 +465,7 @@ class KVCacheManager:
                     kv_cache_slices,
                     jnp.array(block_numbers),
                 )
+            jax.block_until_ready(self.runner.kv_caches)
 
         logger.debug(
             f"Updated kv cache entries cnt={len(self.runner.kv_caches)}")
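`jax.block_until_ready` is added after both insertion paths so the host waits for the asynchronously dispatched cache updates to finish before proceeding. A small sketch of that call on a pytree of arrays (shapes illustrative):

import jax
import jax.numpy as jnp

kv_caches = [jnp.zeros((16, 4, 2, 128)) for _ in range(2)]
kv_caches = [c + 1 for c in kv_caches]   # dispatched asynchronously on the device
jax.block_until_ready(kv_caches)         # blocks until every leaf has materialized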
tpu_inference/runner/persistent_batch_manager.py

@@ -14,12 +14,13 @@ class PersistentBatchManager:
     def __init__(self, requests: Dict[str, CachedRequestState],
                  input_batch: InputBatch, encoder_cache: Dict[str,
                                                               'jax.Array'],
-                 uses_mrope: bool, model_config):
+                 uses_mrope: bool, model_config, is_last_rank: bool):
         self.requests = requests
         self.input_batch = input_batch
         self.encoder_cache = encoder_cache
         self.uses_mrope = uses_mrope
         self.model_config = model_config
+        self.is_last_rank = is_last_rank
 
     def _reorder_batch(self, scheduler_output: "VllmSchedulerOutput") -> int:
         """ Reorder the sheduled requests to RPA kernel friendly distribution
@@ -179,9 +180,35 @@ class PersistentBatchManager:
             num_computed_tokens = req_data.num_computed_tokens[i]
             new_block_ids = req_data.new_block_ids[i]
             resumed_from_preemption = req_data.resumed_from_preemption[i]
+            num_output_tokens = req_data.num_output_tokens[i]
 
             # Update the cached states.
             req_state.num_computed_tokens = num_computed_tokens
+            req_index = self.input_batch.req_id_to_index.get(req_id)
+
+            if not self.is_last_rank:
+                # When using PP, the scheduler sends the sampled tokens back,
+                # because there's no direct communication between the first-
+                # stage worker and the last-stage worker.
+                new_token_ids = req_data.new_token_ids[i]
+                # Add the sampled token(s) from the previous step (if any).
+                # This doesn't include "unverified" tokens like spec tokens.
+                num_new_tokens = (num_computed_tokens + len(new_token_ids) -
+                                  req_state.num_tokens)
+                if num_new_tokens == 1:
+                    req_state.output_token_ids.append(new_token_ids[-1])
+                elif num_new_tokens > 0:
+                    req_state.output_token_ids.extend(
+                        new_token_ids[-num_new_tokens:])
+            elif num_output_tokens < len(req_state.output_token_ids):
+                del req_state.output_token_ids[num_output_tokens:]
+                if req_index is not None:
+                    end_idx = (self.input_batch.num_prompt_tokens[req_index] +
+                               num_output_tokens)
+                    self.input_batch.num_tokens[req_index] = end_idx
+                    self.input_batch.num_tokens_no_spec[req_index] = end_idx
+
+            # Update the block IDs.
             if not resumed_from_preemption:
                 if new_block_ids is not None:
                     # Append the new blocks to the existing block IDs.
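The pipeline-parallel branch above reconstructs how many freshly sampled tokens to append on non-last ranks from the scheduler's computed-token count and the tokens already cached for the request. A worked example with illustrative numbers:

# A request with 10 prompt tokens and 2 previously appended output tokens, so
# req_state.num_tokens == 12; the scheduler now reports 12 computed tokens and
# ships 1 newly sampled token back to the non-last ranks.
num_computed_tokens = 12
new_token_ids = [42]
num_tokens = 12
num_new_tokens = num_computed_tokens + len(new_token_ids) - num_tokens
assert num_new_tokens == 1   # exactly one token gets appended to output_token_ids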
@@ -194,7 +221,6 @@
                 # Replace the existing block IDs with the new ones.
                 req_state.block_ids = new_block_ids
 
-            req_index = self.input_batch.req_id_to_index.get(req_id)
             if req_index is None:
                 # The request is not in the persistent batch.
                 # The request was either preempted and resumed later, or was not
@@ -209,6 +235,18 @@
                 self.input_batch.block_table.append_row(
                     new_block_ids, req_index)
 
+            # For the last rank, we don't need to update the token_ids_cpu
+            # because the sampled tokens are already cached.
+            if not self.is_last_rank:
+                start_token_index = num_computed_tokens
+                end_token_index = num_computed_tokens + len(new_token_ids)
+                self.input_batch.token_ids_cpu[
+                    req_index,
+                    start_token_index:end_token_index] = new_token_ids
+                self.input_batch.num_tokens_no_spec[
+                    req_index] = end_token_index
+                self.input_batch.num_tokens[req_index] = end_token_index
+
             # Add spec_token_ids to token_ids_cpu.
             spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(
                 req_id, ())
tpu_inference/runner/structured_decoding_manager.py

@@ -61,11 +61,10 @@ class StructuredDecodingManager:
         self.runner.require_structured_out_cpu.fill(0)
 
         sorted_struct_requests = sorted(
-            grammar_output.structured_output_request_ids.items(),
-            key=lambda item: item[1])
+            grammar_output.structured_output_request_ids)
 
         cumulative_mask_idx = 0
-        for req_id, _ in sorted_struct_requests:
+        for req_id in sorted_struct_requests:
             if req_id not in self.runner.input_batch.req_id_to_index:
                 continue
             batch_index = self.runner.input_batch.req_id_to_index[req_id]
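The simplification above drops the `.items()` iteration and the sort-by-value in favour of sorting the request ids directly. If `structured_output_request_ids` is still a mapping, sorting it yields its keys, ordered lexicographically by request id, as in this small sketch (the concrete type of the field in the new scheduler output is not shown in this diff):

structured_output_request_ids = {"req-b": 1, "req-a": 0}
assert sorted(structured_output_request_ids) == ["req-a", "req-b"]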
tpu_inference/runner/tpu_runner.py

@@ -1,6 +1,6 @@
 import copy
 import functools
-import os
+import logging
 import random
 from contextlib import nullcontext
 from dataclasses import dataclass
@@ -10,17 +10,15 @@ import jax
 import jax.numpy as jnp
 import jaxtyping
 import numpy as np
-import torch
-import vllm.envs as envs
+import vllm.envs as vllm_envs
 from flax import nnx
 from jax.experimental import mesh_utils
 from jax.sharding import NamedSharding, PartitionSpec
-from torchax.ops.mappings import j2t_dtype
 from vllm.config import VllmConfig
+from vllm.distributed import get_pp_group
 from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                           has_kv_transfer_group)
 from vllm.forward_context import set_forward_context
-from vllm.sequence import IntermediateTensors
 from vllm.tasks import SupportedTask
 from vllm.utils.math_utils import cdiv
 from vllm.v1.core.sched.output import GrammarOutput
@@ -35,6 +33,7 @@ from vllm.v1.worker.kv_connector_model_runner_mixin import \
     KVConnectorModelRunnerMixin
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
 
+import tpu_inference.envs as envs
 from tpu_inference import utils as common_utils
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
 from tpu_inference.layers.common.sharding import (MESH_AXIS_NAMES,
@@ -48,6 +47,8 @@ from tpu_inference.layers.jax.sample.sampling_metadata import \
     TPUSupportedSamplingMetadata
 from tpu_inference.logger import init_logger
 from tpu_inference.models.common.model_loader import get_model
+from tpu_inference.models.jax.jax_intermediate_tensor import \
+    JaxIntermediateTensors
 from tpu_inference.models.jax.utils.weight_utils import (
     shard_put, transfer_state_with_mappings)
 from tpu_inference.runner import utils as runner_utils
@@ -64,10 +65,12 @@ from tpu_inference.runner.structured_decoding_manager import \
     StructuredDecodingManager
 from tpu_inference.spec_decode.jax.eagle3 import Eagle3Proposer
 from tpu_inference.utils import (device_array, make_optimized_mesh,
-                                 time_function)
+                                 time_function, to_jax_dtype, to_torch_dtype)
 
 logger = init_logger(__name__)
 
+logging.getLogger("torchax.tensor").setLevel(logging.ERROR)
+
 INVALID_TOKEN_ID = -1
 # Smallest output size
 MIN_NUM_SEQS = 8
@@ -78,17 +81,6 @@ DUMMY_METADATA = AttentionMetadata(
     request_distribution=[0, 0, 0],
 )
 
-TPU_STR_DTYPE_TO_TORCH_DTYPE = {
-    "half": torch.half,
-    "bfloat16": torch.bfloat16,
-    "float": torch.float,
-    "fp8": torch.float8_e4m3fn,
-    "fp8_e4m3": torch.float8_e4m3fn,
-    "fp8_e5m2": torch.float8_e5m2,
-    "int8": torch.int8,
-    "uint8": torch.uint8,
-}
-
 
 class AsyncTPUModelRunnerOutput(AsyncModelRunnerOutput):
     """Holds asynchronous model output specifically from a TPU runner.
@@ -243,6 +235,9 @@ class TPUModelRunner(KVConnectorModelRunnerMixin, LoRAModelRunnerMixin):
         self.maybe_forbid_compile = runner_utils.ForbidCompile(
         ) if envs.VLLM_XLA_CHECK_RECOMPILATION else nullcontext()
         self.dp_size = self.vllm_config.sharding_config.total_dp_size
+        self.rank = rank
+        self.is_first_rank = is_first_rank
+        self.is_last_rank = is_last_rank
 
         self._init_random()
         self._init_mesh()
@@ -253,31 +248,21 @@ class TPUModelRunner(KVConnectorModelRunnerMixin, LoRAModelRunnerMixin):
 
         # Delegate functions to specific manager classes.
         self.compilation_manager = CompilationManager(self)
-        self.speculative_decoding_manager = SpeculativeDecodingManager(self)
-        self.structured_decoding_manager = StructuredDecodingManager(self)
+        if self.is_last_rank:
+            self.speculative_decoding_manager = SpeculativeDecodingManager(
+                self)
+            self.structured_decoding_manager = StructuredDecodingManager(self)
         self.kv_cache_manager = KVCacheManager(self)
         self.mm_manager = MultiModalManager(self)
         self.persistent_batch_manager = PersistentBatchManager(
             self.requests, self.input_batch, self.encoder_cache,
-            self.uses_mrope, self.model_config)
+            self.uses_mrope, self.model_config, self.is_last_rank)
         self.lora_utils = LoraUtils(self)
 
-        cache_config = self.cache_config
-        if cache_config.cache_dtype == "auto":
-            model_dtype = self.dtype
-            if isinstance(model_dtype, str):
-                self.kv_cache_dtype = TPU_STR_DTYPE_TO_TORCH_DTYPE[model_dtype]
-            elif isinstance(getattr(model_dtype, 'dtype', None), jnp.dtype):
-                self.kv_cache_dtype = j2t_dtype(model_dtype.dtype)
-            elif isinstance(model_dtype, torch.dtype):
-                self.kv_cache_dtype = model_dtype
-            else:
-                raise ValueError(
-                    "KV cache is unsupported for model_dtype of %s",
-                    model_dtype)
-        else:
-            self.kv_cache_dtype = TPU_STR_DTYPE_TO_TORCH_DTYPE[
-                cache_config.cache_dtype]
+        cache_dtype = self.cache_config.cache_dtype
+        if cache_dtype == "auto":
+            cache_dtype = self.dtype
+        self.kv_cache_dtype = to_torch_dtype(cache_dtype)
 
         self._pre_async_results: AsyncPreResults | None = None
         self._substitute_placeholder_token_fn = _substitute_placeholder_token
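The dtype-resolution block collapses into a single call to `to_torch_dtype` from `tpu_inference.utils` (imported above together with `to_jax_dtype`). A hedged sketch of what such a normalization helper could look like, reusing the string table this diff removes; the actual implementation lives in `tpu_inference/utils.py` and may handle additional cases such as JAX dtypes:

import torch

_STR_TO_TORCH = {
    "half": torch.half,
    "bfloat16": torch.bfloat16,
    "float": torch.float,
    "fp8": torch.float8_e4m3fn,
    "fp8_e4m3": torch.float8_e4m3fn,
    "fp8_e5m2": torch.float8_e5m2,
    "int8": torch.int8,
    "uint8": torch.uint8,
}

def to_torch_dtype(dtype) -> torch.dtype:
    # Illustrative only: pass torch dtypes through, map known strings, reject the rest.
    if isinstance(dtype, torch.dtype):
        return dtype
    if isinstance(dtype, str):
        return _STR_TO_TORCH[dtype]
    raise ValueError(f"Unsupported KV cache dtype: {dtype!r}")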
@@ -291,7 +276,7 @@ class TPUModelRunner(KVConnectorModelRunnerMixin, LoRAModelRunnerMixin):
         self.rng_key = jax.random.key(self.model_config.seed)
 
     def _init_mesh(self) -> None:
-        if os.getenv("NEW_MODEL_DESIGN", False):
+        if envs.NEW_MODEL_DESIGN:
             self.mesh = self._create_new_model_mesh()
         else:
             # NOTE(wenxindongwork): The new MoE kernel expects a 2D mesh, so we need
@@ -302,7 +287,7 @@
         logger.info(f"Init mesh | mesh={self.mesh}")
 
     def _create_new_model_mesh(self) -> jax.sharding.Mesh:
-        num_slices = int(os.environ.get('NUM_SLICES', 1))
+        num_slices = envs.NUM_SLICES
 
         logger.info(f"Creating new model mesh | devices={len(self.devices)}, "
                     f"num_slices={num_slices}")
@@ -371,7 +356,7 @@
             devices=self.devices)
 
     def _init_phased_profiling(self) -> None:
-        self.phased_profiling_dir = os.getenv("PHASED_PROFILING_DIR", "")
+        self.phased_profiling_dir = envs.PHASED_PROFILING_DIR
         self.phase_based_profiler = None
         if self.phased_profiling_dir:
             self.phase_based_profiler = runner_utils.PhasedBasedProfiler(
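Several raw `os.getenv` / `os.environ` lookups above are replaced by attributes of the new `tpu_inference.envs` module (`NEW_MODEL_DESIGN`, `NUM_SLICES`, `PHASED_PROFILING_DIR`). A hedged sketch of the typed accessor pattern this implies; the helper names and defaults below are illustrative, not taken from `tpu_inference/envs.py`:

import os

def _bool_env(name: str, default: bool = False) -> bool:
    # Treat "1"/"true"/"yes" (any case) as enabled.
    return os.environ.get(name, str(default)).lower() in ("1", "true", "yes")

def _int_env(name: str, default: int) -> int:
    return int(os.environ.get(name, default))

NEW_MODEL_DESIGN = _bool_env("NEW_MODEL_DESIGN")
NUM_SLICES = _int_env("NUM_SLICES", 1)
PHASED_PROFILING_DIR = os.environ.get("PHASED_PROFILING_DIR", "")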
@@ -413,7 +398,7 @@
             min_token_size=max(16, self.dp_size),
             max_token_size=scheduler_config.max_num_batched_tokens *
             self.dp_size,
-            padding_gap=envs.VLLM_TPU_BUCKET_PADDING_GAP)
+            padding_gap=vllm_envs.VLLM_TPU_BUCKET_PADDING_GAP)
         self.num_tokens_paddings_per_dp = [
             padding // self.dp_size for padding in self.num_tokens_paddings
         ]
@@ -555,12 +540,12 @@
     def execute_model(
         self,
         scheduler_output: "VllmSchedulerOutput",
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-    ) -> ModelRunnerOutput | None:
+        intermediate_tensors: Optional[JaxIntermediateTensors] = None,
+    ) -> ModelRunnerOutput | JaxIntermediateTensors | None:
         if self.execute_model_state is not None:
             raise RuntimeError("State error: sample_tokens() must be called "
                                "after execute_model() returns None.")
-        _, output = self._execute_model(scheduler_output)
+        _, output = self._execute_model(scheduler_output, intermediate_tensors)
         return output
 
     def sample_tokens(
@@ -686,7 +671,9 @@
     def _execute_model(
         self,
         scheduler_output: "VllmSchedulerOutput",
-    ) -> tuple[AttentionMetadata, ModelRunnerOutput | None]:
+        intermediate_tensors: Optional[JaxIntermediateTensors] = None,
+    ) -> tuple[AttentionMetadata, JaxIntermediateTensors | ModelRunnerOutput
+               | None]:
         self.persistent_batch_manager.update_states(
             scheduler_output, self.get_mrope_input_positions_fn)
         if not scheduler_output.total_num_scheduled_tokens:
@@ -764,7 +751,6 @@ class TPUModelRunner(KVConnectorModelRunnerMixin, LoRAModelRunnerMixin):
                 scheduler_output) as kv_connector_output:
             # NOTE(Wenlong): It takes both `input_ids` and `inputs_embeds`,
             # but one of them would be `None`
-
            (self.kv_caches, hidden_states,
             aux_hidden_states) = self.model_fn(
                 self.state,
@@ -775,8 +761,14 @@ class TPUModelRunner(KVConnectorModelRunnerMixin, LoRAModelRunnerMixin):
                 input_positions,
                 tuple(self.layer_name_to_kvcache_index.items()),
                 lora_metadata,
+                intermediate_tensors,
+                self.is_first_rank,
+                self.is_last_rank,
             )
-
+        if not get_pp_group().is_last_rank:
+            assert isinstance(hidden_states, JaxIntermediateTensors)
+            hidden_states.kv_connector_output = kv_connector_output
+            return attn_metadata, hidden_states
         hidden_states = self._select_from_array_fn(hidden_states,
                                                    logits_indices)
         logits = self.compute_logits_fn(
@@ -822,18 +814,31 @@
 
         tpu_sampling_metadata = TPUSupportedSamplingMetadata.from_input_batch(
             self.mesh, self.input_batch, padded_num_reqs, sharding=sharding)
+
+        # TODO(pooyam): Should we move this to `_prepare_inputs`?
+        if tpu_sampling_metadata.do_sampling:
+            self.rng_params_for_sampling, step_rng = jax.random.split(
+                self.rng_params_for_sampling)
+        else:
+            step_rng = self.rng_params_for_sampling
+
         if spec_decode_metadata is None:
             next_tokens = sample(
-                self.rng_params_for_sampling,
+                step_rng,
                 self.mesh,
                 logits,
                 tpu_sampling_metadata,
             )
         else:
+            if tpu_sampling_metadata.do_sampling:
+                bonus_rng, rejection_rng = jax.random.split(step_rng)
+            else:
+                bonus_rng = step_rng
+                rejection_rng = step_rng
             bonus_logits = self._select_from_array_fn(
                 logits, spec_decode_metadata.bonus_logits_indices)
             bonus_token_ids = sample(
-                self.rng_params_for_sampling,
+                bonus_rng,
                 self.mesh,
                 bonus_logits,
                 tpu_sampling_metadata,
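The sampling path above now advances the persistent RNG key only when sampling actually happens, and derives separate keys for bonus-token sampling and rejection sampling in the speculative-decoding branch. A minimal sketch of that key-splitting discipline in JAX:

import jax

rng = jax.random.key(0)                        # persistent per-runner key
rng, step_rng = jax.random.split(rng)          # advance once per sampling step
bonus_rng, rejection_rng = jax.random.split(step_rng)   # independent keys within the step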
@@ -847,7 +852,7 @@
                 target_logits=target_logits,
                 bonus_token_ids=bonus_token_ids,
                 sampling_metadata=tpu_sampling_metadata,
-                key=self.rng_params_for_sampling,
+                key=rejection_rng,
             )
 
         if tpu_sampling_metadata.logprobs:
@@ -1332,7 +1337,14 @@
         _request_distribution = []
         for dp_rank in range(dp_size):
             _num_reqs = num_req_per_dp_rank[dp_rank]
-            _request_distribution.append([0, 0, _num_reqs])
+            # The batch has been reordered by _reorder_batch so decode requests come first
+            # Count decode requests (those with num_scheduled_tokens == 1) in this DP rank
+            num_decode_in_dp_rank = 0
+            for req_id in req_ids_dp[dp_rank]:
+                if scheduler_output.num_scheduled_tokens[req_id] == 1:
+                    num_decode_in_dp_rank += 1
+            _request_distribution.append(
+                [num_decode_in_dp_rank, num_decode_in_dp_rank, _num_reqs])
         request_distribution = np.array(_request_distribution).ravel()
 
         use_spec_decode = len(
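The request distribution above now records how many decode requests (exactly one scheduled token) lead each DP rank's reordered batch, instead of a hard-coded zero. A worked example with illustrative request ids:

num_scheduled_tokens = {"r0": 1, "r1": 1, "r2": 17}   # two decodes, one prefill
req_ids_dp = {0: ["r0", "r1", "r2"]}                  # a single DP rank
num_decode = sum(1 for r in req_ids_dp[0] if num_scheduled_tokens[r] == 1)
_request_distribution = [[num_decode, num_decode, len(req_ids_dp[0])]]
assert _request_distribution == [[2, 2, 3]]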
@@ -1391,7 +1403,7 @@
                 block_tables[
                     req_offset:req_offset + _num_reqs, :self.
                     max_num_blocks_per_req] = self.input_batch.block_table[
-                        0].get_cpu_tensor()[req_indices_dp[dp_rank]]
+                        kv_cache_gid].get_cpu_tensor()[req_indices_dp[dp_rank]]
             # Convert block_tables to 1D on cpu.
             block_tables = block_tables.reshape(-1)
             block_tables = device_array(
@@ -1706,3 +1718,34 @@
                 mappings=mappings,
                 transpose_keys=transpose_keys,
                 shard=shard)
+
+    def get_intermediate_tensor_spec(self, num_tokens: int):
+        jax_dtype = to_jax_dtype(self.dtype)
+        num_padded_tokens = runner_utils.get_padded_token_len(
+            self.num_tokens_paddings, num_tokens)
+        sharding = NamedSharding(self.mesh, PartitionSpec())
+        hidden_size = self.model_config.get_hidden_size()
+        spec = jax.ShapeDtypeStruct(shape=(num_padded_tokens, hidden_size),
+                                    dtype=jax_dtype,
+                                    sharding=sharding)
+        tensor_spec = {"hidden_states": spec, "residual": spec}
+        return tensor_spec
+
+    def get_uuid_for_jax_transfer(self,
+                                  scheduler_output: "VllmSchedulerOutput",
+                                  rank: int, step: int) -> int:
+        '''
+        Get a uuid for jax.transfer, here we use the hash of
+        scheduler_output + counter_step + sender's rank
+        '''
+        scheduler_output_str = ""
+        if not scheduler_output.num_scheduled_tokens:
+            scheduler_output_str = "empty_batch"
+        else:
+            scheduler_output_str = str(
+                sorted(scheduler_output.num_scheduled_tokens.items()))
+        unique_str = f'{scheduler_output_str} {step} {rank}'
+        import hashlib
+        hasher = hashlib.sha1()
+        hasher.update(unique_str.encode('utf-8'))
+        return int.from_bytes(hasher.digest()[:8], 'big')
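`get_uuid_for_jax_transfer` derives a deterministic 64-bit id from a summary of the scheduler output, the step counter and the sender's rank, so both pipeline stages can agree on a transfer id without extra communication. A small sketch of the same derivation with illustrative inputs:

import hashlib

unique_str = "[('req-0', 1), ('req-1', 17)] 42 0"     # (scheduler summary, step, rank)
digest = hashlib.sha1(unique_str.encode("utf-8")).digest()
transfer_uuid = int.from_bytes(digest[:8], "big")      # first 8 bytes -> 64-bit int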
tpu_inference/runner/utils.py

@@ -15,6 +15,7 @@ import jax
 from jax._src.interpreters import pxla
 from vllm.v1.core.sched.output import SchedulerOutput as VllmSchedulerOutput
 
+from tpu_inference import envs
 from tpu_inference.logger import init_logger
 from tpu_inference.runner.input_batch import InputBatch
 
@@ -306,8 +307,7 @@ class PhasedBasedProfiler:
             InferencePhase.BALANCED: False
         }
         self.default_profiling_options = jax.profiler.ProfileOptions()
-        self.default_profiling_options.python_tracer_level = os.getenv(
-            "PYTHON_TRACER_LEVEL", 0)
+        self.default_profiling_options.python_tracer_level = envs.PYTHON_TRACER_LEVEL
 
         self.current_phase: str = ""