tpu-inference 0.11.1.dev202511220812__py3-none-any.whl → 0.12.0.dev20251213__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/kernels/fused_moe_v1_test.py +303 -34
- tests/kernels/mla_v1_test.py +129 -41
- tests/kernels/quantized_matmul_kernel_test.py +2 -34
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +3 -1
- tests/kernels/ragged_paged_attention_kernel_v3_test.py +3 -1
- tests/lora/test_layers.py +4 -1
- tests/lora/test_lora_perf.py +53 -0
- tests/test_envs.py +110 -12
- tests/test_quantization.py +3 -0
- tests/test_utils.py +1 -2
- tpu_inference/distributed/tpu_connector.py +1 -1
- tpu_inference/envs.py +92 -8
- tpu_inference/executors/ray_distributed_executor.py +5 -1
- tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
- tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
- tpu_inference/kernels/fused_moe/v1/kernel.py +712 -143
- tpu_inference/kernels/mla/v1/kernel.py +98 -120
- tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
- tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +82 -32
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +146 -85
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v3/util.py +2 -1
- tpu_inference/layers/common/attention_interface.py +7 -1
- tpu_inference/layers/common/sharding.py +11 -7
- tpu_inference/layers/jax/attention/deepseek_v3_attention.py +232 -64
- tpu_inference/layers/jax/attention/gpt_oss_attention.py +5 -5
- tpu_inference/layers/vllm/fused_moe.py +170 -208
- tpu_inference/layers/vllm/linear_common.py +43 -21
- tpu_inference/layers/vllm/quantization/common.py +11 -6
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +4 -3
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +74 -65
- tpu_inference/layers/vllm/quantization/mxfp4.py +140 -94
- tpu_inference/layers/vllm/quantization/unquantized.py +103 -80
- tpu_inference/models/common/model_loader.py +78 -22
- tpu_inference/models/jax/deepseek_v3.py +185 -64
- tpu_inference/models/jax/gpt_oss.py +3 -3
- tpu_inference/models/jax/llama_eagle3.py +4 -5
- tpu_inference/models/jax/qwen2_5_vl.py +161 -47
- tpu_inference/models/jax/utils/quantization/quantization_utils.py +7 -8
- tpu_inference/models/jax/utils/weight_utils.py +203 -155
- tpu_inference/models/vllm/vllm_model_wrapper.py +11 -5
- tpu_inference/platforms/tpu_platform.py +29 -48
- tpu_inference/runner/compilation_manager.py +112 -46
- tpu_inference/runner/kv_cache.py +40 -20
- tpu_inference/runner/kv_cache_manager.py +40 -31
- tpu_inference/runner/persistent_batch_manager.py +40 -2
- tpu_inference/runner/structured_decoding_manager.py +2 -3
- tpu_inference/runner/tpu_runner.py +94 -51
- tpu_inference/runner/utils.py +2 -2
- tpu_inference/spec_decode/jax/eagle3.py +71 -22
- tpu_inference/utils.py +41 -14
- tpu_inference/worker/tpu_worker.py +43 -45
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/METADATA +8 -9
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/RECORD +59 -58
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/WHEEL +0 -0
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/top_level.txt +0 -0
tpu_inference/spec_decode/jax/eagle3.py CHANGED

@@ -6,13 +6,19 @@ from typing import Any, Optional
 import jax
 import jax.numpy as jnp
 import numpy as np
+from flax import nnx
+from jax import lax
+from jax.sharding import NamedSharding, PartitionSpec
 from vllm.config import VllmConfig

 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.logger import init_logger
 from tpu_inference.models.common.model_loader import get_model
 from tpu_inference.runner import utils as runner_utils
 from tpu_inference.utils import device_array

+logger = init_logger(__name__)
+

 class Eagle3Proposer:
     """A proposer for speculative decoding using the Eagle3 method.
@@ -51,9 +57,22 @@ class Eagle3Proposer:
         """Loads the draft model."""
         self.model_fn, self.compute_logits_fn, self.combine_hidden_states_fn, _, self.state, _, _ = get_model(
             self.vllm_config, self.rng_key, self.mesh, is_draft_model=True)
-
-
-
+
+        draft_embed_tokens = getattr(self.state.model, 'embed_tokens', None)
+        if draft_embed_tokens is None or ~jnp.any(
+                draft_embed_tokens.embedding):
+            logger.info(
+                "Draft model does not have embedding. Setting draft model's embed_tokens to target model's embed"
+            )
+            self.state.model.embed_tokens = target_model.model.embed
+        elif jnp.array_equal(draft_embed_tokens.embedding,
+                             target_model.model.embed.embedding):
+            logger.info(
+                "Draft model's embed_tokens is identical to target model's embed. Sharing the embedding."
+            )
+            self.state.model.embed_tokens = target_model.model.embed
+        else:
+            logger.info("Draft model has its own embed_tokens.")

     @functools.partial(jax.jit, static_argnums=(0, ))
     def _prepare_input_ids(
@@ -111,6 +130,17 @@ class Eagle3Proposer:
             max_num_blocks_per_req)
         new_block_tables = jnp.where(expanded_exceeds_mask, -1, block_tables)

+        positions = lax.with_sharding_constraint(
+            positions, NamedSharding(self.mesh, PartitionSpec(None, )))
+        clamped_positions = lax.with_sharding_constraint(
+            clamped_positions, NamedSharding(self.mesh, PartitionSpec(None, )))
+        new_seq_lens = lax.with_sharding_constraint(
+            new_seq_lens, NamedSharding(self.mesh, PartitionSpec(None, )))
+        query_start_loc = lax.with_sharding_constraint(
+            query_start_loc, NamedSharding(self.mesh, PartitionSpec()))
+        new_block_tables = lax.with_sharding_constraint(
+            new_block_tables, NamedSharding(self.mesh, PartitionSpec(None, )))
+
         return positions, clamped_positions, new_seq_lens, query_start_loc, new_block_tables

     @functools.partial(jax.jit, static_argnums=(0, ))
@@ -122,6 +152,7 @@ class Eagle3Proposer:
     @functools.partial(jax.jit, static_argnums=(0, ))
     def _prepare_hidden_states_and_input_ids(
         self,
+        state: nnx.State,
         aux_hidden_states: tuple[jax.Array, ...],
         query_start_loc: jax.Array,
         target_token_ids: jax.Array,
@@ -130,7 +161,7 @@ class Eagle3Proposer:
     ) -> tuple[jax.Array, jax.Array, jax.Array]:
         target_hidden_states = jnp.concatenate(aux_hidden_states, axis=-1)
         target_hidden_states = self.combine_hidden_states_fn(
-
+            state, target_hidden_states)

         input_ids, last_token_indices = self._prepare_input_ids(
             query_start_loc, target_token_ids, next_token_ids, num_reqs)
@@ -177,8 +208,8 @@ class Eagle3Proposer:
             block_tables=device_array(
                 self.mesh, block_tables))
         target_hidden_states, input_ids, last_token_indices = self._prepare_hidden_states_and_input_ids(
-            aux_hidden_states, attn_metadata.query_start_loc,
-            next_token_ids, num_reqs)
+            self.state, aux_hidden_states, attn_metadata.query_start_loc,
+            input_ids, next_token_ids, num_reqs)
         return target_hidden_states, input_ids, last_token_indices, attn_metadata

     # Host copies from the metadata prepared by the runner.
@@ -242,12 +273,13 @@ class Eagle3Proposer:

         attn_metadata = replace(attn_metadata, block_tables=block_tables)
         return self._filter_token_and_prepare_initial_inputs(
-            token_indices, query_start_loc, seq_lens, input_ids,
+            self.state, token_indices, query_start_loc, seq_lens, input_ids,
             aux_hidden_states, attn_metadata, next_token_ids, num_reqs)

     @functools.partial(jax.jit, static_argnums=(0, ))
     def _filter_token_and_prepare_initial_inputs(
         self,
+        state: nnx.State,
         token_indices: jax.Array,
         query_start_loc: jax.Array,
         seq_lens: jax.Array,
@@ -275,35 +307,51 @@ class Eagle3Proposer:
         )

         target_hidden_states, input_ids, last_token_indices = self._prepare_hidden_states_and_input_ids(
-            [h[token_indices] for h in aux_hidden_states],
-            target_token_ids, next_token_ids, num_reqs)
+            state, [h[token_indices] for h in aux_hidden_states],
+            query_start_loc, target_token_ids, next_token_ids, num_reqs)

         return target_hidden_states, input_ids, last_token_indices, attn_metadata

     @functools.partial(jax.jit, static_argnums=(0, ))
     def _select_draft_token_ids(
         self,
+        state: nnx.State,
         hidden_states: jax.Array,
         last_token_indices: jax.Array,
     ) -> jax.Array:
         sample_hidden_states = hidden_states[last_token_indices]
-
+        sample_hidden_states = lax.with_sharding_constraint(
+            sample_hidden_states,
+            NamedSharding(self.mesh, PartitionSpec(None, None)))
+        return self._get_draft_token_ids(state, sample_hidden_states)

     @functools.partial(jax.jit, static_argnums=(0, ))
-    def _get_draft_token_ids(self,
+    def _get_draft_token_ids(self, state: nnx.State,
+                             hidden_states: jax.Array) -> jax.Array:
         lora_metadata = None
-        logits = self.compute_logits_fn(
-
-        return
+        logits = self.compute_logits_fn(state, hidden_states, lora_metadata)
+        draft_token_ids = jnp.argmax(logits, axis=-1)
+        return lax.with_sharding_constraint(
+            draft_token_ids, NamedSharding(self.mesh, PartitionSpec()))

     @functools.partial(jax.jit, static_argnums=(0, ))
     def _select_inputs_for_loop_speculation(
-            self, positions: jax.Array, residual: jax.Array,
+            self, state: nnx.State, positions: jax.Array, residual: jax.Array,
             hidden_states: jax.Array,
             last_token_indices: jax.Array) -> tuple[jax.Array, jax.Array]:
-
-
-
+        positions = positions[last_token_indices]
+        residual = residual[last_token_indices]
+        draft_token_ids = self._select_draft_token_ids(state, hidden_states,
+                                                       last_token_indices)
+
+        positions = lax.with_sharding_constraint(
+            positions, NamedSharding(self.mesh, PartitionSpec(None, )))
+        residual = lax.with_sharding_constraint(
+            residual, NamedSharding(self.mesh, PartitionSpec(None, None)))
+        draft_token_ids = lax.with_sharding_constraint(
+            draft_token_ids, NamedSharding(self.mesh, PartitionSpec()))
+
+        return positions, residual, draft_token_ids

     def propose(
         self,
@@ -330,11 +378,11 @@ class Eagle3Proposer:

         if self.num_speculative_tokens == 1:
             return kv_caches, self._select_draft_token_ids(
-                hidden_states, last_token_indices)
+                self.state, hidden_states, last_token_indices)

         positions, hidden_states, draft_token_ids = self._select_inputs_for_loop_speculation(
-            attn_metadata.input_positions, residual[0],
-            last_token_indices)
+            self.state, attn_metadata.input_positions, residual[0],
+            hidden_states, last_token_indices)

         draft_token_ids_list = [draft_token_ids]

@@ -359,7 +407,8 @@ class Eagle3Proposer:
             attn_metadata,
         )
         hidden_states = residual[0]
-        draft_token_ids = self._get_draft_token_ids(
+        draft_token_ids = self._get_draft_token_ids(
+            self.state, new_hidden_states)
         draft_token_ids_list.append(draft_token_ids)

         # [batch_size, num_speculative_tokens]
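
The eagle3.py changes above follow one pattern throughout: the draft model's `nnx.State` is passed explicitly into each `jax.jit`-compiled helper, and intermediate arrays are pinned with `lax.with_sharding_constraint`. The sketch below is a minimal, self-contained illustration of that pattern; `TinyProposer`, its method body, and the toy shapes are hypothetical stand-ins, not code from the package.

```python
import functools

import jax
import jax.numpy as jnp
from jax import lax
from jax.sharding import Mesh, NamedSharding, PartitionSpec


class TinyProposer:
    """Hypothetical stand-in for Eagle3Proposer; not part of tpu-inference."""

    def __init__(self, mesh: Mesh):
        self.mesh = mesh

    @functools.partial(jax.jit, static_argnums=(0, ))
    def _select_draft_token_ids(self, state: jax.Array,
                                hidden_states: jax.Array,
                                last_token_indices: jax.Array) -> jax.Array:
        # Gather the rows for the last token of each request, then pin the
        # result's sharding, mirroring the with_sharding_constraint calls the
        # diff adds around every intermediate.
        sampled = hidden_states[last_token_indices]
        sampled = lax.with_sharding_constraint(
            sampled, NamedSharding(self.mesh, PartitionSpec(None, None)))
        logits = sampled @ state  # stand-in for compute_logits_fn(state, ...)
        draft_token_ids = jnp.argmax(logits, axis=-1)
        return lax.with_sharding_constraint(
            draft_token_ids, NamedSharding(self.mesh, PartitionSpec()))


mesh = Mesh(jax.devices(), ("model", ))
proposer = TinyProposer(mesh)
tokens = proposer._select_draft_token_ids(
    jnp.ones((16, 4)),     # toy "state": a 16x4 projection matrix
    jnp.ones((8, 16)),     # toy hidden states for 8 tokens
    jnp.array([1, 4, 7]))  # last-token index per request
print(tokens.shape)        # (3,)
```
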
tpu_inference/utils.py CHANGED

@@ -8,11 +8,14 @@ from typing import Any, Callable, List, Tuple
 import jax
 import jax.numpy as jnp
 import numpy as np
+import torch
 from jax._src import dtypes
 from jax._src import mesh as mesh_lib
 from jax._src import xla_bridge as xb
 from jax._src.lib import xla_client as xc
+from jax._src.numpy.scalar_types import _ScalarMeta
 from jax.sharding import Mesh, NamedSharding, PartitionSpec
+from torchax.ops.mappings import j2t_dtype, t2j_dtype
 from vllm import envs as vllm_envs
 from vllm import utils

@@ -23,21 +26,44 @@ GBYTES = 1024 * 1024 * 1024
 TPU_HEAD_SIZE_ALIGNMENT = 128
 TPU_SECOND_LAST_MINOR = 8

-#
-
-
-
-    "
-    "fp8": jnp.float8_e4m3fn,
-    "fp8_e4m3": jnp.float8_e4m3,
-    "fp8_e5m2": jnp.float8_e5m2,
-    "int8": jnp.int8,
+# Map vllm dtype string that doesn't exactly match jax dtype string name.
+_VLLM_DTYPE_STR_TO_JAX_DTYPE = {
+    "fp8": jnp.float8_e4m3fn.dtype,
+    "fp8_e4m3": jnp.float8_e4m3fn.dtype,
+    "fp8_e5m2": jnp.float8_e5m2.dtype,
 }

+
+def to_jax_dtype(dtype: str | jnp.dtype | torch.dtype) -> jnp.dtype:
+    if isinstance(dtype, str):
+        if dict_dtype := _VLLM_DTYPE_STR_TO_JAX_DTYPE.get(dtype, None):
+            return dict_dtype
+        return jnp.dtype(dtype)
+    elif isinstance(dtype, torch.dtype):
+        return t2j_dtype(dtype)
+    elif isinstance(dtype, jnp.dtype):
+        return dtype
+    elif isinstance(dtype, _ScalarMeta):
+        return dtype.dtype
+    else:
+        raise ValueError(f"Argument is unsupported data type {type(dtype)}")
+
+
+def to_torch_dtype(dtype: str | jnp.dtype | torch.dtype) -> torch.dtype:
+    # Use jax dtype as an intermediate dtype which we'll be used to convert it
+    # into torch dtype.
+    dtype = to_jax_dtype(dtype)
+    return j2t_dtype(dtype)
+
+
 _megacore = False
 logger = init_logger(__name__)


+def align_to(unpadded_dim, pad_multiple):
+    return (unpadded_dim + pad_multiple - 1) // pad_multiple * pad_multiple
+
+
 def enable_megacore() -> None:
     global _megacore
     _megacore = True
@@ -164,7 +190,8 @@ def get_padded_num_heads(num_heads: int, sharding_size: int) -> int:


 def get_dtype_packing(dtype):
-    bits = dtypes.bit_width(dtype)
+    bits = (dtypes.bit_width(dtype)
+            if hasattr(dtypes, "bit_width") else dtypes.itemsize_bits(dtype))
     return 32 // bits


@@ -249,11 +276,11 @@ def device_array(mesh: Mesh, *args, sharding=None, **kwargs) -> jax.Array:

 def get_hash_fn_by_name(hash_fn_name: str) -> Callable[[Any], bytes]:
     """
-    A wrapper function of vllm.utils.get_hash_fn_by_name to support builtin
+    A wrapper function of vllm.utils.hashing.get_hash_fn_by_name to support builtin
     """
     if hash_fn_name == "builtin":
         return hash
-    return utils.get_hash_fn_by_name(hash_fn_name)
+    return utils.hashing.get_hash_fn_by_name(hash_fn_name)


 def quantize_kv(key: jax.Array, value: jax.Array,
@@ -295,8 +322,8 @@ def get_jax_dtype_from_str_dtype(str_dtype: str) -> jnp.dtype:
     Returns:
         jnp.dtype: The JAX dtype.
     """
-
-    return
+    # TODO(kyuyeunk): Replace all reference of this function into TpuDtype.
+    return to_jax_dtype(str_dtype)


 def time_function(func):
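
For context on the utils.py hunks above, the sketch below shows how the new dtype helpers and `align_to` would behave; the expected outputs in the comments are inferred from the diff rather than verified against the released wheel.

```python
import jax.numpy as jnp
import torch

from tpu_inference.utils import align_to, to_jax_dtype, to_torch_dtype

print(to_jax_dtype("fp8"))          # float8_e4m3fn, via the vLLM string map
print(to_jax_dtype("bfloat16"))     # bfloat16, falls through to jnp.dtype()
print(to_jax_dtype(torch.float16))  # float16, converted with torchax t2j_dtype
print(to_torch_dtype("float32"))    # torch.float32, via the jax intermediate

print(align_to(100, 128))           # 128: round 100 up to the next multiple of 128
```
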
tpu_inference/worker/tpu_worker.py CHANGED

@@ -6,7 +6,6 @@ from dataclasses import dataclass, field
 from typing import Callable, Dict, Optional, Tuple

 import jax
-import jax.numpy as jnp
 import jaxlib
 import jaxtyping
 import vllm.envs as vllm_envs
@@ -19,7 +18,8 @@ from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
 from vllm.lora.request import LoRARequest
 from vllm.tasks import SupportedTask
 from vllm.v1 import utils as vllm_utils
-from vllm.v1.core.kv_cache_utils import get_num_blocks,
+from vllm.v1.core.kv_cache_utils import (get_kv_cache_groups, get_num_blocks,
+                                         get_uniform_page_size)
 from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
@@ -32,17 +32,11 @@ from tpu_inference.layers.common.sharding import ShardingConfigManager
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.jax_intermediate_tensor import \
     JaxIntermediateTensors
-from tpu_inference.runner.kv_cache import
+from tpu_inference.runner.kv_cache import get_attention_page_size_bytes
 from tpu_inference.runner.tpu_runner import TPUModelRunner

 logger = init_logger(__name__)

-_DTYPE: dict[str, jnp.dtype] = {
-    "bfloat16": jnp.bfloat16,
-    "float": jnp.float32,
-    "float32": jnp.float32,
-}
-

 @dataclass
 class PPConfig:
@@ -77,21 +71,6 @@ class TPUWorker:
         ip: str = "localhost",
         prev_worker_ip: str = "localhost",
     ):
-        # If we use vLLM's model implementation in PyTorch, we should set it
-        # with torch version of the dtype.
-        impl = envs.MODEL_IMPL_TYPE
-        if impl != "vllm":  # vllm-pytorch implementation does not need this conversion
-
-            # NOTE(wenlong): because sometimes mm needs to use torch for preprocessing
-            if not isinstance(vllm_config.model_config.dtype, str):
-                logger.warning(
-                    "The model dtype is not properly set for JAX backend. "
-                    "Overwriting it to jnp.bfloat16")
-                vllm_config.model_config.dtype = jnp.bfloat16
-            else:
-                vllm_config.model_config.dtype = _DTYPE.get(
-                    vllm_config.model_config.dtype, jnp.bfloat16)
-
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
         self.parallel_config = vllm_config.parallel_config
@@ -108,7 +87,7 @@ class TPUWorker:

         if self.model_config.trust_remote_code:
             # note: lazy import to avoid importing torch before initializing
-            from vllm.utils import init_cached_hf_modules
+            from vllm.utils.import_utils import init_cached_hf_modules

             init_cached_hf_modules()

@@ -250,11 +229,20 @@ class TPUWorker:
             need_pp=self.parallel_config.pipeline_parallel_size > 1)

         ensure_kv_transfer_initialized(self.vllm_config)
-
-
-
+
+        is_first_rank = True
+        is_last_rank = True
+        if self.parallel_config.pipeline_parallel_size > 1:
+            is_first_rank = self.rank == 0
+            is_last_rank = self.rank == self.pp_config.pp_world_size - 1
+
+        self.model_runner = TPUModelRunner(self.vllm_config, self.devices,
+                                           self.rank, is_first_rank,
+                                           is_last_rank)
         logger.info(f"Init worker | "
                     f"rank={self.rank} | "
+                    f"is_first_rank={is_first_rank} | "
+                    f"is_last_rank={is_last_rank} | "
                     f"node_id={get_node_id()} | "
                     f"is_driver_worker={self.is_driver_worker} | "
                     f"hbm={utils.hbm_usage_gb(self.devices)}GiB")
@@ -357,7 +345,7 @@ class TPUWorker:
         if is_start:
             options = jax.profiler.ProfileOptions()
             # default: https://docs.jax.dev/en/latest/profiling.html#general-options
-            options.python_tracer_level =
+            options.python_tracer_level = envs.PYTHON_TRACER_LEVEL
             options.host_tracer_level = os.getenv("HOST_TRACER_LEVEL", 1)
             jax.profiler.start_trace(self.profile_dir,
                                      profiler_options=options)
@@ -395,32 +383,37 @@ class TPUWorker:
         # responsible for this translation. When vLLM can be modified, this
         # method should be changed to return `dict[str, AbstractKVCacheSpec]`,
         # and the vLLM side should be updated to handle the translation.
-
+        kv_cache_spec = self.model_runner.get_kv_cache_spec()

-        if len(
-            return
+        if len(kv_cache_spec) == 0:
+            return kv_cache_spec

         # TODO(kyuyeunk): Instead of checking page_size_bytes here, introduce
         # feature that allows overriding page_size_bytes of KVCacheSpec.
-        vllm_page_size_bytes = get_uniform_page_size(
-
-
+        vllm_page_size_bytes = get_uniform_page_size(
+            list(kv_cache_spec.values()))
+        attention_page_size_bytes = get_attention_page_size_bytes(
+            self.model_runner.mesh, kv_cache_spec)

-        if vllm_page_size_bytes !=
+        if vllm_page_size_bytes != attention_page_size_bytes:
             logger.info(
-                f"
-                f"
-                f"
-                f"
-
+                f"Page size calculated by vLLM ({vllm_page_size_bytes} Bytes) "
+                f"does not match with actual page size used by the kernel "
+                f"({attention_page_size_bytes} Bytes). Recalculating number of "
+                f"KV blocks using actual page size.")
+
+            kv_cache_groups = get_kv_cache_groups(self.vllm_config,
+                                                  kv_cache_spec)
+            group_size = max(
+                len(group.layer_names) for group in kv_cache_groups)
             available_memory = self.determine_available_memory()
-            num_blocks = get_num_blocks(self.vllm_config,
-                                        available_memory,
-
+            num_blocks = get_num_blocks(self.vllm_config, group_size,
+                                        available_memory,
+                                        attention_page_size_bytes)
             cache_config = self.vllm_config.cache_config
             cache_config.num_gpu_blocks_override = num_blocks

-        return
+        return kv_cache_spec

     def initialize_from_config(
         self,
@@ -455,3 +448,8 @@ class TPUWorker:

     def shutdown(self) -> None:
         return
+
+    # Ray executor do not need handshake metadata
+    # as we pass the kv_parameters through proxy server
+    def get_kv_connector_handshake_metadata(self) -> None:
+        pass
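
The tpu_worker.py hunk above recalculates the KV block count whenever the kernel's attention page size differs from the page size vLLM derived. The sketch below is only a back-of-the-envelope model of that arithmetic (it is not vLLM's `get_num_blocks`): the block count is roughly the free HBM divided by the per-block footprint, which grows with the page size actually used.

```python
def estimate_num_blocks(available_memory_bytes: int, group_size: int,
                        page_size_bytes: int) -> int:
    """Rough estimate only; a hypothetical stand-in for vLLM's get_num_blocks."""
    # Every block holds one page per layer in the largest KV cache group, so
    # a bigger kernel page size means fewer blocks fit in the same memory.
    per_block_bytes = group_size * page_size_bytes
    return available_memory_bytes // per_block_bytes


# e.g. 8 GiB of free HBM, a 32-layer attention group, 2 MiB kernel pages:
print(estimate_num_blocks(8 * 1024**3, 32, 2 * 1024**2))  # 128
```
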
{tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tpu_inference
-Version: 0.11.1.dev202511220812
+Version: 0.12.0.dev20251213
 Author: tpu_inference Contributors
 Classifier: Development Status :: 3 - Alpha
 Classifier: Intended Audience :: Developers
@@ -14,7 +14,7 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: tpu-info==0.
+Requires-Dist: tpu-info==0.7.1
 Requires-Dist: yapf==0.43.0
 Requires-Dist: pytest
 Requires-Dist: pytest-mock
@@ -25,12 +25,13 @@ Requires-Dist: jax[tpu]==0.8.0
 Requires-Dist: jaxlib==0.8.0
 Requires-Dist: jaxtyping
 Requires-Dist: flax==0.11.1
-Requires-Dist: torchax==0.0.
+Requires-Dist: torchax==0.0.10
 Requires-Dist: qwix==0.1.1
 Requires-Dist: torchvision==0.24.0
 Requires-Dist: pathwaysutils
 Requires-Dist: parameterized
 Requires-Dist: numba==0.62.1
+Requires-Dist: runai-model-streamer[gcs,s3]==0.15.0
 Dynamic: author
 Dynamic: classifier
 Dynamic: description
@@ -52,14 +53,12 @@ Dynamic: requires-python

 ---

-_Upcoming Events_ 🔥
-
-- Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) in San Francisco!
-- Join us at [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco!
-- Join us at [JAX DevLab on November 18th](https://rsvp.withgoogle.com/events/devlab-fall-2025) in Sunnyvale!
-
 _Latest News_ 🔥

+- [Pytorch Conference](https://pytorchconference.sched.com/event/27QCh/sponsored-session-everything-everywhere-all-at-once-vllm-hardware-optionality-with-spotify-and-google-brittany-rockwell-google-shireen-kheradpey-spotify) Learn how Spotify uses vLLM with both GPUs and TPUs to drive down costs and improve user experience.
+- Check back soon for a recording of our session at [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco!
+- Check back soon for a recording of our session at [JAX DevLab on November 18th](https://rsvp.withgoogle.com/events/devlab-fall-2025) in Sunnyvale!
+
 - [2025/10] [vLLM TPU: A New Unified Backend Supporting PyTorch and JAX on TPU](https://blog.vllm.ai/2025/10/16/vllm-tpu.html)

 <details>