tpu-inference 0.11.1.dev202511130813__py3-none-any.whl → 0.11.1.dev202511220812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tpu-inference might be problematic.

Files changed (58)
  1. tests/lora/test_layers.py +0 -6
  2. tests/lora/utils.py +0 -8
  3. tests/test_envs.py +182 -0
  4. tests/test_utils.py +23 -14
  5. tpu_inference/__init__.py +22 -3
  6. tpu_inference/core/core_tpu.py +17 -9
  7. tpu_inference/core/disagg_utils.py +6 -8
  8. tpu_inference/distributed/tpu_connector.py +2 -3
  9. tpu_inference/distributed/utils.py +3 -2
  10. tpu_inference/envs.py +1 -1
  11. tpu_inference/executors/ray_distributed_executor.py +27 -11
  12. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +77 -54
  13. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +110 -64
  14. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +7 -0
  15. tpu_inference/layers/{jax → common}/attention_interface.py +1 -1
  16. tpu_inference/layers/common/quant_methods.py +8 -0
  17. tpu_inference/layers/jax/attention/attention.py +1 -1
  18. tpu_inference/layers/jax/sample/rejection_sampler.py +1 -1
  19. tpu_inference/layers/jax/sample/sampling.py +2 -2
  20. tpu_inference/layers/vllm/attention.py +1 -1
  21. tpu_inference/layers/vllm/quantization/__init__.py +7 -3
  22. tpu_inference/layers/vllm/quantization/awq.py +4 -3
  23. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +4 -2
  24. tpu_inference/layers/vllm/quantization/mxfp4.py +266 -0
  25. tpu_inference/layers/vllm/quantization/unquantized.py +4 -3
  26. tpu_inference/layers/vllm/sharding.py +2 -2
  27. tpu_inference/lora/torch_punica_tpu.py +1 -2
  28. tpu_inference/models/common/model_loader.py +12 -11
  29. tpu_inference/models/jax/llama3.py +4 -3
  30. tpu_inference/models/jax/llama_eagle3.py +9 -5
  31. tpu_inference/models/jax/llama_guard_4.py +361 -0
  32. tpu_inference/models/jax/qwen2.py +3 -2
  33. tpu_inference/models/jax/qwen2_5_vl.py +4 -3
  34. tpu_inference/models/jax/qwen3.py +3 -2
  35. tpu_inference/models/jax/utils/weight_utils.py +21 -8
  36. tpu_inference/models/vllm/vllm_model_wrapper.py +22 -10
  37. tpu_inference/platforms/tpu_platform.py +17 -7
  38. tpu_inference/runner/compilation_manager.py +37 -17
  39. tpu_inference/runner/kv_cache.py +1 -1
  40. tpu_inference/runner/kv_cache_manager.py +8 -2
  41. tpu_inference/runner/tpu_runner.py +199 -87
  42. tpu_inference/spec_decode/jax/eagle3.py +2 -1
  43. tpu_inference/tpu_info.py +4 -3
  44. tpu_inference/utils.py +7 -6
  45. tpu_inference/worker/tpu_worker.py +159 -23
  46. {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/METADATA +2 -2
  47. {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/RECORD +52 -54
  48. tpu_inference/mock/__init__.py +0 -0
  49. tpu_inference/mock/vllm_config_utils.py +0 -28
  50. tpu_inference/mock/vllm_envs.py +0 -1219
  51. tpu_inference/mock/vllm_logger.py +0 -212
  52. tpu_inference/mock/vllm_logging_utils.py +0 -15
  53. tpu_inference/models/jax/phi3.py +0 -376
  54. /tpu_inference/layers/{jax → common}/binary_search.py +0 -0
  55. /tpu_inference/layers/{jax → common}/sharding.py +0 -0
  56. {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/WHEEL +0 -0
  57. {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/licenses/LICENSE +0 -0
  58. {tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/top_level.txt +0 -0
tpu_inference/tpu_info.py CHANGED
@@ -3,6 +3,7 @@ import os
 
 import requests
 
+from tpu_inference import envs
 from tpu_inference.logger import init_logger
 
 logger = init_logger(__name__)
@@ -32,14 +33,14 @@ def get_tpu_metadata(key: str = "") -> str:
 
 
 def get_tpu_type() -> str:
-    tpu_type = os.getenv("TPU_ACCELERATOR_TYPE", None)
+    tpu_type = envs.TPU_ACCELERATOR_TYPE
     if tpu_type is None:
         tpu_type = get_tpu_metadata(key="accelerator-type")
     return tpu_type
 
 
 def get_node_name() -> str:
-    tpu_name = os.getenv("TPU_NAME", None)
+    tpu_name = envs.TPU_NAME
     if not tpu_name:
         tpu_name = get_tpu_metadata(key="instance-id")
     return tpu_name
@@ -47,7 +48,7 @@ def get_node_name() -> str:
 
 
 def get_node_worker_id() -> int:
     """For multi-host TPU VM, this returns the worker id for the current node."""
-    worker_id = os.getenv("TPU_WORKER_ID", None)
+    worker_id = envs.TPU_WORKER_ID
     if worker_id is None:
         worker_id = get_tpu_metadata(key="agent-worker-number")
     if worker_id is None:
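The three lookups above now go through tpu_inference.envs instead of raw os.getenv calls. As a rough sketch (assumed shape, not the actual contents of tpu_inference/envs.py), such an envs module typically maps names to lazy getters behind a module-level __getattr__, so each attribute access re-reads the process environment:

# Sketch of the envs-module pattern these call sites now rely on
# (assumed shape; see tpu_inference/envs.py for the real definitions).
import os

environment_variables = {
    "TPU_ACCELERATOR_TYPE": lambda: os.getenv("TPU_ACCELERATOR_TYPE"),
    "TPU_NAME": lambda: os.getenv("TPU_NAME"),
    "TPU_WORKER_ID": lambda: os.getenv("TPU_WORKER_ID"),
}

def __getattr__(name: str):
    # Module-level __getattr__ (PEP 562): envs.TPU_NAME re-reads the
    # environment on every access instead of caching at import time.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")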
tpu_inference/utils.py CHANGED
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-import os
 import time
 from collections import defaultdict
 from collections.abc import Sequence
@@ -14,8 +13,10 @@ from jax._src import mesh as mesh_lib
 from jax._src import xla_bridge as xb
 from jax._src.lib import xla_client as xc
 from jax.sharding import Mesh, NamedSharding, PartitionSpec
-from vllm import envs, utils
+from vllm import envs as vllm_envs
+from vllm import utils
 
+from tpu_inference import envs
 from tpu_inference.logger import init_logger
 
 GBYTES = 1024 * 1024 * 1024
@@ -57,10 +58,10 @@ def get_num_kv_heads_by_tp(num_kv_heads: int, tp_size: int) -> int:
 
 
 def hbm_usage_bytes(devices: Any) -> List[Tuple[int, int]]:
     usage = []
-    if envs.VLLM_TPU_USING_PATHWAYS:
+    if vllm_envs.VLLM_TPU_USING_PATHWAYS:
         return pathways_hbm_usage_gb(devices)
 
-    multihost_backend = os.environ.get("TPU_MULTIHOST_BACKEND", "").lower()
+    multihost_backend = envs.TPU_MULTIHOST_BACKEND
     if multihost_backend == "ray":
         # MemoryStats is only supported for addressable PjRt devices.
         # Assume all the devices have similar memory usage for now.
@@ -132,8 +133,8 @@ def pathways_hbm_usage_gb(devices: Any) -> List[Tuple[float, float]]:
     hbm_used = defaultdict(int)
     hbm_limit = get_device_hbm_limit()
     for array in live_arrays:
-        for buffer in array.device_buffers:
-            hbm_used[buffer.device] += buffer.nbytes
+        for buffer in array.addressable_shards:
+            hbm_used[buffer.data.device] += buffer.data.nbytes
     return [(hbm_used[device], hbm_limit) for device in devices]
 
 
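The last hunk swaps the removed per-buffer API (array.device_buffers) for addressable_shards when attributing HBM usage to devices. A minimal standalone sketch of the same accounting, runnable on CPU as a smoke test:

# Sum bytes per device across all live JAX arrays via addressable_shards.
from collections import defaultdict

import jax
import jax.numpy as jnp

x = jnp.ones((1024, 1024), dtype=jnp.float32)  # keep one array alive

used = defaultdict(int)
for array in jax.live_arrays():
    for shard in array.addressable_shards:
        # shard.data is the single-device array backing this shard
        used[shard.data.device] += shard.data.nbytes

print(dict(used))  # e.g. {CpuDevice(id=0): 4194304}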
tpu_inference/worker/tpu_worker.py CHANGED
@@ -2,6 +2,7 @@
 
 import os
 import tempfile
+from dataclasses import dataclass, field
 from typing import Callable, Dict, Optional, Tuple
 
 import jax
@@ -10,6 +11,7 @@ import jaxlib
 import jaxtyping
 import vllm.envs as vllm_envs
 from vllm.config import VllmConfig, set_current_vllm_config
+from vllm.distributed import get_pp_group
 from vllm.distributed.kv_transfer import (ensure_kv_transfer_initialized,
                                           has_kv_transfer_group)
 from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
@@ -23,10 +25,13 @@ from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
 
 from tpu_inference import envs, utils
+from tpu_inference.distributed import jax_parallel_state
 from tpu_inference.distributed.utils import (get_host_ip, get_kv_transfer_port,
                                              get_node_id)
-from tpu_inference.layers.jax.sharding import ShardingConfigManager
+from tpu_inference.layers.common.sharding import ShardingConfigManager
 from tpu_inference.logger import init_logger
+from tpu_inference.models.jax.jax_intermediate_tensor import \
+    JaxIntermediateTensors
 from tpu_inference.runner.kv_cache import get_rpa_page_size_bytes
 from tpu_inference.runner.tpu_runner import TPUModelRunner
 
@@ -39,15 +44,39 @@ _DTYPE: dict[str, jnp.dtype] = {
 }
 
 
+@dataclass
+class PPConfig:
+    rank: int
+    ip: str
+    prev_worker_ip: str
+    pp_world_size: int
+
+    # default env vars for
+    # TPU_PROCESS_BOUNDS, TPU_CHIPS_PER_PROCESS_BOUNDS, TPU_VISIBLE_CHIPS
+    # if PP is used in single host.
+    default_tpu_process_bounds: str = field(init=False)
+    default_tpu_chips_per_process_bounds: str = field(init=False)
+    default_tpu_visible_chips: str = field(init=False)
+
+    def __post_init__(self):
+        self.default_tpu_process_bounds = f"1,{self.pp_world_size},1"
+        self.default_tpu_chips_per_process_bounds = "1,1,1"
+        self.default_tpu_visible_chips = f"{self.rank}"
+
+
 class TPUWorker:
 
-    def __init__(self,
-                 vllm_config: VllmConfig,
-                 local_rank: int,
-                 rank: int,
-                 distributed_init_method: str,
-                 is_driver_worker: bool = False,
-                 devices=None):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        local_rank: int,
+        rank: int,
+        distributed_init_method: str,
+        is_driver_worker: bool = False,
+        devices=None,
+        ip: str = "localhost",
+        prev_worker_ip: str = "localhost",
+    ):
         # If we use vLLM's model implementation in PyTorch, we should set it
         # with torch version of the dtype.
         impl = envs.MODEL_IMPL_TYPE
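For a concrete reading of PPConfig's __post_init__ above, here is how the single-host defaults resolve for one rank (the IP addresses are illustrative only):

# Rank 1 of a 4-stage single-host pipeline, using the PPConfig class above.
cfg = PPConfig(rank=1, ip="10.0.0.2", prev_worker_ip="10.0.0.1",
               pp_world_size=4)
assert cfg.default_tpu_process_bounds == "1,4,1"            # one process per stage
assert cfg.default_tpu_chips_per_process_bounds == "1,1,1"  # one chip per process
assert cfg.default_tpu_visible_chips == "1"                 # this rank's chip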
@@ -74,6 +103,8 @@ class TPUWorker:
         self.devices = devices if devices is not None else []
         self.device_ranks = set(device.id for device in self.devices
                                 if isinstance(device, jaxlib._jax.Device))
+        self.pp_config = PPConfig(rank, ip, prev_worker_ip,
+                                  self.parallel_config.pipeline_parallel_size)
 
         if self.model_config.trust_remote_code:
             # note: lazy import to avoid importing torch before initializing
@@ -86,7 +117,7 @@ class TPUWorker:
         # TPU Worker is initialized. The profiler server needs to start after
         # MP runtime is initialized.
         self.profile_dir = None
-        if vllm_envs.VLLM_TORCH_PROFILER_DIR and self.rank < 1:
+        if vllm_envs.VLLM_TORCH_PROFILER_DIR and self.rank < 1 and self.pp_config.pp_world_size == 1:
             if not self.devices or 0 in self.device_ranks:
                 # For TPU, we can only have 1 active profiler session for 1 profiler
                 # server. So we only profile on rank0.
@@ -94,6 +125,14 @@ class TPUWorker:
                 logger.info("Profiling enabled. Traces will be saved to: %s",
                             self.profile_dir)
 
+        # For PP, we use MPMD so we want to profile every worker.
+        if self.pp_config.pp_world_size > 1 and vllm_envs.VLLM_TORCH_PROFILER_DIR:
+            self.profile_dir = os.path.join(
+                vllm_envs.VLLM_TORCH_PROFILER_DIR,
+                f"pprank_{self.rank}_ppworldsize_{self.pp_config.pp_world_size}"
+            )
+            os.makedirs(self.profile_dir, exist_ok=True)
+
         use_jax_profiler_server = os.getenv("USE_JAX_PROFILER_SERVER", False)
         # Only one instance of profiler is allowed
         if use_jax_profiler_server and self.rank < 1:
@@ -105,31 +144,87 @@ class TPUWorker:
             )
             jax.profiler.start_server(jax_profiler_server_port)
 
+        # step_counter is used to calculate uuid to transfer intermediate tensors.
+        self.step_counter = 0
+
     def initialize_cache(self, num_gpu_blocks: int,
                          num_cpu_blocks: int) -> None:
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks
 
-    def init_device(self):
+    def init_device(self,
+                    tpu_process_bounds="",
+                    tpu_chips_per_process_bounds="",
+                    tpu_visible_chips=""):
+        # set tpu visible devices for Jax runtime in single host PP.
+        multihost_backend = os.environ.get("TPU_MULTIHOST_BACKEND", "").lower()
+        if multihost_backend != "ray" and self.parallel_config.pipeline_parallel_size > 1:
+            tpu_ports = [
+                jax_parallel_state.BASE_JAX_PORT + i
+                for i in range(self.pp_config.pp_world_size)
+            ]
+            os.environ["TPU_PROCESS_ADDRESSES"] = ",".join(
+                [f"localhost:{port}" for port in tpu_ports])
+            os.environ["TPU_PROCESS_PORT"] = f"{tpu_ports[self.rank]}"
+            os.environ["CLOUD_TPU_TASK_ID"] = f"{self.rank}"
+
+            # Note: Below is the setting for v6e8 host (8 chips of v6e)
+            # Replace with your own topology.
+            # There are 2 ways of subslicing a v6e
+            # 1) 2 slices with 4 TPU chips each, we can do PP=2, TP=1/2/3/4
+            #    TPU_PROCESS_BOUNDS = "1,1,1"
+            #    TPU_CHIPS_PER_PROCESS_BOUNDS = "1,4,1"
+            #    TPU_VISIBLE_CHIPS = "0,1,2,3" or "4,5,6,7"
+            # 2) 1 chip for each subslice, with at most 8 subslices,
+            #    we can do TP=1, PP=1/2/3/4/5/6/7/8
+            os.environ[
+                "TPU_PROCESS_BOUNDS"] = tpu_process_bounds \
+                if tpu_process_bounds \
+                else self.pp_config.default_tpu_process_bounds
+            os.environ[
+                "TPU_CHIPS_PER_PROCESS_BOUNDS"] = tpu_chips_per_process_bounds \
+                if tpu_chips_per_process_bounds \
+                else self.pp_config.default_tpu_chips_per_process_bounds
+            os.environ[
+                "TPU_VISIBLE_CHIPS"] = tpu_visible_chips \
+                if tpu_visible_chips \
+                else self.pp_config.default_tpu_visible_chips
+
         if not self.devices:
             sharding_config: ShardingConfigManager = self.vllm_config.sharding_config
             device_indexes = sharding_config.device_indexes
             if device_indexes is not None and len(device_indexes) > 0:
                 # Enforcing the devices sequence to be consistent with the specified device indexes
-                all_devices = jax.devices()
-                device_dict = {device.id: device for device in all_devices}
+                all_local_devices = jax.local_devices()
+                device_dict = {
+                    device.id: device
+                    for device in all_local_devices
+                }
                 self.devices = []
                 for device_index in device_indexes:
                     device = device_dict[device_index]
                     if device is None:
                         raise KeyError(
                             f"Device index {device_index} not found in "
-                            f"jax.devices() with IDs {list(device_dict.keys())}!"
+                            f"jax.local_devices() with IDs {list(device_dict.keys())}!"
                         )
                     self.devices.append(device)
+                assert len(self.devices) >= sharding_config.total_devices
                 self.devices = self.devices[:sharding_config.total_devices]
             else:
-                self.devices = jax.devices()[:sharding_config.total_devices]
+                if self.pp_config.pp_world_size > 1:
+                    # We only support a mixed tp + pp scenario that tp size is
+                    # smaller or equals the total TPUs in one node
+                    # say: we have 4 nodes with 4 TPUs each, we can only do pp:4, tp:4, but not pp:2, tp:8
+                    assert jax.local_device_count(
+                    ) >= sharding_config.total_devices
+                    self.devices = jax.local_devices()[:sharding_config.
+                                                       total_devices]
+                else:
+                    # In a multi-host distributed env, say: Ray, local_device count may smaller
+                    # than the total devices, we just choose the smaller set here.
+                    self.devices = jax.devices()[:sharding_config.
+                                                 total_devices]
 
         # Initialize the vLLM distribution layer as a single chip environment,
         # we'll swap the model's parallel modules with TPU SPMD equivalents.
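To make option 1 of the subslicing comment in init_device concrete, this is the environment the first pipeline stage would end up with on a v6e8 host (values taken directly from the comment above):

# Option 1 spelled out for rank 0 of PP=2 on a v6e8 host:
# one 4-chip subslice per pipeline stage (TP up to 4 within a stage).
import os

os.environ["TPU_PROCESS_BOUNDS"] = "1,1,1"
os.environ["TPU_CHIPS_PER_PROCESS_BOUNDS"] = "1,4,1"
os.environ["TPU_VISIBLE_CHIPS"] = "0,1,2,3"  # rank 1 would use "4,5,6,7"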
@@ -146,8 +241,18 @@ class TPUWorker:
             tensor_model_parallel_size=1,
             pipeline_model_parallel_size=1,
         )
+
+        jax_parallel_state.init_pp_distributed_environment(
+            self.pp_config.ip,
+            self.rank,
+            self.parallel_config.pipeline_parallel_size,
+            self.devices[0],
+            need_pp=self.parallel_config.pipeline_parallel_size > 1)
+
         ensure_kv_transfer_initialized(self.vllm_config)
-        self.model_runner = TPUModelRunner(self.vllm_config, self.devices)
+        self.model_runner = TPUModelRunner(
+            self.vllm_config, self.devices, self.rank, self.rank == 0,
+            self.rank == self.pp_config.pp_world_size - 1)
         logger.info(f"Init worker | "
                     f"rank={self.rank} | "
                     f"node_id={get_node_id()} | "
@@ -155,6 +260,12 @@ class TPUWorker:
                     f"hbm={utils.hbm_usage_gb(self.devices)}GiB")
         vllm_utils.report_usage_stats(self.vllm_config)
 
+    def initialize_pp_transfer_connect(self):
+        if self.rank == 0:
+            return
+        jax_parallel_state.connect(self.pp_config.prev_worker_ip,
+                                   self.rank - 1)
+
     def determine_available_memory(self) -> int:
         gpu_memory_utilization = self.cache_config.gpu_memory_utilization
         hbm_usage = utils.hbm_usage_bytes(self.devices)
@@ -194,14 +305,39 @@ class TPUWorker:
         # deliberate, temporary compromise for the same reasons outlined in
         # the `get_kv_cache_spec` method.
 
-        output = self.model_runner.execute_model(scheduler_output)
-
-        # With a connector, the scheduler expects output from all workers
-        # TODO(mrjunwan): Figure out if this is ok after https://github.com/vllm-project/vllm/pull/26866
-        if has_kv_transfer_group():
-            return output
-
-        return output if self.is_driver_worker else None
+        if self.parallel_config.pipeline_parallel_size == 1 or self.rank == 0:
+            intermediate_tensors = None
+        else:
+            # receive intermediate tensors
+            uuid = self.model_runner.get_uuid_for_jax_transfer(
+                scheduler_output, self.rank - 1, self.step_counter)
+            # TODO: this method might only works for vllm model, not sure about jax models.
+            tensor_spec = self.model_runner.get_intermediate_tensor_spec(
+                scheduler_output.total_num_scheduled_tokens)
+            intermediate_tensors_dict = get_pp_group().recv_tensor_dict(
+                uuid, tensor_spec)
+            intermediate_tensors = JaxIntermediateTensors(
+                intermediate_tensors_dict)
+
+        output = self.model_runner.execute_model(scheduler_output,
+                                                 intermediate_tensors)
+
+        if isinstance(output, JaxIntermediateTensors):
+            assert self.parallel_config.pipeline_parallel_size > 1
+            assert not get_pp_group().is_last_rank
+            # send intermediate tensors
+            uuid = self.model_runner.get_uuid_for_jax_transfer(
+                scheduler_output, self.rank, self.step_counter)
+            get_pp_group().send_tensor_dict(uuid, output.tensors)
+            self.step_counter += 1
+            return None
+        else:
+            self.step_counter += 1
+            # With a connector, the scheduler expects output from all workers
+            # TODO(mrjunwan): Figure out if this is ok after https://github.com/vllm-project/vllm/pull/26866
+            if has_kv_transfer_group():
+                return output
+            return output if self.is_driver_worker else None
 
     def sample_tokens(self,
                       grammar_output: GrammarOutput) -> ModelRunnerOutput:
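Taken together, the execute_model changes turn each worker into one MPMD pipeline stage: a non-first rank blocks on intermediate tensors keyed by a per-step uuid, runs its stage, and either forwards intermediates downstream or returns the final output. A control-flow sketch, with recv_dict and send_dict as hypothetical stand-ins for get_pp_group().recv_tensor_dict and send_tensor_dict:

# One pipeline step for a worker at `rank` in [0, world_size).
def pp_step(rank, world_size, step_counter, run_stage, recv_dict, send_dict):
    if world_size == 1 or rank == 0:
        inputs = None                    # first stage reads the batch itself
    else:
        uuid = (rank - 1, step_counter)  # must match the upstream sender's uuid
        inputs = recv_dict(uuid)         # blocks until the previous stage sends

    output, is_intermediate = run_stage(inputs)

    if is_intermediate:                  # not the last stage: forward and stop
        send_dict((rank, step_counter), output)
        return None
    return output                        # last stage returns sampler output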
{tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tpu_inference
-Version: 0.11.1.dev202511130813
+Version: 0.11.1.dev202511220812
 Author: tpu_inference Contributors
 Classifier: Development Status :: 3 - Alpha
 Classifier: Intended Audience :: Developers
@@ -27,7 +27,7 @@ Requires-Dist: jaxtyping
 Requires-Dist: flax==0.11.1
 Requires-Dist: torchax==0.0.7
 Requires-Dist: qwix==0.1.1
-Requires-Dist: torchvision==0.23.0
+Requires-Dist: torchvision==0.24.0
 Requires-Dist: pathwaysutils
 Requires-Dist: parameterized
 Requires-Dist: numba==0.62.1
{tpu_inference-0.11.1.dev202511130813.dist-info → tpu_inference-0.11.1.dev202511220812.dist-info}/RECORD CHANGED
@@ -1,8 +1,9 @@
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_base.py,sha256=Ct5WFRMHL7IHEIxk8FrzAvO8m0xFuDpzDBKkAKKAL2Q,7341
+tests/test_envs.py,sha256=Woyfp_d5HS-uTGo4_u9dYlBbgmhfIEoFb-Rx_k7YXD4,6298
 tests/test_quantization.py,sha256=IT5ASyS1uuWcxc22kRtBcA-V4j3Z3hb7pMztm3GOlBs,34445
 tests/test_tpu_info.py,sha256=ZrwlMsp8ffITkS_b8Q1t_QG-a-WVAd4NUcjHhGibcsI,4670
-tests/test_utils.py,sha256=szRg4UB36RcgIvbEd9xMhKYbWi-O4XAUWGJlIU6FJ9E,7983
+tests/test_utils.py,sha256=Mta5ZzYCgRAh1-BjcOvvx9iQ9DnnXLps7oDHxVQp2yE,8236
 tests/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/core/test_core_tpu.py,sha256=r496rk1eOsK_F4nvm9zprl_T-RcO6eCUb7LuVReOZno,21413
 tests/core/test_disagg_executor.py,sha256=QdE2YZs08EyDDCmSjhiXkXqQ9BJTgO6csr_E1xkkfSg,2256
@@ -20,27 +21,27 @@ tests/kernels/ragged_paged_attention_kernel_v3_test.py,sha256=Hrd8iUkS1pS3rxeTyY
 tests/lora/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/lora/conftest.py,sha256=EXjwE1CjmUUlMEXpyE3UwxvgrKUllE73I8BNKfP1FTc,984
 tests/lora/test_bgmv.py,sha256=gQxWsJdNX2nkrE2xyrG0exwf3E2eHm2k2nkEXoANuQc,1359
-tests/lora/test_layers.py,sha256=21ekYlsK36r1GPZOfzs7E-KIsfI1JcuZl1E6vaQbHf4,26273
+tests/lora/test_layers.py,sha256=6B4HhMAItQmt0hPAQgyXgwSYs7b3bIbUf6LaPsqXLzY,25923
 tests/lora/test_lora.py,sha256=wJiF1P1BDnPN8TLX2tlFtdZ_QCkV-S9nPl6_uR6DqFc,4439
-tests/lora/utils.py,sha256=dR_v1H20vPVjFHdBhDajWOz0WJZlKuPLgMFQsME0LtA,3009
-tpu_inference/__init__.py,sha256=7IduGWw-_fwx0VA6EvC_AqHF67fnnShz6YvkqCfvFx8,1317
+tests/lora/utils.py,sha256=rY0tDZEZe58ye4-ykwrTnsiWuLcaEG57N_Rua90bDXI,2726
+tpu_inference/__init__.py,sha256=p4MaepRdN7723FUNE-3pOMxZWjFn4_TVFgjrNyty4JE,2304
 tpu_inference/env_override.py,sha256=pmL7lfs_rGCP92ya3wuWuudsCYeOMZ6tFZY82A4KkQc,365
-tpu_inference/envs.py,sha256=MTT_Pdtd6cAcciYjv1OekEmvspaq3SYL0oR_jDkQ_aE,3948
+tpu_inference/envs.py,sha256=hoPuT0SyLCxqyZ0QJIha6EXSZv2TpACfmENuiT0iJMM,3956
 tpu_inference/logger.py,sha256=HQCz7NefmbturuhOC7-3Ixbtcdgoz4g9FHh2RB6o8cc,334
-tpu_inference/tpu_info.py,sha256=9UohshkndR6dZpGWpWXfTD4qvIVdVgHf0yOoSEkLTrw,2276
-tpu_inference/utils.py,sha256=LWEshJgUdB20H2fDA-QI-Sk4EP7PD_FWvW3Mrqb-k8M,10054
+tpu_inference/tpu_info.py,sha256=3iilHRQSFjwMJwhKcuuawTm7mhwkgHbj4zi6CiAySrs,2265
+tpu_inference/utils.py,sha256=Ddsx2CY2ARe46RZL27URzXCN3P6pMcKWB-APXUB8sHs,10098
 tpu_inference/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tpu_inference/core/core_tpu.py,sha256=JdN4-xaxSWnzY4T181SCnbZ5HEnwQ5IifYA9ybF4pWo,32710
+tpu_inference/core/core_tpu.py,sha256=WDD3koE_j1QhWS2BbMA2aQOZayPZm4tYPvzL4YCX2jY,33294
 tpu_inference/core/disagg_executor.py,sha256=HZpgYMVxRxm0RQxO4l8IDYBWJ6Z3Tac6xavc5otcirc,4657
-tpu_inference/core/disagg_utils.py,sha256=ufWNFWQ5n4YnZpPOtoReHlYo4dlN7AbIqCyqS4an0t4,1572
+tpu_inference/core/disagg_utils.py,sha256=lv8MAVoAjtcmTaenUXVokg2q3d0tzsma86UiQlQ3omY,1492
 tpu_inference/core/sched/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tpu_inference/core/sched/dp_scheduler.py,sha256=mKs8Ms46szdlBfo8hjdqis2ZKAZbcKnHAGfEr0X5R8g,22527
 tpu_inference/distributed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tpu_inference/distributed/jax_parallel_state.py,sha256=5_xCwcL03lFPUoSO_OP7hIVKpUFroW1m-jVO7R6FbUc,2223
-tpu_inference/distributed/tpu_connector.py,sha256=Zah46Sm5iOuh72SzXw69NxMc0MLnqsLEpe2BfDhpnqA,29731
-tpu_inference/distributed/utils.py,sha256=RwFQi8G4TzN1g9RjQu0pb5JxSc_jhoIZVsFJo0uHjxo,1513
+tpu_inference/distributed/tpu_connector.py,sha256=w_gOI6hX7NWefaxN_9XH9TXReGElOyFifdDHpPswotM,29696
+tpu_inference/distributed/utils.py,sha256=1KIREn28Zg10O-MSUkVQMRzS09WoGc_VLGOX4QTFJac,1504
 tpu_inference/executors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tpu_inference/executors/ray_distributed_executor.py,sha256=UgJP-XSgDPKDj_mkVQ16XrRN96juVpnFl6fdWEyFL_Y,15249
+tpu_inference/executors/ray_distributed_executor.py,sha256=emYfSFJ3kluEmi6mlfnvxSUrC_mGVRVcjrUqUH2MR4g,16122
 tpu_inference/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tpu_inference/experimental/llama3_jax_stashed.py,sha256=YK1oSIfto9ALo-HB45XfSrbq9XgVbE4m2C-9zRwmSzI,10913
 tpu_inference/kernels/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -66,27 +67,28 @@ tpu_inference/kernels/ragged_paged_attention/v2/kernel.py,sha256=OiQGAHhyggbp1Pe
 tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py,sha256=vGp2ZWODTbjyG9z2z0Qf_BX-wYHd5bUybnc_DtOz0nI,10995
 tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py,sha256=mw80bXBGenroGdrITV0F_EaI2s-Z9KWwqU9WodvJg14,97919
 tpu_inference/kernels/ragged_paged_attention/v3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tpu_inference/kernels/ragged_paged_attention/v3/kernel.py,sha256=tlP6121yfXaukx_RQroHlHcZnbKPyyum0lAcvT0B_Pk,56132
-tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py,sha256=DFVdIIKmyufu_4b-3YhxI56jt0O1cJ3JsVl-2DDZHv4,55350
-tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py,sha256=leTS75aq99N1Zuv6wB5yLdkfYnEtrBDVI4z_jOKnjL0,142012
+tpu_inference/kernels/ragged_paged_attention/v3/kernel.py,sha256=O179Fft5KpuN5LIFx3SghWXJJUqh3Og-xqfO4Z8QXYU,57032
+tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py,sha256=z0oaH8ZkDmHSoG4yiiO2CN0kuAuFcEpQ3RUoi5msjlo,56904
+tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py,sha256=k3LwduhZO85cJ-pSgnGN0c2Nn8eNeQq4eA94KUXJzMw,142198
 tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py,sha256=P3_ivi8iUz5QMU_3pgpl4Bkbmn0q0NpDtVJX39haRQA,11208
 tpu_inference/kernels/ragged_paged_attention/v3/util.py,sha256=1N_ozjKboDYLteFJndWoLXNudj2z53rGXMkELa5Z9tY,1102
 tpu_inference/layers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tpu_inference/layers/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tpu_inference/layers/common/attention_interface.py,sha256=CImMS8tuWgvaRY9YbGS3pY7OBnzeJ4Jla7LRFb4Xoa4,13224
 tpu_inference/layers/common/attention_metadata.py,sha256=St8ZatbY1D7xQACKJH459jMgp3oTP3AQ36mi9FZdrPU,850
+tpu_inference/layers/common/binary_search.py,sha256=ZQi-z1wG6WTcfVQXeTGOZokX4K1DSf9kCzqfrhEU8lk,12320
+tpu_inference/layers/common/quant_methods.py,sha256=mQSxZ44-QQtm22C_8ViejnP1cP2Dv6yc2YaP6oMKJeQ,185
+tpu_inference/layers/common/sharding.py,sha256=wBqdkXZSWfnnH8pkJtyW2DSqmAe_V4Vxi0iMPaXq0Z0,25185
 tpu_inference/layers/jax/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tpu_inference/layers/jax/attention_interface.py,sha256=1jlvSZWaP6DuPVtb1W_KPw4-Qi68BikOBNLLcpygupY,13221
 tpu_inference/layers/jax/base.py,sha256=Vhts6ZMwNCZ8LbnEXeB0rl3nHdS5hDJWX7HEa7Fl7yE,5775
-tpu_inference/layers/jax/binary_search.py,sha256=ZQi-z1wG6WTcfVQXeTGOZokX4K1DSf9kCzqfrhEU8lk,12320
 tpu_inference/layers/jax/constants.py,sha256=NcYg0zAf3ClfP7YMYdYu_F1GngOzZaIxIAHBZDunKw4,2755
 tpu_inference/layers/jax/layers.py,sha256=yv_lC2tbJuzVL-OaXYooX82Ys8hWZATeH9M78coJ3VI,10633
 tpu_inference/layers/jax/misc.py,sha256=znKv1Nuq_LgYpaIu0qlzUVDgQWnjjG7aqPJGM8kuwcw,566
 tpu_inference/layers/jax/rope.py,sha256=i2E7pRLWgOaFLbeo8_phZwKQWJW7ohAyl69E2V2Mc2U,11349
 tpu_inference/layers/jax/rope_interface.py,sha256=X0SruXizlCHGnssFujC1pL07UC4Vsp7-gdBy_Q7JZhI,8375
-tpu_inference/layers/jax/sharding.py,sha256=wBqdkXZSWfnnH8pkJtyW2DSqmAe_V4Vxi0iMPaXq0Z0,25185
 tpu_inference/layers/jax/transformer_block.py,sha256=ufv-yfVDmRP_Ynrx3UX9xj-x0PkNw_tQ-0N0eYf4i7M,3917
 tpu_inference/layers/jax/attention/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tpu_inference/layers/jax/attention/attention.py,sha256=bWXMtF8TToiRyZ3SvJGQnD2urZTbuX_omHVXpQdn0fs,10082
+tpu_inference/layers/jax/attention/attention.py,sha256=DJFDkpQc9SDD156wVPFw3r2XaBgb44QNJ8OcdONaF5g,10085
 tpu_inference/layers/jax/attention/deepseek_v3_attention.py,sha256=YlagoBMwINv2KRH1dr4oEcH_cQ9QMPB55nO2FQZsWs0,14010
 tpu_inference/layers/jax/attention/gpt_oss_attention.py,sha256=rkrEv4aNZxtAGcXd1HXHUxhNeDNAd9nWTEZOKWSI8cA,8725
 tpu_inference/layers/jax/attention/llama4_attention.py,sha256=VvUmfBxQEbHf3F2BrcYDUnq5abj7CSDYeRsNx_eVAh0,6162
@@ -95,50 +97,46 @@ tpu_inference/layers/jax/moe/deepseek_v3_moe.py,sha256=Q6CuwwiZtWYm6iUee1wJoDJrw
 tpu_inference/layers/jax/moe/gpt_oss_moe.py,sha256=Rx5b1jg2XMm7Xx9hrjgvyhscaJ_zGbVMHmeEiLh7kIQ,6196
 tpu_inference/layers/jax/moe/moe.py,sha256=cA8R1rjbBwNEoNlsPWjeIBB9nvaRDwlEdwQTVg6lTpY,8762
 tpu_inference/layers/jax/sample/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tpu_inference/layers/jax/sample/rejection_sampler.py,sha256=IRfVWjkbVXp9Sv1YrGMMh-LYx1AwbY-3FTXEO1-Ue9g,20423
-tpu_inference/layers/jax/sample/sampling.py,sha256=dVOcMdmPdAEsupPk96tCaZecIWUiDej0DiVnwaH9ckQ,3308
+tpu_inference/layers/jax/sample/rejection_sampler.py,sha256=nI5s0E73xkqDIu2hTljIXt23B1Q-gRnC1myoQpGDJrQ,20426
+tpu_inference/layers/jax/sample/sampling.py,sha256=C30KgmdOVSaagvHhbfLgVJtVQmJo86CbHPa4h36Vn70,3314
 tpu_inference/layers/jax/sample/sampling_metadata.py,sha256=Gd835LNWfGM0NRQBVBqEv0nPwt5q9F4AdFym0CUS1fw,2561
 tpu_inference/layers/vllm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tpu_inference/layers/vllm/attention.py,sha256=JxEQ8ql_97zbQzukIbfUYq50-2k81VUG1Km_YV_RUtg,7363
+tpu_inference/layers/vllm/attention.py,sha256=wbJpcgqEAuIirv5PIULbiP-ggMKjmTanbB7Dg0BVYv4,7366
 tpu_inference/layers/vllm/fused_moe.py,sha256=XZt2CPUz00qZzDcyfBFz6buhVzmGL1amHalHJALl9zw,18945
 tpu_inference/layers/vllm/linear_common.py,sha256=_YlJtbdaYcck_j-gFLos_k0ycktVWxT8Qo57tR2YqJ8,7749
-tpu_inference/layers/vllm/sharding.py,sha256=WTx1tF_7R99AdyE-lL7HQJ378hAafeI-JVRsugAvwn4,9177
-tpu_inference/layers/vllm/quantization/__init__.py,sha256=Tz44kUZTdNFu5Dmu48aQ-9f7ioWjbUWS0eVYURXZ17E,1535
-tpu_inference/layers/vllm/quantization/awq.py,sha256=ar8x1CPTPvfcf4wbuBC1XVh4pjtSUchoYWnbkZKH3CQ,8412
+tpu_inference/layers/vllm/sharding.py,sha256=as7CF8UKTF3ToymwRY5Pi8uzwJk0P1sHPkWB5xEx3mA,9169
+tpu_inference/layers/vllm/quantization/__init__.py,sha256=SEppGayBzzQ5tsXLSy99aqilkAawQwYxnv2alCg6-ZU,1777
+tpu_inference/layers/vllm/quantization/awq.py,sha256=-8ZmjGvSKJB6_JuwSctNWt8xHWq4VSvK_AK9iahlgCo,8495
 tpu_inference/layers/vllm/quantization/common.py,sha256=wm3pge6XMTMsLK7_SSdgBP0PvQzz-1mrqN2I6xMqzrc,4218
-tpu_inference/layers/vllm/quantization/unquantized.py,sha256=id6d_IZIhDIvmaH3ANtmLiy4U_uY_AYAf4KTvfs3nmc,14900
+tpu_inference/layers/vllm/quantization/mxfp4.py,sha256=KwGoqIiPkd6FplGuYAKi4uX5A8MPlZqq99MVPchXyi4,11561
+tpu_inference/layers/vllm/quantization/unquantized.py,sha256=Q1v1ZbSIDmaoOg97Ehv6rA5CnSf6nTP40xDBMmHHeLw,15054
 tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py,sha256=uKaauZhaRDcMqd8_NyQoFs9BazMOFix3nIuutbLHHbU,5123
+tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py,sha256=6idEyy3e849fZ1UeNvc9eSHYX7e6qvohrJa_d_D9MBk,5285
 tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py,sha256=FM901QhyhJRC8CuMeICzCVVERvBHbhruRxYW0EQ570s,8820
 tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py,sha256=6sQvsxiWdi5Vte8V9vrQ2abaqGqWpq-mtzU7lGAo-ac,8759
 tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py,sha256=4y7lYgybpXszpCAtxGFhR8LDEbEoCCeo3DfUSOXxhaQ,5202
 tpu_inference/lora/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tpu_inference/lora/torch_lora_ops.py,sha256=pr3N7DVfkn3ANijUC6dBoiCtIJW4fdJpKdC3zWBUsxE,3121
-tpu_inference/lora/torch_punica_tpu.py,sha256=b27DpmIS_N5bhlIcryiENYNmPxp_cu40CGxjPW64d44,12706
-tpu_inference/mock/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tpu_inference/mock/vllm_config_utils.py,sha256=FlQshLjoHdgs3C66tYHYbKFUjbk9DhUwY-7HibZk0fI,878
-tpu_inference/mock/vllm_envs.py,sha256=cCubeOhH2WeYZQFJt6W0y_IiQo0fzIWR1LCCE8i6kI4,50990
-tpu_inference/mock/vllm_logger.py,sha256=vUGnN5nKT--ZvU15YCzODUM_FGiXKhcrrjDGjeN00RQ,7297
-tpu_inference/mock/vllm_logging_utils.py,sha256=TEUmKj3xHiLzHBnFqAujcxH0t2hBQ04sUaho2RyORnk,486
+tpu_inference/lora/torch_punica_tpu.py,sha256=qTnXZGLoOgvukSxeunO_SfpPTlkq9GlMj9H7zVYg9LE,12680
 tpu_inference/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tpu_inference/models/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tpu_inference/models/common/model_loader.py,sha256=AwukmGaUq2wv3OnFHUU-nwdAnKLG_eGw7PYY5CNrNNI,18225
+tpu_inference/models/common/model_loader.py,sha256=3rRntyGqS6l7yAfURmRaGkhyIaee2E43a5F0_i0IFmE,18177
 tpu_inference/models/jax/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tpu_inference/models/jax/deepseek_v3.py,sha256=SKOHVEC-_2NLxBnzBzbu5tu0d6FTlAEiI1EefGaO2QE,40047
 tpu_inference/models/jax/gpt_oss.py,sha256=Vw4LRB5Kp6hbA2hjZGFS8kiEqOCjf881XH2JNtu2S1I,20924
 tpu_inference/models/jax/jax_intermediate_tensor.py,sha256=Pxu1PCV5LN5X58aYVkPiohcXZIeKVim2oqvrS_cVgw4,2604
-tpu_inference/models/jax/llama3.py,sha256=YUG0S0Y6cy7PLcq0cpmDsGWbOZIhZzzyObRQdmUUxkg,13420
+tpu_inference/models/jax/llama3.py,sha256=ZiFtrpAzXTT9vAPES9UeuJInCWGbvDWs7g0_JLdCCa4,13479
 tpu_inference/models/jax/llama4.py,sha256=wf2Sp2iYViaYD5rSfv3_ryO6gYuYM5XaOyvghaP4OCY,29631
-tpu_inference/models/jax/llama_eagle3.py,sha256=STUkAK6XEA7JM3i_Lx36-t5BhkAGeW_xYiq3zYhHP1A,12297
-tpu_inference/models/jax/phi3.py,sha256=Oz68PE2Z1t8wTed95_w0KMIXfnfV72ZwXugNOdWOV5w,13576
-tpu_inference/models/jax/qwen2.py,sha256=RYb0hMKzPnFOAyhqbztoNlSrFIlRa74fYqSNecA2VOY,13354
-tpu_inference/models/jax/qwen2_5_vl.py,sha256=J4-AjeS_igJdxYCjTwS0HShiEfwQUMwrHxjlWvMw0ok,43939
-tpu_inference/models/jax/qwen3.py,sha256=SOL-Pvp56IrMxqXpPf5EFacBI6AJNlqf4Zrr1pkabGw,10994
+tpu_inference/models/jax/llama_eagle3.py,sha256=xUoNetxDbcFIEVLZ2DiD-GEQhHcdau2v1R12WdMyGec,12550
+tpu_inference/models/jax/llama_guard_4.py,sha256=LrnU2zBWM0s4q_5dwmR--OO0V7ttltsYhrHYlBgQVIw,15275
+tpu_inference/models/jax/qwen2.py,sha256=SuAp7tErk8OoIRko0Vt6QSOZP_9B9r5GTfqmVfImUIo,13410
+tpu_inference/models/jax/qwen2_5_vl.py,sha256=tf177ypgA1ZVIn34Ff_LTwr10NwzlZ3-DPqSoRLAQtQ,43995
+tpu_inference/models/jax/qwen3.py,sha256=CIZQKjZDke_LPGsLNhRCJdDTzWueUneBPAQ1blS24IM,11050
 tpu_inference/models/jax/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tpu_inference/models/jax/utils/file_utils.py,sha256=NOuSC3YFnZpf3CZgYdghbbiNYJt42zgjlEYbOZIVct4,2840
 tpu_inference/models/jax/utils/multi_modal_utils.py,sha256=rrIrQWidkUnGilBHKNpdYh7_2BkvnAaqanXjC81GNcg,6156
-tpu_inference/models/jax/utils/weight_utils.py,sha256=65-H8BTbyilIBMBfvWjkkW3mf4soYASbhrJFqbFKzL4,20129
+tpu_inference/models/jax/utils/weight_utils.py,sha256=d5u8pPR-qPbEjX-8BMY0Zea9O-a34CpfuDlVnbwWfAw,20659
 tpu_inference/models/jax/utils/quantization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tpu_inference/models/jax/utils/quantization/mxfp4_utils.py,sha256=boGnqJCRIOf5nedAxQ8_IUTV6Rfll10DXnRC40BeeE8,3682
 tpu_inference/models/jax/utils/quantization/quantization_utils.py,sha256=xgKoKB7AM3TYPxzVgEGLTK9ebQH2Kx8mNuO0heovkmk,26778
@@ -147,30 +145,30 @@ tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml,sha256=b7Sy
 tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml,sha256=0Qwij71zj9k6rmrUNd8Q5df9YYfkoJ1ZkgMAHxQy81k,128
 tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml,sha256=lGec0UwwxmNPNgKPSsTsCMSXNJjhw507KMtM2NsSCMw,152
 tpu_inference/models/vllm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tpu_inference/models/vllm/vllm_model_wrapper.py,sha256=ERxj-cm-pmYpT9eiL-E3OxeaQDEDrH_Vs0iUS9nCU9s,11424
+tpu_inference/models/vllm/vllm_model_wrapper.py,sha256=hEjg5hKotp-fEt3SXWkWpdnQ32TU1XGpTrfhyLTNyt0,12054
 tpu_inference/models/vllm/vllm_model_wrapper_context.py,sha256=yxlJHPmRQIAwlb1MmHK3xfXokgIkJ-evNU4PgyoJUdg,1187
 tpu_inference/platforms/__init__.py,sha256=lQCrKddS_GcGpCbeogvz9zOZD1mQw5bBsiw8On46qFQ,74
-tpu_inference/platforms/tpu_platform.py,sha256=bdo_zlRqrhccpaz6zOdH18cU8kq6tGKgR1xJJehsVrc,10131
+tpu_inference/platforms/tpu_platform.py,sha256=RSCe3Ne1FsWXVrX6_6V_Z6B0TDTRS38eM0KTkXbQ_w8,10579
 tpu_inference/runner/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tpu_inference/runner/block_table.py,sha256=K3Ic8EgPM08d_C5nEN60mxoRydlaQWySAemf_8Q_qVw,4175
-tpu_inference/runner/compilation_manager.py,sha256=pJFFLkFVmhXukBIxGRUo-hrOqx8jl8JIUuS36fZ2yvg,36177
+tpu_inference/runner/compilation_manager.py,sha256=oVML1KhhQ7YFaSWBaJA0qWQoNX2qRZOrwbbh4XYPc-8,37287
 tpu_inference/runner/input_batch.py,sha256=bx221NX2IOWzrtopss-B-2ZKW4y-U6nQpG09PjpUziw,18273
-tpu_inference/runner/kv_cache.py,sha256=i54EbGQB-9bbOgk6KibTpJpTE2pfFuTfis7J1P_UB0M,4574
-tpu_inference/runner/kv_cache_manager.py,sha256=CJxXtdWuewJqcTBMoR70_Uvwxjtc3cK2jxe1KpI9kQc,22152
+tpu_inference/runner/kv_cache.py,sha256=F4dzW2d53xuxkFUn0oKzwE6VklGUeVm-QM19NVfIQDU,4577
+tpu_inference/runner/kv_cache_manager.py,sha256=XEfis_9nQAz8uxM5y_P5biqSUijX4IeMhIusTf2V7vg,22444
 tpu_inference/runner/lora_utils.py,sha256=B4xMCgXGJ4VNdePvn89HH3tIZ-gYsQ7Vq_YCiYIATEY,3843
 tpu_inference/runner/multimodal_manager.py,sha256=azEPdHOwz8CN11MQmorGdtrCLbFaTCxdWyuEsZTzjYM,9778
 tpu_inference/runner/persistent_batch_manager.py,sha256=KERSfKy6XjMejnbtPGI3hzoYAHJLeCxmpZVYPqBCago,11156
 tpu_inference/runner/speculative_decoding_manager.py,sha256=I3FDWKh2dn6nV8LgTGfCTwMKYnxQsTPpBIrmaJngXHs,10215
 tpu_inference/runner/structured_decoding_manager.py,sha256=Y0ERPhj4olFh6Y2TxP0R1_4UIJwy7nemYA-h63YIR2U,3622
-tpu_inference/runner/tpu_runner.py,sha256=5vPFey3KFnh5lczyj4cIT3mVhR8RuX8kbcuHVOg8DAg,72318
+tpu_inference/runner/tpu_runner.py,sha256=aHXHSlaNuc9q7pcPklqTFRkmkEQDULEEH_hsR_NcTMQ,77532
 tpu_inference/runner/utils.py,sha256=ZnWUoNo-7INeB0mdXti1jwUOdbmxyExznOs-crRTQLk,17126
 tpu_inference/spec_decode/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tpu_inference/spec_decode/jax/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tpu_inference/spec_decode/jax/eagle3.py,sha256=A1dt-dmBttpy-5DGcL4noEDCB0OGP8Xo6MXqgJvWIo8,16593
+tpu_inference/spec_decode/jax/eagle3.py,sha256=1WVHTdv6jfCKwbiz0RwQLPyq8L720gD_bs0p_Gz0QiI,16644
 tpu_inference/worker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tpu_inference/worker/tpu_worker.py,sha256=KY7fH--NP7jiTduP5m0gDnmB2LbhIel0Ts37XmjYpPM,14207
-tpu_inference-0.11.1.dev202511130813.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-tpu_inference-0.11.1.dev202511130813.dist-info/METADATA,sha256=LARdH4AAJfZrrU2Pj4EIN8Zl0QLjzEpzkRCqBbeUdT8,5465
-tpu_inference-0.11.1.dev202511130813.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-tpu_inference-0.11.1.dev202511130813.dist-info/top_level.txt,sha256=gb1hRIQ3DOawUfVzvPL2E__2KPIl9I0vb5r0xcRBGYQ,20
-tpu_inference-0.11.1.dev202511130813.dist-info/RECORD,,
+tpu_inference/worker/tpu_worker.py,sha256=aojB9-PY_ZzTaZgv1i5PUB9CSXNVuK4JZzftCv9ku4A,20642
+tpu_inference-0.11.1.dev202511220812.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+tpu_inference-0.11.1.dev202511220812.dist-info/METADATA,sha256=JzmyOlYYkImIe_WSawI0LDwL28xS-0SCRCcFXeYSV0g,5465
+tpu_inference-0.11.1.dev202511220812.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+tpu_inference-0.11.1.dev202511220812.dist-info/top_level.txt,sha256=gb1hRIQ3DOawUfVzvPL2E__2KPIl9I0vb5r0xcRBGYQ,20
+tpu_inference-0.11.1.dev202511220812.dist-info/RECORD,,
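Each RECORD row is path,sha256=<digest>,<size>, where the digest is the urlsafe-base64 SHA-256 of the file with trailing '=' padding stripped (the standard wheel RECORD convention). The snippet below recomputes an entry in that form:

# Recompute a wheel RECORD entry (path,sha256=<urlsafe b64, no '='>,<size>).
import base64
import hashlib

def record_entry(path: str) -> str:
    data = open(path, "rb").read()
    digest = base64.urlsafe_b64encode(
        hashlib.sha256(data).digest()).rstrip(b"=").decode()
    return f"{path},sha256={digest},{len(data)}"

print(record_entry("tpu_inference/logger.py"))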
tpu_inference/mock/vllm_config_utils.py DELETED
@@ -1,28 +0,0 @@
-from dataclasses import dataclass, field
-from typing import Any, List, Mapping
-
-
-@dataclass
-class ModelConfig():
-    max_model_len: int = 2048
-    max_prefill_len: int = 1024
-    prefill_batch_size: int = 1
-    decode_batch_size: int = 1
-    block_size: int = 16
-    num_layers: int = 32
-    num_kv_heads: int = 32
-    head_dim: int = 128
-    vocab_size: int = 32000
-    model: str = "llama3"
-    hf_config: str = ""
-    architectures: List[str] = field(default_factory=list)
-    override_generation_config: dict[str, Any] = field(default_factory=dict)
-    hf_overrides: dict[str, Any] = field(default_factory=dict)
-
-
-@dataclass
-class VllmConfig():
-    additional_config: Mapping[str, Any] = field(default_factory=dict)
-    # Set default max_model_len to turn off warnings.
-    model_config: ModelConfig = field(
-        default_factory=lambda: ModelConfig(max_model_len=1024))