tpu-inference 0.0.1rc1__py3-none-any.whl → 0.11.1.dev202511130813__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of tpu-inference might be problematic.
- tests/kernels/fused_moe_v1_test.py +34 -303
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +2 -2
- tests/lora/test_layers.py +6 -0
- tests/lora/utils.py +8 -0
- tests/test_utils.py +16 -24
- tpu_inference/__init__.py +3 -22
- tpu_inference/core/core_tpu.py +9 -17
- tpu_inference/core/disagg_utils.py +8 -6
- tpu_inference/distributed/tpu_connector.py +4 -3
- tpu_inference/distributed/utils.py +2 -3
- tpu_inference/envs.py +8 -61
- tpu_inference/executors/ray_distributed_executor.py +11 -31
- tpu_inference/kernels/fused_moe/v1/kernel.py +110 -641
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +54 -77
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +143 -287
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +0 -7
- tpu_inference/layers/jax/attention/attention.py +1 -1
- tpu_inference/layers/{common → jax}/attention_interface.py +2 -8
- tpu_inference/layers/jax/sample/rejection_sampler.py +1 -1
- tpu_inference/layers/jax/sample/sampling.py +2 -2
- tpu_inference/layers/{common → jax}/sharding.py +5 -5
- tpu_inference/layers/vllm/attention.py +1 -1
- tpu_inference/layers/vllm/fused_moe.py +208 -170
- tpu_inference/layers/vllm/quantization/__init__.py +3 -7
- tpu_inference/layers/vllm/quantization/awq.py +3 -4
- tpu_inference/layers/vllm/quantization/common.py +1 -6
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +2 -4
- tpu_inference/layers/vllm/quantization/unquantized.py +67 -62
- tpu_inference/layers/vllm/sharding.py +2 -2
- tpu_inference/lora/torch_punica_tpu.py +2 -1
- tpu_inference/mock/__init__.py +0 -0
- tpu_inference/mock/vllm_config_utils.py +28 -0
- tpu_inference/mock/vllm_envs.py +1219 -0
- tpu_inference/mock/vllm_logger.py +212 -0
- tpu_inference/mock/vllm_logging_utils.py +15 -0
- tpu_inference/models/common/model_loader.py +12 -46
- tpu_inference/models/jax/llama3.py +3 -4
- tpu_inference/models/jax/llama_eagle3.py +5 -8
- tpu_inference/models/jax/phi3.py +376 -0
- tpu_inference/models/jax/qwen2.py +2 -3
- tpu_inference/models/jax/qwen2_5_vl.py +50 -165
- tpu_inference/models/jax/qwen3.py +2 -3
- tpu_inference/models/jax/utils/quantization/quantization_utils.py +6 -3
- tpu_inference/models/jax/utils/weight_utils.py +143 -198
- tpu_inference/models/vllm/vllm_model_wrapper.py +14 -32
- tpu_inference/platforms/tpu_platform.py +34 -47
- tpu_inference/runner/compilation_manager.py +60 -145
- tpu_inference/runner/kv_cache.py +2 -2
- tpu_inference/runner/kv_cache_manager.py +18 -17
- tpu_inference/runner/persistent_batch_manager.py +2 -40
- tpu_inference/runner/structured_decoding_manager.py +3 -2
- tpu_inference/runner/tpu_runner.py +135 -283
- tpu_inference/runner/utils.py +2 -2
- tpu_inference/spec_decode/jax/eagle3.py +21 -71
- tpu_inference/tpu_info.py +3 -4
- tpu_inference/utils.py +15 -38
- tpu_inference/worker/tpu_worker.py +26 -163
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/METADATA +3 -4
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/RECORD +63 -61
- tests/test_envs.py +0 -203
- tpu_inference/layers/common/quant_methods.py +0 -8
- tpu_inference/layers/vllm/quantization/mxfp4.py +0 -331
- tpu_inference/models/jax/llama_guard_4.py +0 -361
- /tpu_inference/layers/{common → jax}/binary_search.py +0 -0
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/WHEEL +0 -0
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/top_level.txt +0 -0
tpu_inference/runner/kv_cache_manager.py

```diff
@@ -1,16 +1,15 @@
 import functools
+import math
 from typing import TYPE_CHECKING, Dict, List
 
 import jax
 import jax.numpy as jnp
-import numpy as np
 import vllm.envs as envs
 from jax.sharding import NamedSharding, PartitionSpec
 from torchax.ops.mappings import t2j_dtype
+from vllm.attention import Attention
 from vllm.attention.backends.abstract import AttentionType
-from vllm.attention.layer import Attention
 from vllm.config import get_layers_from_vllm_config
-from vllm.utils.math_utils import cdiv
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheSpec, MLAAttentionSpec,
                                         SlidingWindowSpec)
```
```diff
@@ -176,11 +175,6 @@ class KVCacheManager:
         )
         self.runner.input_batch = new_input_batch
         self.runner.persistent_batch_manager.input_batch = new_input_batch
-        self.runner.block_tables_cpu = [
-            np.zeros((self.runner.max_num_reqs,
-                      cdiv(self.runner.max_model_len, block_size)),
-                     dtype=np.int32) for block_size in block_sizes
-        ]
 
     def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         self.maybe_reinitialize_input_batch(kv_cache_config)
```
```diff
@@ -196,7 +190,7 @@ class KVCacheManager:
         num_blocks = kv_cache_tensor.size // page_size_bytes
         dp_size = self.runner.vllm_config.sharding_config.total_dp_size
         # num_blocks must be a multiple of dp_size
-        num_blocks = (num_blocks
+        num_blocks = math.ceil(num_blocks / dp_size) * dp_size
         # NOTE: we'll multiply the num_kv_heads by 2 in the function
         kv_cache = create_kv_caches(
             num_blocks=num_blocks,
```
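The rounding in the new line is easiest to see with concrete numbers. A minimal sketch (values invented for illustration): `math.ceil(n / k) * k` rounds a count up to the next multiple of `k`, whereas a floor-style `(n // k) * k` would round it down.

```python
import math

num_blocks = 1021   # hypothetical count from kv_cache_tensor.size // page_size_bytes
dp_size = 4

# math.ceil(n / k) * k rounds n *up* to the next multiple of k.
rounded_up = math.ceil(num_blocks / dp_size) * dp_size
assert rounded_up == 1024

# A floor-based variant, (n // k) * k, would round *down* instead.
rounded_down = (num_blocks // dp_size) * dp_size
assert rounded_down == 1020
```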
```diff
@@ -289,8 +283,13 @@ class KVCacheManager:
 
         def _update_layer(cache, slices):
             """The function to apply to each layer's cache and slices."""
-            reshaped_slices = slices.reshape(-1, block_size,
-                                             *slices.shape[1:])
+            reshaped_slices = slices.reshape(-1, 1, block_size,
+                                             *slices.shape[1:])
+            for (i, block_idx) in enumerate(block_numbers):
+                cache = jax.lax.dynamic_update_slice_in_dim(cache,
+                                                            reshaped_slices[i],
+                                                            block_idx,
+                                                            axis=0)
             return cache
 
         return jax.tree.map(_update_layer, kv_caches, kv_cache_slices)
```
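The new `_update_layer` writes each block's worth of slices into the cache with `jax.lax.dynamic_update_slice_in_dim`. A minimal, self-contained sketch of that pattern follows; the shapes are invented for illustration and the real cache layout may differ.

```python
import jax
import jax.numpy as jnp

block_size, num_heads, head_dim = 16, 2, 4
cache = jnp.zeros((8, block_size, num_heads, head_dim))    # 8 blocks
slices = jnp.ones((2 * block_size, num_heads, head_dim))   # 2 blocks' worth
block_numbers = [3, 5]

# Regroup the flat token dimension into per-block updates of shape
# (1, block_size, num_heads, head_dim), matching the cache's leading axis.
reshaped = slices.reshape(-1, 1, block_size, *slices.shape[1:])
for i, block_idx in enumerate(block_numbers):
    # Write one block of slices into the cache at its destination index.
    cache = jax.lax.dynamic_update_slice_in_dim(cache, reshaped[i],
                                                block_idx, axis=0)
```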
```diff
@@ -343,12 +342,16 @@ class KVCacheManager:
         """
         if block_ids == list(range(block_ids[0],
                                    block_ids[0] + len(block_ids))):
-            batched_kv_cache_per_layer = self._jitted_gather_continuous_kv_cache(
-                self.runner.kv_caches, block_ids[0], len(block_ids))
+            with runner_utils.LatencyTracker(
+                    "BatchedGatherKVSlices-for-blocks"):
+                batched_kv_cache_per_layer = self._jitted_gather_continuous_kv_cache(
+                    self.runner.kv_caches, block_ids[0], len(block_ids))
 
         else:
-            batched_kv_cache_per_layer = self._jitted_gather_kv_cache(
-                self.runner.kv_caches, jnp.array(block_ids))
+            with runner_utils.LatencyTracker(
+                    "BatchedGatherKVSlices-for-blocks"):
+                batched_kv_cache_per_layer = self._jitted_gather_kv_cache(
+                    self.runner.kv_caches, jnp.array(block_ids))
         return batched_kv_cache_per_layer
 
     def transfer_kv_cache(self,
```
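This hunk keeps the existing two-path dispatch: a contiguous run of block ids can be fetched with a single dynamic slice, while scattered ids need a gather. A hedged sketch of the idea, with a toy `gather_blocks` helper standing in for the two jitted methods:

```python
import jax
import jax.numpy as jnp

def gather_blocks(cache: jax.Array, block_ids: list[int]) -> jax.Array:
    if block_ids == list(range(block_ids[0], block_ids[0] + len(block_ids))):
        # Contiguous run: one slice along the block axis suffices.
        return jax.lax.dynamic_slice_in_dim(cache, block_ids[0],
                                            len(block_ids), axis=0)
    # Scattered blocks: gather the requested indices.
    return jnp.take(cache, jnp.array(block_ids), axis=0)

cache = jnp.arange(8 * 4).reshape(8, 4)
assert (gather_blocks(cache, [2, 3, 4]) == cache[2:5]).all()
assert (gather_blocks(cache, [0, 5]) == cache[jnp.array([0, 5])]).all()
```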
```diff
@@ -437,7 +440,6 @@ class KVCacheManager:
                     kv_cache_slices,
                     start_block,
                 )
-            jax.block_until_ready(self.runner.kv_caches)
         else:
             with runner_utils.LatencyTracker(
                     f"JittedInsertKVCache-b{len(block_numbers)}"):
```
```diff
@@ -449,7 +451,6 @@ class KVCacheManager:
                     kv_cache_slices,
                     jnp.array(block_numbers),
                 )
-            jax.block_until_ready(self.runner.kv_caches)
 
         logger.debug(
             f"Updated kv cache entries cnt={len(self.runner.kv_caches)}")
```
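Both of the last two hunks drop a `jax.block_until_ready` call after the jitted insert. JAX dispatches device work asynchronously, so removing the barrier lets the host thread continue instead of stalling until the updated caches are materialized. A minimal illustration of what the removed call does:

```python
import jax
import jax.numpy as jnp

x = jnp.ones((1024, 1024))
y = x @ x                  # returns immediately; the result is still in flight
jax.block_until_ready(y)   # only here does the host wait for the device
```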
tpu_inference/runner/persistent_batch_manager.py

```diff
@@ -14,13 +14,12 @@ class PersistentBatchManager:
     def __init__(self, requests: Dict[str, CachedRequestState],
                  input_batch: InputBatch, encoder_cache: Dict[str,
                                                               'jax.Array'],
-                 uses_mrope: bool, model_config
+                 uses_mrope: bool, model_config):
         self.requests = requests
         self.input_batch = input_batch
         self.encoder_cache = encoder_cache
         self.uses_mrope = uses_mrope
         self.model_config = model_config
-        self.is_last_rank = is_last_rank
 
     def _reorder_batch(self, scheduler_output: "VllmSchedulerOutput") -> int:
         """ Reorder the sheduled requests to RPA kernel friendly distribution
```
```diff
@@ -180,35 +179,9 @@ class PersistentBatchManager:
             num_computed_tokens = req_data.num_computed_tokens[i]
             new_block_ids = req_data.new_block_ids[i]
             resumed_from_preemption = req_data.resumed_from_preemption[i]
-            num_output_tokens = req_data.num_output_tokens[i]
 
             # Update the cached states.
             req_state.num_computed_tokens = num_computed_tokens
-            req_index = self.input_batch.req_id_to_index.get(req_id)
-
-            if not self.is_last_rank:
-                # When using PP, the scheduler sends the sampled tokens back,
-                # because there's no direct communication between the first-
-                # stage worker and the last-stage worker.
-                new_token_ids = req_data.new_token_ids[i]
-                # Add the sampled token(s) from the previous step (if any).
-                # This doesn't include "unverified" tokens like spec tokens.
-                num_new_tokens = (num_computed_tokens + len(new_token_ids) -
-                                  req_state.num_tokens)
-                if num_new_tokens == 1:
-                    req_state.output_token_ids.append(new_token_ids[-1])
-                elif num_new_tokens > 0:
-                    req_state.output_token_ids.extend(
-                        new_token_ids[-num_new_tokens:])
-                elif num_output_tokens < len(req_state.output_token_ids):
-                    del req_state.output_token_ids[num_output_tokens:]
-                if req_index is not None:
-                    end_idx = (self.input_batch.num_prompt_tokens[req_index] +
-                               num_output_tokens)
-                    self.input_batch.num_tokens[req_index] = end_idx
-                    self.input_batch.num_tokens_no_spec[req_index] = end_idx
-
-            # Update the block IDs.
             if not resumed_from_preemption:
                 if new_block_ids is not None:
                     # Append the new blocks to the existing block IDs.
```
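The removed pipeline-parallel branch inferred how many freshly sampled tokens the scheduler echoed back from the request's counters. The arithmetic, isolated with invented values:

```python
# All values below are made up for the example.
num_computed_tokens = 10     # tokens the engine has processed so far
new_token_ids = [42, 43]     # sampled tokens echoed back by the scheduler
num_tokens_in_state = 11     # req_state.num_tokens before this update

# Tokens already counted in req_state are subtracted out, leaving only the
# genuinely new suffix of new_token_ids to append.
num_new_tokens = num_computed_tokens + len(new_token_ids) - num_tokens_in_state
assert num_new_tokens == 1
```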
```diff
@@ -221,6 +194,7 @@ class PersistentBatchManager:
                     # Replace the existing block IDs with the new ones.
                     req_state.block_ids = new_block_ids
 
+            req_index = self.input_batch.req_id_to_index.get(req_id)
             if req_index is None:
                 # The request is not in the persistent batch.
                 # The request was either preempted and resumed later, or was not
```
```diff
@@ -235,18 +209,6 @@ class PersistentBatchManager:
                 self.input_batch.block_table.append_row(
                     new_block_ids, req_index)
 
-            # For the last rank, we don't need to update the token_ids_cpu
-            # because the sampled tokens are already cached.
-            if not self.is_last_rank:
-                start_token_index = num_computed_tokens
-                end_token_index = num_computed_tokens + len(new_token_ids)
-                self.input_batch.token_ids_cpu[
-                    req_index,
-                    start_token_index:end_token_index] = new_token_ids
-                self.input_batch.num_tokens_no_spec[
-                    req_index] = end_token_index
-                self.input_batch.num_tokens[req_index] = end_token_index
-
             # Add spec_token_ids to token_ids_cpu.
             spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(
                 req_id, ())
```
tpu_inference/runner/structured_decoding_manager.py

```diff
@@ -61,10 +61,11 @@ class StructuredDecodingManager:
         self.runner.require_structured_out_cpu.fill(0)
 
         sorted_struct_requests = sorted(
-            grammar_output.structured_output_request_ids)
+            grammar_output.structured_output_request_ids.items(),
+            key=lambda item: item[1])
 
         cumulative_mask_idx = 0
-        for req_id in sorted_struct_requests:
+        for req_id, _ in sorted_struct_requests:
             if req_id not in self.runner.input_batch.req_id_to_index:
                 continue
             batch_index = self.runner.input_batch.req_id_to_index[req_id]
```
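The change in this hunk suggests `structured_output_request_ids` is now a mapping from request id to a position rather than a plain collection of ids, so requests are iterated in order of that position. A toy illustration of the new sorting (data invented):

```python
# Hypothetical mapping of request id -> position in the batch.
structured_output_request_ids = {"req-b": 2, "req-a": 0, "req-c": 1}

# Sort the (id, position) pairs by position, then iterate ids in that order.
sorted_struct_requests = sorted(structured_output_request_ids.items(),
                                key=lambda item: item[1])
assert [req_id for req_id, _ in sorted_struct_requests] == [
    "req-a", "req-c", "req-b"
]
```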