PyPI - tpu-inference - Versions diffs - 0.12.0.dev20251213__py3-none-any.whl → 0.13.2.dev20251230__py3-none-any.whl - Mend

tpu-inference 0.12.0.dev20251213py3-none-any.whl → 0.13.2.dev20251230py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tpu-inference might be problematic. Click here for more details.

Files changed (248) hide show

tests/__init__.py +13 -0
tests/core/__init__.py +13 -0
tests/core/test_disagg_utils.py +14 -0
tests/core/test_dp_scheduler.py +650 -768
tests/core/test_init.py +14 -0
tests/distributed/__init__.py +13 -0
tests/distributed/test_distributed_utils.py +120 -0
tests/distributed/test_tpu_connector.py +478 -0
tests/e2e/__init__.py +13 -0
tests/e2e/test_async_scheduler.py +211 -0
tests/e2e/test_data_parallel.py +289 -0
tests/e2e/test_hybrid_kvcache.py +219 -0
tests/e2e/test_local_disagg.py +257 -0
tests/e2e/test_model_loader.py +268 -0
tests/e2e/test_multi_modal_inference.py +111 -0
tests/e2e/test_pipeline_parallel.py +265 -0
tests/e2e/test_runai_model_streamer_loader.py +104 -0
tests/e2e/test_sampling_params.py +269 -0
tests/e2e/test_speculative_decoding.py +311 -0
tests/e2e/test_structured_decoding.py +46 -0
tests/executors/__init__.py +13 -0
tests/executors/test_ray_distributed_executor.py +199 -0
tests/experimental/__init__.py +13 -0
tests/experimental/test_llama3_jax_stashed.py +208 -0
tests/kernels/__init__.py +13 -0
tests/kernels/collectives/__init__.py +13 -0
tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
tests/kernels/fused_moe_v1_test.py +14 -0
tests/kernels/gmm_test.py +205 -0
tests/kernels/mla_v1_test.py +14 -0
tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +14 -0
tests/kernels/ragged_paged_attention_kernel_v3_test.py +14 -0
tests/layers/__init__.py +13 -0
tests/layers/common/__init__.py +13 -0
tests/layers/common/test_attention_interface.py +156 -0
tests/layers/common/test_quantization.py +149 -0
tests/layers/jax/__init__.py +13 -0
tests/layers/jax/attention/__init__.py +13 -0
tests/layers/jax/attention/test_common_attention.py +103 -0
tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
tests/layers/jax/attention/test_llama4_attention.py +135 -0
tests/layers/jax/moe/__init__.py +13 -0
tests/layers/jax/moe/test_deepseek_moe.py +235 -0
tests/layers/jax/sample/__init__.py +13 -0
tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
tests/layers/jax/sample/test_sampling.py +115 -0
tests/layers/jax/sample/test_sampling_metadata.py +254 -0
tests/layers/jax/test_layers.py +155 -0
tests/{test_quantization.py → layers/jax/test_qwix.py} +180 -50
tests/layers/jax/test_rope.py +93 -0
tests/layers/jax/test_sharding.py +159 -0
tests/layers/jax/test_transformer_block.py +152 -0
tests/layers/vllm/__init__.py +13 -0
tests/layers/vllm/test_attention.py +363 -0
tests/layers/vllm/test_awq.py +406 -0
tests/layers/vllm/test_compressed_tensors_moe.py +199 -0
tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +441 -0
tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +443 -0
tests/layers/vllm/test_fp8.py +17 -0
tests/layers/vllm/test_mxfp4.py +320 -0
tests/layers/vllm/test_unquantized.py +662 -0
tests/layers/vllm/utils.py +87 -0
tests/lora/__init__.py +13 -0
tests/lora/conftest.py +14 -0
tests/lora/test_bgmv.py +14 -0
tests/lora/test_layers.py +25 -8
tests/lora/test_lora.py +15 -1
tests/lora/test_lora_perf.py +14 -0
tests/models/__init__.py +13 -0
tests/models/common/__init__.py +13 -0
tests/models/common/test_model_loader.py +455 -0
tests/models/jax/__init__.py +13 -0
tests/models/jax/test_deepseek_v3.py +401 -0
tests/models/jax/test_llama3.py +184 -0
tests/models/jax/test_llama4.py +298 -0
tests/models/jax/test_llama_eagle3.py +197 -0
tests/models/jax/test_llama_guard_4.py +242 -0
tests/models/jax/test_qwen2.py +172 -0
tests/models/jax/test_qwen2_5_vl.py +605 -0
tests/models/jax/test_qwen3.py +169 -0
tests/models/jax/test_weight_loading.py +180 -0
tests/models/jax/utils/__init__.py +13 -0
tests/models/jax/utils/test_multi_modal_utils.py +212 -0
tests/platforms/__init__.py +13 -0
tests/platforms/test_tpu_platform.py +54 -0
tests/runner/__init__.py +13 -0
tests/runner/test_block_table.py +395 -0
tests/runner/test_input_batch.py +226 -0
tests/runner/test_kv_cache.py +220 -0
tests/runner/test_kv_cache_manager.py +498 -0
tests/runner/test_multimodal_manager.py +429 -0
tests/runner/test_persistent_batch_manager.py +84 -0
tests/runner/test_speculative_decoding_manager.py +368 -0
tests/runner/test_structured_decoding_manager.py +220 -0
tests/runner/test_tpu_runner.py +261 -0
tests/runner/test_tpu_runner_dp.py +1099 -0
tests/runner/test_tpu_runner_mesh.py +200 -0
tests/runner/test_utils.py +411 -0
tests/spec_decode/__init__.py +13 -0
tests/spec_decode/test_eagle3.py +311 -0
tests/test_base.py +14 -0
tests/test_tpu_info.py +14 -0
tests/test_utils.py +1 -43
tests/worker/__init__.py +13 -0
tests/worker/tpu_worker_test.py +414 -0
tpu_inference/__init__.py +14 -0
tpu_inference/core/__init__.py +13 -0
tpu_inference/core/sched/__init__.py +13 -0
tpu_inference/core/sched/dp_scheduler.py +372 -56
tpu_inference/distributed/__init__.py +13 -0
tpu_inference/distributed/jax_parallel_state.py +14 -0
tpu_inference/distributed/tpu_connector.py +14 -9
tpu_inference/distributed/utils.py +56 -4
tpu_inference/executors/__init__.py +13 -0
tpu_inference/executors/ray_distributed_executor.py +20 -3
tpu_inference/experimental/__init__.py +13 -0
tpu_inference/experimental/llama3_jax_stashed.py +14 -0
tpu_inference/kernels/__init__.py +13 -0
tpu_inference/kernels/collectives/__init__.py +13 -0
tpu_inference/kernels/flash_attention/__init__.py +13 -0
tpu_inference/kernels/fused_moe/__init__.py +13 -0
tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
tpu_inference/kernels/fused_moe/v1/kernel.py +171 -163
tpu_inference/kernels/megablox/__init__.py +13 -0
tpu_inference/kernels/megablox/common.py +54 -0
tpu_inference/kernels/megablox/gmm.py +646 -0
tpu_inference/kernels/mla/__init__.py +13 -0
tpu_inference/kernels/mla/v1/__init__.py +13 -0
tpu_inference/kernels/mla/v1/kernel.py +20 -26
tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +112 -69
tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +85 -65
tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +374 -194
tpu_inference/kernels/ragged_paged_attention/v3/util.py +13 -0
tpu_inference/layers/__init__.py +13 -0
tpu_inference/layers/common/__init__.py +13 -0
tpu_inference/layers/common/attention_interface.py +26 -19
tpu_inference/layers/common/attention_metadata.py +14 -0
tpu_inference/layers/common/fused_moe_gmm.py +506 -0
tpu_inference/layers/common/quant_methods.py +15 -0
tpu_inference/layers/common/quantization.py +282 -0
tpu_inference/layers/common/sharding.py +22 -3
tpu_inference/layers/common/utils.py +94 -0
tpu_inference/layers/jax/__init__.py +13 -0
tpu_inference/layers/jax/attention/__init__.py +13 -0
tpu_inference/layers/jax/attention/attention.py +19 -6
tpu_inference/layers/jax/attention/deepseek_v3_attention.py +52 -27
tpu_inference/layers/jax/attention/gpt_oss_attention.py +19 -6
tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
tpu_inference/layers/jax/base.py +14 -0
tpu_inference/layers/jax/constants.py +13 -0
tpu_inference/layers/jax/layers.py +14 -0
tpu_inference/layers/jax/misc.py +14 -0
tpu_inference/layers/jax/moe/__init__.py +13 -0
tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
tpu_inference/layers/jax/moe/moe.py +43 -3
tpu_inference/layers/jax/pp_utils.py +53 -0
tpu_inference/layers/jax/rope.py +14 -0
tpu_inference/layers/jax/rope_interface.py +14 -0
tpu_inference/layers/jax/sample/__init__.py +13 -0
tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
tpu_inference/layers/jax/sample/sampling.py +15 -1
tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
tpu_inference/layers/jax/transformer_block.py +14 -0
tpu_inference/layers/vllm/__init__.py +13 -0
tpu_inference/layers/vllm/attention.py +4 -4
tpu_inference/layers/vllm/fused_moe.py +100 -455
tpu_inference/layers/vllm/linear.py +64 -0
tpu_inference/layers/vllm/process_weights/__init__.py +13 -0
tpu_inference/layers/vllm/{sharding.py → process_weights/cleanup_sharding.py} +24 -15
tpu_inference/layers/vllm/process_weights/fused_moe_weights.py +369 -0
tpu_inference/layers/vllm/process_weights/linear_weights.py +174 -0
tpu_inference/layers/vllm/quantization/__init__.py +19 -3
tpu_inference/layers/vllm/quantization/awq.py +96 -82
tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +19 -5
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +119 -132
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +111 -91
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +79 -43
tpu_inference/layers/vllm/quantization/{common.py → configs.py} +38 -26
tpu_inference/layers/vllm/quantization/fp8.py +119 -0
tpu_inference/layers/vllm/quantization/mxfp4.py +133 -220
tpu_inference/layers/vllm/quantization/unquantized.py +154 -253
tpu_inference/lora/__init__.py +13 -0
tpu_inference/lora/torch_lora_ops.py +8 -13
tpu_inference/models/__init__.py +13 -0
tpu_inference/models/common/__init__.py +13 -0
tpu_inference/models/common/model_loader.py +37 -16
tpu_inference/models/jax/__init__.py +13 -0
tpu_inference/models/jax/deepseek_v3.py +113 -124
tpu_inference/models/jax/gpt_oss.py +23 -7
tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
tpu_inference/models/jax/llama3.py +99 -36
tpu_inference/models/jax/llama4.py +14 -0
tpu_inference/models/jax/llama_eagle3.py +14 -0
tpu_inference/models/jax/llama_guard_4.py +15 -1
tpu_inference/models/jax/qwen2.py +17 -2
tpu_inference/models/jax/qwen2_5_vl.py +18 -4
tpu_inference/models/jax/qwen3.py +17 -2
tpu_inference/models/jax/utils/__init__.py +13 -0
tpu_inference/models/jax/utils/file_utils.py +14 -0
tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +85 -24
tpu_inference/models/jax/utils/weight_utils.py +32 -1
tpu_inference/models/vllm/__init__.py +13 -0
tpu_inference/models/vllm/vllm_model_wrapper.py +22 -4
tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
tpu_inference/platforms/__init__.py +14 -0
tpu_inference/platforms/tpu_platform.py +27 -29
tpu_inference/runner/__init__.py +13 -0
tpu_inference/runner/compilation_manager.py +69 -35
tpu_inference/runner/kv_cache.py +14 -0
tpu_inference/runner/kv_cache_manager.py +15 -2
tpu_inference/runner/lora_utils.py +16 -1
tpu_inference/runner/multimodal_manager.py +16 -2
tpu_inference/runner/persistent_batch_manager.py +14 -0
tpu_inference/runner/speculative_decoding_manager.py +14 -0
tpu_inference/runner/structured_decoding_manager.py +14 -0
tpu_inference/runner/tpu_runner.py +30 -10
tpu_inference/spec_decode/__init__.py +13 -0
tpu_inference/spec_decode/jax/__init__.py +13 -0
tpu_inference/spec_decode/jax/eagle3.py +13 -0
tpu_inference/tpu_info.py +14 -0
tpu_inference/utils.py +31 -30
tpu_inference/worker/__init__.py +13 -0
tpu_inference/worker/tpu_worker.py +23 -7
{tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/METADATA +1 -1
tpu_inference-0.13.2.dev20251230.dist-info/RECORD +266 -0
tpu_inference/layers/vllm/linear_common.py +0 -208
tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
tpu_inference-0.12.0.dev20251213.dist-info/RECORD +0 -175
{tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/WHEEL +0 -0
{tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/licenses/LICENSE +0 -0
{tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/top_level.txt +0 -0

tpu_inference/runner/kv_cache.py CHANGED Viewed

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from typing import Any, List
 import jax

tpu_inference/runner/kv_cache_manager.py CHANGED Viewed

@@ -1,5 +1,19 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import functools
-from typing import TYPE_CHECKING, Dict, List
+from typing import TYPE_CHECKING, List
 import jax
 import jax.numpy as jnp
@@ -198,7 +212,6 @@ class KVCacheManager:
         # uniform page size.
         representative_spec = kv_cache_config.kv_cache_groups[0].kv_cache_spec
         page_size_bytes = representative_spec.page_size_bytes
-        self.runner.layer_name_to_kvcache_index: Dict[str, int] = {}
         kv_caches = self.runner.kv_caches
         num_blocks_list = []
         for i, kv_cache_tensor in enumerate(kv_cache_config.kv_cache_tensors):

tpu_inference/runner/lora_utils.py CHANGED Viewed

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from __future__ import annotations
 from typing import TYPE_CHECKING
@@ -7,7 +21,8 @@ from torchax.interop import jax_view
 from vllm.lora.layers.base_linear import BaseLinearLayerWithLoRA
 from vllm.lora.request import LoRARequest
-from tpu_inference.layers.vllm.sharding import update_lora
+from tpu_inference.layers.vllm.process_weights.cleanup_sharding import \
+    update_lora
 if TYPE_CHECKING:
     from tpu_inference.runner.tpu_runner import TPUModelRunner

tpu_inference/runner/multimodal_manager.py CHANGED Viewed

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from typing import TYPE_CHECKING
 import jax
@@ -98,7 +112,7 @@ class MultiModalManager:
         # encoder outputs.
         encoder_outputs = []
         for _, num_items, mm_kwargs_group in group_mm_kwargs_by_modality(
-                mm_kwargs, merge_by_field_config=False):
+                mm_kwargs):
             batched_mm_inputs = mm_kwargs_group
             # Convert torch tensors to numpy arrays that JAX can handle.
             if "pixel_values" in batched_mm_inputs and isinstance(
@@ -134,7 +148,7 @@ class MultiModalManager:
             # 2. A list or tuple (length: num_items) of tensors, each of shape
             # (feature_size, hidden_size) in case the feature size is dynamic
             # depending on the input multimodal items.
-            curr_group_outputs = self.runner.get_multimodal_embeddings_fn(
+            curr_group_outputs = self.runner.embed_multimodal_fn(
                 self.runner.state, image_grid_thw, **batched_mm_inputs)
             sanity_check_mm_encoder_outputs(

tpu_inference/runner/persistent_batch_manager.py CHANGED Viewed

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from typing import Dict
 import jax

tpu_inference/runner/speculative_decoding_manager.py CHANGED Viewed

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from __future__ import annotations
 from dataclasses import dataclass

tpu_inference/runner/structured_decoding_manager.py CHANGED Viewed

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import functools
 from typing import TYPE_CHECKING, Tuple

tpu_inference/runner/tpu_runner.py CHANGED Viewed

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import copy
 import functools
 import logging
@@ -268,6 +282,9 @@ class TPUModelRunner(KVConnectorModelRunnerMixin, LoRAModelRunnerMixin):
         self._substitute_placeholder_token_fn = _substitute_placeholder_token
         self.execute_model_state: ExecuteModelState | None = None
+        self.kv_caches: list[jax.Array] = []
+        self.layer_name_to_kvcache_index: dict[str, int] = {}
     def _init_random(self):
         if self.model_config.seed is None:
             self.model_config.seed = 0
@@ -494,10 +511,10 @@ class TPUModelRunner(KVConnectorModelRunnerMixin, LoRAModelRunnerMixin):
         multimodal_fns = multimodal_fns or {}
         self.precompile_vision_encoder_fn = multimodal_fns.get(
             "precompile_vision_encoder_fn", None)
-        self.get_multimodal_embeddings_fn = multimodal_fns.get(
-            "get_multimodal_embeddings_fn", None)
-        self.get_input_embeddings_fn = multimodal_fns.get(
-            "get_input_embeddings_fn", None)
+        self.embed_multimodal_fn = multimodal_fns.get("embed_multimodal_fn",
+                                                      None)
+        self.embed_input_ids_fn = multimodal_fns.get("embed_input_ids_fn",
+                                                     None)
         self.get_mrope_input_positions_fn = multimodal_fns.get(
             "get_mrope_input_positions_fn", None)
@@ -509,7 +526,7 @@ class TPUModelRunner(KVConnectorModelRunnerMixin, LoRAModelRunnerMixin):
             jax.random.key(self.model_config.seed)).params()
         self.is_multimodal_model = (
             self.model_config.is_multimodal_model
-            and self.get_multimodal_embeddings_fn is not None and hasattr(
+            and self.embed_multimodal_fn is not None and hasattr(
                 self.model_config.hf_config, "architectures"
             )  #TODO: Remove Llama Guard 4 specific condition once the LG4 Vision portion is implemented
             and len(self.model_config.hf_config.architectures) >= 1
@@ -525,10 +542,12 @@ class TPUModelRunner(KVConnectorModelRunnerMixin, LoRAModelRunnerMixin):
     def get_kv_cache_spec(self):
         return self.kv_cache_manager.get_kv_cache_spec()
-    def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
+    def initialize_kv_cache(self,
+                            kv_cache_config: KVCacheConfig,
+                            topology_order_id: int = 0) -> None:
+        self.topology_order_id = topology_order_id
         self.kv_cache_config = kv_cache_config
         self.use_hybrid_kvcache = len(kv_cache_config.kv_cache_groups) > 1
-        self.kv_caches = []
         self.kv_cache_manager.initialize_kv_cache(kv_cache_config)
         if has_kv_transfer_group():
             get_kv_transfer_group().register_runner(self)
@@ -810,7 +829,7 @@ class TPUModelRunner(KVConnectorModelRunnerMixin, LoRAModelRunnerMixin):
         sharding = None
         if self.dp_size > 1:
             sharding = NamedSharding(self.mesh,
-                                     PartitionSpec(ShardingAxisName.ATTN_DATA))
+                                     PartitionSpec(ShardingAxisName.MLP_DATA))
         tpu_sampling_metadata = TPUSupportedSamplingMetadata.from_input_batch(
             self.mesh, self.input_batch, padded_num_reqs, sharding=sharding)
@@ -1373,7 +1392,8 @@ class TPUModelRunner(KVConnectorModelRunnerMixin, LoRAModelRunnerMixin):
             self.mesh,
             self.input_batch,
             padded_num_reqs,
-            sharding=data_parallel_attn_sharding,
+            sharding=NamedSharding(self.mesh,
+                                   PartitionSpec(ShardingAxisName.MLP_DATA)),
         )
         if self.uses_mrope:
             positions = mrope_positions
@@ -1663,7 +1683,7 @@ class TPUModelRunner(KVConnectorModelRunnerMixin, LoRAModelRunnerMixin):
     def _get_input_ids_embeds(self, input_ids: jax.Array,
                               mm_embeds: list[jax.Array]):
         if self.is_multimodal_model:
-            inputs_embeds = self.get_input_embeddings_fn(
+            inputs_embeds = self.embed_input_ids_fn(
                 self.state,
                 input_ids,
                 mm_embeds,

tpu_inference/spec_decode/__init__.py CHANGED Viewed

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tpu_inference/spec_decode/jax/__init__.py CHANGED Viewed

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tpu_inference/spec_decode/jax/eagle3.py CHANGED Viewed

@@ -1,3 +1,16 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """Implements the Eagle3 proposer for speculative decoding on JAX/TPU."""
 import functools
 from dataclasses import replace

tpu_inference/tpu_info.py CHANGED Viewed

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import glob
 import os

tpu_inference/utils.py CHANGED Viewed

@@ -3,7 +3,7 @@ import time
 from collections import defaultdict
 from collections.abc import Sequence
 from functools import wraps
-from typing import Any, Callable, List, Tuple
+from typing import Any, Callable, List, Tuple, Union
 import jax
 import jax.numpy as jnp
@@ -283,35 +283,6 @@ def get_hash_fn_by_name(hash_fn_name: str) -> Callable[[Any], bytes]:
     return utils.hashing.get_hash_fn_by_name(hash_fn_name)
-def quantize_kv(key: jax.Array, value: jax.Array,
-                kv_cache_quantized_dtype: jnp.dtype, k_scale: float,
-                v_scale: float) -> Tuple[jax.Array, jax.Array]:
-    """
-        Quantize the key and value tensors.
-        Args:
-            key: The key tensor to quantize.
-            value: The value tensor to quantize.
-            kv_cache_quantized_dtype: The dtype to quantize the key and value tensors to.
-            q_scale: The scale to quantize the key and value tensors by.
-            k_scale: The scale to quantize the key tensor by.
-            v_scale: The scale to quantize the value tensor by.
-        Returns:
-            Tuple[jax.Array, jax.Array]: The quantized key and value tensors.
-        """
-    dtype_info = jnp.finfo(kv_cache_quantized_dtype)
-    minval, maxval = float(dtype_info.min), float(dtype_info.max)
-    key = key.astype(jnp.float32) / k_scale
-    key = jnp.clip(key, minval, maxval)
-    key = key.astype(kv_cache_quantized_dtype)
-    value = value.astype(jnp.float32) / v_scale
-    value = jnp.clip(value, minval, maxval)
-    value = value.astype(kv_cache_quantized_dtype)
-    return key, value
 def get_jax_dtype_from_str_dtype(str_dtype: str) -> jnp.dtype:
     """
     Get the JAX dtype from a string dtype.
@@ -326,6 +297,36 @@ def get_jax_dtype_from_str_dtype(str_dtype: str) -> jnp.dtype:
     return to_jax_dtype(str_dtype)
+def get_mesh_shape_product(
+    mesh: Mesh,
+    axes: Union[str, list[str], None],
+) -> int:
+    """
+    Get the product of mesh dimensions for one or more axes.
+    Examples:
+        # Single axis (defaults to 1 if not present)
+        get_mesh_shape_product(mesh, "model")
+        # Multiple axes - computes product of their sizes
+        get_mesh_shape_product(mesh, ["model", "attn_dp"])
+        # None means no sharding on this dimension
+        get_mesh_shape_product(mesh, None)  # returns 1
+    """
+    if axes is None:
+        return 1
+    if isinstance(axes, str):
+        axes = [axes]
+    product = 1
+    for axis in axes:
+        product *= mesh.shape.get(axis, 1)
+    return product
 def time_function(func):
     """
     A decorator to measure the execution time of a function.

tpu_inference/worker/__init__.py CHANGED Viewed

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tpu_inference/worker/tpu_worker.py CHANGED Viewed

@@ -26,8 +26,8 @@ from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
 from tpu_inference import envs, utils
 from tpu_inference.distributed import jax_parallel_state
-from tpu_inference.distributed.utils import (get_host_ip, get_kv_transfer_port,
-                                             get_node_id)
+from tpu_inference.distributed.utils import (get_device_topology_order_id,
+                                             get_host_ip, get_kv_transfer_port)
 from tpu_inference.layers.common.sharding import ShardingConfigManager
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.jax_intermediate_tensor import \
@@ -232,9 +232,16 @@ class TPUWorker:
         is_first_rank = True
         is_last_rank = True
+        self.topology_order_id = self.rank
         if self.parallel_config.pipeline_parallel_size > 1:
             is_first_rank = self.rank == 0
             is_last_rank = self.rank == self.pp_config.pp_world_size - 1
+        else:
+            # topology_order_id is used to determine the KV cache
+            # mapping between P/D workers
+            if multihost_backend == "ray":
+                self.topology_order_id = get_device_topology_order_id(
+                    jax.local_devices(), jax.devices())
         self.model_runner = TPUModelRunner(self.vllm_config, self.devices,
                                            self.rank, is_first_rank,
@@ -243,9 +250,12 @@ class TPUWorker:
                     f"rank={self.rank} | "
                     f"is_first_rank={is_first_rank} | "
                     f"is_last_rank={is_last_rank} | "
-                    f"node_id={get_node_id()} | "
+                    f"topology_order_id={self.topology_order_id} | "
                     f"is_driver_worker={self.is_driver_worker} | "
-                    f"hbm={utils.hbm_usage_gb(self.devices)}GiB")
+                    f"hbm={utils.hbm_usage_gb(self.devices)}GiB |"
+                    f"self.devices={self.devices} | "
+                    f"total devices={jax.devices()} | "
+                    f"local_devices={jax.local_devices()}")
         vllm_utils.report_usage_stats(self.vllm_config)
     def initialize_pp_transfer_connect(self):
@@ -420,13 +430,19 @@ class TPUWorker:
         kv_cache_config: KVCacheConfig,
     ) -> None:
         """Allocate GPU KV cache with the specified kv_cache_config."""
-        self.model_runner.initialize_kv_cache(kv_cache_config)
+        # Precompile functions with large vocab_size tensors before allocating KV cache to avoid OOM
+        if not (envs.SKIP_JAX_PRECOMPILE or
+                (hasattr(self.model_runner.model_config, "enforce_eager")
+                 and self.model_runner.model_config.enforce_eager)):
+            self.model_runner.compilation_manager._precompile_sampling()
+            self.model_runner.compilation_manager._precompile_gather_logprobs()
+        self.model_runner.initialize_kv_cache(kv_cache_config,
+                                              self.topology_order_id)
     def get_node_kv_ip_port(self) -> tuple[int, str, int]:
-        node_id = get_node_id()
         ip = get_host_ip()
         port = get_kv_transfer_port()
-        return (int(node_id), ip, int(port))
+        return (int(self.topology_order_id), ip, int(port))
     def check_health(self) -> None:
         # worker will always be healthy as long as it's running.

{tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tpu_inference
-Version: 0.12.0.dev20251213
+Version: 0.13.2.dev20251230
 Author: tpu_inference Contributors
 Classifier: Development Status :: 3 - Alpha
 Classifier: Intended Audience :: Developers

tpu-inference 0.12.0.dev20251213__py3-none-any.whl → 0.13.2.dev20251230__py3-none-any.whl

Potentially problematic release.

tpu-inference 0.12.0.dev20251213py3-none-any.whl → 0.13.2.dev20251230py3-none-any.whl