PyPI - tpu-inference - Versions diffs - 0.11.1.dev202511220812__py3-none-any.whl → 0.13.2.dev20251230__py3-none-any.whl - Mend

tpu-inference 0.11.1.dev202511220812__py3-none-any.whl → 0.13.2.dev20251230__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tpu-inference might be problematic. Click here for more details.

Files changed (257) hide show

tests/__init__.py +13 -0
tests/core/__init__.py +13 -0
tests/core/test_disagg_utils.py +14 -0
tests/core/test_dp_scheduler.py +650 -768
tests/core/test_init.py +14 -0
tests/distributed/__init__.py +13 -0
tests/distributed/test_distributed_utils.py +120 -0
tests/distributed/test_tpu_connector.py +478 -0
tests/e2e/__init__.py +13 -0
tests/e2e/test_async_scheduler.py +211 -0
tests/e2e/test_data_parallel.py +289 -0
tests/e2e/test_hybrid_kvcache.py +219 -0
tests/e2e/test_local_disagg.py +257 -0
tests/e2e/test_model_loader.py +268 -0
tests/e2e/test_multi_modal_inference.py +111 -0
tests/e2e/test_pipeline_parallel.py +265 -0
tests/e2e/test_runai_model_streamer_loader.py +104 -0
tests/e2e/test_sampling_params.py +269 -0
tests/e2e/test_speculative_decoding.py +311 -0
tests/e2e/test_structured_decoding.py +46 -0
tests/executors/__init__.py +13 -0
tests/executors/test_ray_distributed_executor.py +199 -0
tests/experimental/__init__.py +13 -0
tests/experimental/test_llama3_jax_stashed.py +208 -0
tests/kernels/__init__.py +13 -0
tests/kernels/collectives/__init__.py +13 -0
tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
tests/kernels/fused_moe_v1_test.py +317 -34
tests/kernels/gmm_test.py +205 -0
tests/kernels/mla_v1_test.py +143 -41
tests/kernels/quantized_matmul_kernel_test.py +2 -34
tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +17 -1
tests/kernels/ragged_paged_attention_kernel_v3_test.py +17 -1
tests/layers/__init__.py +13 -0
tests/layers/common/__init__.py +13 -0
tests/layers/common/test_attention_interface.py +156 -0
tests/layers/common/test_quantization.py +149 -0
tests/layers/jax/__init__.py +13 -0
tests/layers/jax/attention/__init__.py +13 -0
tests/layers/jax/attention/test_common_attention.py +103 -0
tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
tests/layers/jax/attention/test_llama4_attention.py +135 -0
tests/layers/jax/moe/__init__.py +13 -0
tests/layers/jax/moe/test_deepseek_moe.py +235 -0
tests/layers/jax/sample/__init__.py +13 -0
tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
tests/layers/jax/sample/test_sampling.py +115 -0
tests/layers/jax/sample/test_sampling_metadata.py +254 -0
tests/layers/jax/test_layers.py +155 -0
tests/{test_quantization.py → layers/jax/test_qwix.py} +183 -50
tests/layers/jax/test_rope.py +93 -0
tests/layers/jax/test_sharding.py +159 -0
tests/layers/jax/test_transformer_block.py +152 -0
tests/layers/vllm/__init__.py +13 -0
tests/layers/vllm/test_attention.py +363 -0
tests/layers/vllm/test_awq.py +406 -0
tests/layers/vllm/test_compressed_tensors_moe.py +199 -0
tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +441 -0
tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +443 -0
tests/layers/vllm/test_fp8.py +17 -0
tests/layers/vllm/test_mxfp4.py +320 -0
tests/layers/vllm/test_unquantized.py +662 -0
tests/layers/vllm/utils.py +87 -0
tests/lora/__init__.py +13 -0
tests/lora/conftest.py +14 -0
tests/lora/test_bgmv.py +14 -0
tests/lora/test_layers.py +26 -6
tests/lora/test_lora.py +15 -1
tests/lora/test_lora_perf.py +67 -0
tests/models/__init__.py +13 -0
tests/models/common/__init__.py +13 -0
tests/models/common/test_model_loader.py +455 -0
tests/models/jax/__init__.py +13 -0
tests/models/jax/test_deepseek_v3.py +401 -0
tests/models/jax/test_llama3.py +184 -0
tests/models/jax/test_llama4.py +298 -0
tests/models/jax/test_llama_eagle3.py +197 -0
tests/models/jax/test_llama_guard_4.py +242 -0
tests/models/jax/test_qwen2.py +172 -0
tests/models/jax/test_qwen2_5_vl.py +605 -0
tests/models/jax/test_qwen3.py +169 -0
tests/models/jax/test_weight_loading.py +180 -0
tests/models/jax/utils/__init__.py +13 -0
tests/models/jax/utils/test_multi_modal_utils.py +212 -0
tests/platforms/__init__.py +13 -0
tests/platforms/test_tpu_platform.py +54 -0
tests/runner/__init__.py +13 -0
tests/runner/test_block_table.py +395 -0
tests/runner/test_input_batch.py +226 -0
tests/runner/test_kv_cache.py +220 -0
tests/runner/test_kv_cache_manager.py +498 -0
tests/runner/test_multimodal_manager.py +429 -0
tests/runner/test_persistent_batch_manager.py +84 -0
tests/runner/test_speculative_decoding_manager.py +368 -0
tests/runner/test_structured_decoding_manager.py +220 -0
tests/runner/test_tpu_runner.py +261 -0
tests/runner/test_tpu_runner_dp.py +1099 -0
tests/runner/test_tpu_runner_mesh.py +200 -0
tests/runner/test_utils.py +411 -0
tests/spec_decode/__init__.py +13 -0
tests/spec_decode/test_eagle3.py +311 -0
tests/test_base.py +14 -0
tests/test_envs.py +110 -12
tests/test_tpu_info.py +14 -0
tests/test_utils.py +2 -45
tests/worker/__init__.py +13 -0
tests/worker/tpu_worker_test.py +414 -0
tpu_inference/__init__.py +14 -0
tpu_inference/core/__init__.py +13 -0
tpu_inference/core/sched/__init__.py +13 -0
tpu_inference/core/sched/dp_scheduler.py +372 -56
tpu_inference/distributed/__init__.py +13 -0
tpu_inference/distributed/jax_parallel_state.py +14 -0
tpu_inference/distributed/tpu_connector.py +15 -10
tpu_inference/distributed/utils.py +56 -4
tpu_inference/envs.py +92 -8
tpu_inference/executors/__init__.py +13 -0
tpu_inference/executors/ray_distributed_executor.py +25 -4
tpu_inference/experimental/__init__.py +13 -0
tpu_inference/experimental/llama3_jax_stashed.py +14 -0
tpu_inference/kernels/__init__.py +13 -0
tpu_inference/kernels/collectives/__init__.py +13 -0
tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
tpu_inference/kernels/flash_attention/__init__.py +13 -0
tpu_inference/kernels/fused_moe/__init__.py +13 -0
tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
tpu_inference/kernels/fused_moe/v1/kernel.py +807 -230
tpu_inference/kernels/megablox/__init__.py +13 -0
tpu_inference/kernels/megablox/common.py +54 -0
tpu_inference/kernels/megablox/gmm.py +646 -0
tpu_inference/kernels/mla/__init__.py +13 -0
tpu_inference/kernels/mla/v1/__init__.py +13 -0
tpu_inference/kernels/mla/v1/kernel.py +117 -145
tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +194 -101
tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +218 -137
tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +376 -195
tpu_inference/kernels/ragged_paged_attention/v3/util.py +15 -1
tpu_inference/layers/__init__.py +13 -0
tpu_inference/layers/common/__init__.py +13 -0
tpu_inference/layers/common/attention_interface.py +25 -12
tpu_inference/layers/common/attention_metadata.py +14 -0
tpu_inference/layers/common/fused_moe_gmm.py +506 -0
tpu_inference/layers/common/quant_methods.py +15 -0
tpu_inference/layers/common/quantization.py +282 -0
tpu_inference/layers/common/sharding.py +32 -9
tpu_inference/layers/common/utils.py +94 -0
tpu_inference/layers/jax/__init__.py +13 -0
tpu_inference/layers/jax/attention/__init__.py +13 -0
tpu_inference/layers/jax/attention/attention.py +19 -6
tpu_inference/layers/jax/attention/deepseek_v3_attention.py +270 -77
tpu_inference/layers/jax/attention/gpt_oss_attention.py +24 -11
tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
tpu_inference/layers/jax/base.py +14 -0
tpu_inference/layers/jax/constants.py +13 -0
tpu_inference/layers/jax/layers.py +14 -0
tpu_inference/layers/jax/misc.py +14 -0
tpu_inference/layers/jax/moe/__init__.py +13 -0
tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
tpu_inference/layers/jax/moe/moe.py +43 -3
tpu_inference/layers/jax/pp_utils.py +53 -0
tpu_inference/layers/jax/rope.py +14 -0
tpu_inference/layers/jax/rope_interface.py +14 -0
tpu_inference/layers/jax/sample/__init__.py +13 -0
tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
tpu_inference/layers/jax/sample/sampling.py +15 -1
tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
tpu_inference/layers/jax/transformer_block.py +14 -0
tpu_inference/layers/vllm/__init__.py +13 -0
tpu_inference/layers/vllm/attention.py +4 -4
tpu_inference/layers/vllm/fused_moe.py +101 -494
tpu_inference/layers/vllm/linear.py +64 -0
tpu_inference/layers/vllm/process_weights/__init__.py +13 -0
tpu_inference/layers/vllm/{sharding.py → process_weights/cleanup_sharding.py} +24 -15
tpu_inference/layers/vllm/process_weights/fused_moe_weights.py +369 -0
tpu_inference/layers/vllm/process_weights/linear_weights.py +174 -0
tpu_inference/layers/vllm/quantization/__init__.py +19 -3
tpu_inference/layers/vllm/quantization/awq.py +96 -82
tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +23 -8
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +172 -176
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +111 -91
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +79 -43
tpu_inference/layers/vllm/quantization/{common.py → configs.py} +42 -25
tpu_inference/layers/vllm/quantization/fp8.py +119 -0
tpu_inference/layers/vllm/quantization/mxfp4.py +137 -178
tpu_inference/layers/vllm/quantization/unquantized.py +157 -233
tpu_inference/lora/__init__.py +13 -0
tpu_inference/lora/torch_lora_ops.py +8 -13
tpu_inference/models/__init__.py +13 -0
tpu_inference/models/common/__init__.py +13 -0
tpu_inference/models/common/model_loader.py +112 -35
tpu_inference/models/jax/__init__.py +13 -0
tpu_inference/models/jax/deepseek_v3.py +267 -157
tpu_inference/models/jax/gpt_oss.py +26 -10
tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
tpu_inference/models/jax/llama3.py +99 -36
tpu_inference/models/jax/llama4.py +14 -0
tpu_inference/models/jax/llama_eagle3.py +18 -5
tpu_inference/models/jax/llama_guard_4.py +15 -1
tpu_inference/models/jax/qwen2.py +17 -2
tpu_inference/models/jax/qwen2_5_vl.py +179 -51
tpu_inference/models/jax/qwen3.py +17 -2
tpu_inference/models/jax/utils/__init__.py +13 -0
tpu_inference/models/jax/utils/file_utils.py +14 -0
tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +92 -32
tpu_inference/models/jax/utils/weight_utils.py +234 -155
tpu_inference/models/vllm/__init__.py +13 -0
tpu_inference/models/vllm/vllm_model_wrapper.py +32 -8
tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
tpu_inference/platforms/__init__.py +14 -0
tpu_inference/platforms/tpu_platform.py +51 -72
tpu_inference/runner/__init__.py +13 -0
tpu_inference/runner/compilation_manager.py +180 -80
tpu_inference/runner/kv_cache.py +54 -20
tpu_inference/runner/kv_cache_manager.py +55 -33
tpu_inference/runner/lora_utils.py +16 -1
tpu_inference/runner/multimodal_manager.py +16 -2
tpu_inference/runner/persistent_batch_manager.py +54 -2
tpu_inference/runner/speculative_decoding_manager.py +14 -0
tpu_inference/runner/structured_decoding_manager.py +16 -3
tpu_inference/runner/tpu_runner.py +124 -61
tpu_inference/runner/utils.py +2 -2
tpu_inference/spec_decode/__init__.py +13 -0
tpu_inference/spec_decode/jax/__init__.py +13 -0
tpu_inference/spec_decode/jax/eagle3.py +84 -22
tpu_inference/tpu_info.py +14 -0
tpu_inference/utils.py +72 -44
tpu_inference/worker/__init__.py +13 -0
tpu_inference/worker/tpu_worker.py +66 -52
{tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/METADATA +8 -9
tpu_inference-0.13.2.dev20251230.dist-info/RECORD +266 -0
tpu_inference/layers/vllm/linear_common.py +0 -186
tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
tpu_inference-0.11.1.dev202511220812.dist-info/RECORD +0 -174
{tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/WHEEL +0 -0
{tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/licenses/LICENSE +0 -0
{tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/top_level.txt +0 -0

tpu_inference/distributed/utils.py CHANGED Viewed

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 from vllm.utils.network_utils import get_ip
@@ -54,7 +68,45 @@ def get_side_channel_port() -> str:
     return port
-def get_node_id() -> int:
-    # TODO(xiang): Is it possible to get this from a pre-defiend env?
-    id = os.getenv("TPU_NODE_ID", 0)
-    return int(id)
+def get_device_topology_order_id(local_devices, global_devices) -> int:
+    """
+    Calculates the topology order ID for the local device set within the global topology.
+    This function determines the rank of the current host/process based on the
+    coordinate of its TPU devices relative to all devices in the topology.
+    Args:
+        local_devices: A list of TpuDevice objects available to the current process.
+        global_devices: A list of all TpuDevice objects in the global topology.
+    Returns:
+        The topology order ID (rank) of the local devices.
+    """
+    if not local_devices:
+        raise ValueError("local_devices cannot be empty")
+    if not global_devices:
+        raise ValueError("global_devices cannot be empty")
+    # 1. Find the 'anchor' (minimum coordinate) for the local devices.
+    #    This represents the physical top-left corner of the local machine.
+    local_anchor = min(d.coords for d in local_devices)
+    # 2. Group global devices by process to find the anchor for EVERY process.
+    process_anchors = {}
+    for d in global_devices:
+        pid = d.process_index
+        # Update the minimum coordinate found for this process so far
+        if pid not in process_anchors or d.coords < process_anchors[pid]:
+            process_anchors[pid] = d.coords
+    # 3. Sort the unique anchors to establish the canonical topology order.
+    #    Tuples (x, y, z) sort lexicographically (x first, then y, then z).
+    sorted_anchors = sorted(process_anchors.values())
+    # 4. Return the index (rank) of the local anchor in the sorted list.
+    try:
+        return sorted_anchors.index(local_anchor)
+    except ValueError:
+        raise ValueError(
+            f"Local devices: {local_devices} do not exist in the global device: {global_devices} list."
+        )

tpu_inference/envs.py CHANGED Viewed

@@ -15,13 +15,88 @@ if TYPE_CHECKING:
     PREFILL_SLICES: str = ""
     DECODE_SLICES: str = ""
     SKIP_JAX_PRECOMPILE: bool = False
-    MODEL_IMPL_TYPE: str = "flax_nnx"
+    VLLM_XLA_CHECK_RECOMPILATION: bool = False
+    MODEL_IMPL_TYPE: str = "auto"
     NEW_MODEL_DESIGN: bool = False
     PHASED_PROFILING_DIR: str = ""
     PYTHON_TRACER_LEVEL: int = 1
     USE_MOE_EP_KERNEL: bool = False
+    NUM_SLICES: int = 1
     RAY_USAGE_STATS_ENABLED: str = "0"
     VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "shm"
+    ENABLE_QUANTIZED_MATMUL_KERNEL: bool = False
+def env_with_choices(
+    env_name: str,
+    default: str | None,
+    choices: list[str] | Callable[[], list[str]],
+    case_sensitive: bool = True,
+) -> Callable[[], str | None]:
+    """
+    Create a lambda that validates environment variable against allowed choices
+    Args:
+        env_name: Name of the environment variable
+        default: Default value if not set (can be None)
+        choices: List of valid string options or callable that returns list
+        case_sensitive: Whether validation should be case sensitive
+    Returns:
+        Lambda function for environment_variables dict
+    """
+    def _get_validated_env() -> str | None:
+        value = os.getenv(env_name)
+        if value is None:
+            return default
+        # Resolve choices if it's a callable (for lazy loading)
+        actual_choices = choices() if callable(choices) else choices
+        if not case_sensitive:
+            check_value = value.lower()
+            check_choices = [choice.lower() for choice in actual_choices]
+        else:
+            check_value = value
+            check_choices = actual_choices
+        if check_value not in check_choices:
+            raise ValueError(f"Invalid value '{value}' for {env_name}. "
+                             f"Valid options: {actual_choices}.")
+        return value
+    return _get_validated_env
+def env_bool(env_name: str, default: bool = False) -> Callable[[], bool]:
+    """
+    Accepts both numeric strings ("0", "1") and boolean strings
+    ("true", "false", "True", "False").
+    Args:
+        env_name: Name of the environment variable
+        default: Default boolean value if not set
+    """
+    def _get_bool_env() -> bool:
+        value = os.getenv(env_name)
+        if value is None or value == "":
+            return default
+        value_lower = value.lower()
+        if value_lower in ("true", "1"):
+            return True
+        elif value_lower in ("false", "0"):
+            return False
+        else:
+            raise ValueError(
+                f"Invalid boolean value '{value}' for {env_name}. "
+                f"Valid options: '0', '1', 'true', 'false', 'True', 'False'.")
+    return _get_bool_env
 environment_variables: dict[str, Callable[[], Any]] = {
     # JAX platform selection (e.g., "tpu", "cpu", "proxy")
@@ -38,7 +113,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     lambda: os.getenv("TPU_WORKER_ID", None),
     # Backend for multi-host communication on TPU
     "TPU_MULTIHOST_BACKEND":
-    lambda: os.getenv("TPU_MULTIHOST_BACKEND", "").lower(),
+    env_with_choices("TPU_MULTIHOST_BACKEND", "", ["ray"]),
     # Slice configuration for disaggregated prefill workers
     "PREFILL_SLICES":
     lambda: os.getenv("PREFILL_SLICES", ""),
@@ -47,28 +122,37 @@ environment_variables: dict[str, Callable[[], Any]] = {
     lambda: os.getenv("DECODE_SLICES", ""),
     # Skip JAX precompilation step during initialization
     "SKIP_JAX_PRECOMPILE":
-    lambda: bool(int(os.getenv("SKIP_JAX_PRECOMPILE", "0"))),
+    env_bool("SKIP_JAX_PRECOMPILE", default=False),
+    # Check for XLA recompilation during execution
+    "VLLM_XLA_CHECK_RECOMPILATION":
+    env_bool("VLLM_XLA_CHECK_RECOMPILATION", default=False),
     # Model implementation type (e.g., "flax_nnx")
     "MODEL_IMPL_TYPE":
-    lambda: os.getenv("MODEL_IMPL_TYPE", "flax_nnx").lower(),
+    env_with_choices("MODEL_IMPL_TYPE", "auto",
+                     ["auto", "vllm", "flax_nnx", "jetpack"]),
     # Enable new experimental model design
     "NEW_MODEL_DESIGN":
-    lambda: bool(int(os.getenv("NEW_MODEL_DESIGN", "0"))),
+    env_bool("NEW_MODEL_DESIGN", default=False),
     # Directory to store phased profiling output
     "PHASED_PROFILING_DIR":
     lambda: os.getenv("PHASED_PROFILING_DIR", ""),
     # Python tracer level for profiling
     "PYTHON_TRACER_LEVEL":
-    lambda: int(os.getenv("PYTHON_TRACER_LEVEL", "1")),
+    lambda: int(os.getenv("PYTHON_TRACER_LEVEL") or "1"),
     # Use custom expert-parallel kernel for MoE (Mixture of Experts)
     "USE_MOE_EP_KERNEL":
-    lambda: bool(int(os.getenv("USE_MOE_EP_KERNEL", "0"))),
+    env_bool("USE_MOE_EP_KERNEL", default=False),
+    # Number of TPU slices for multi-slice mesh
+    "NUM_SLICES":
+    lambda: int(os.getenv("NUM_SLICES") or "1"),
     # Enable/disable Ray usage statistics collection
     "RAY_USAGE_STATS_ENABLED":
     lambda: os.getenv("RAY_USAGE_STATS_ENABLED", "0"),
     # Ray compiled DAG channel type for TPU
     "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE":
-    lambda: os.getenv("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "shm"),
+    env_with_choices("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "shm", ["shm"]),
+    "ENABLE_QUANTIZED_MATMUL_KERNEL":
+    lambda: bool(int(os.getenv("ENABLE_QUANTIZED_MATMUL_KERNEL") or "0")),
 }

tpu_inference/executors/__init__.py CHANGED Viewed

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tpu_inference/executors/ray_distributed_executor.py CHANGED Viewed

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 from array import array
 from typing import Any, Dict, List, Optional
@@ -6,7 +20,7 @@ import ray
 import vllm.envs as envs
 from ray.util.placement_group import PlacementGroup
 from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
-from vllm.multimodal.inputs import MultiModalKwargs
+from vllm.multimodal.inputs import MultiModalKwargsItem
 from vllm.platforms import current_platform
 from vllm.ray.ray_env import get_env_vars_to_copy
 from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE
@@ -39,7 +53,7 @@ logger = init_logger(__name__)
 def _encode_hook(obj: Any) -> Any:
-    """Custom msgspec enc hook that supports array types and MultiModalKwargs.
+    """Custom msgspec enc hook that supports array types and MultiModalKwargsItem.
     See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
     """
@@ -48,7 +62,7 @@ def _encode_hook(obj: Any) -> Any:
             f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. "
             f"Given array has a type code of {obj.typecode}.")
         return obj.tobytes()
-    if isinstance(obj, MultiModalKwargs):
+    if isinstance(obj, MultiModalKwargsItem):
         return dict(obj)
@@ -136,11 +150,18 @@ class RayDistributedExecutor(RayDistributedExecutorV1):
         pp_size = self.parallel_config.pipeline_parallel_size
         placement_group_specs: List[Dict[str, float]] = []
+        ray_nodes = ray.nodes()
+        logger.info(f"RayDistributedExecutor | ray_nodes={ray_nodes}")
         if pp_size == 1:
             placement_group_specs = [{
                 device_str: node['Resources'][device_str]
-            } for node in ray.nodes()]
+            } for node in ray_nodes]
         else:
+            assert pp_size == len(
+                ray_nodes
+            ), f"Cannot use PP across hosts, please set --pipeline-parallel-size to 1 or {len(ray_nodes)}"
             num_devices_per_pp_rank = self.vllm_config.sharding_config.total_devices
             placement_group_specs = [{
                 device_str: num_devices_per_pp_rank

tpu_inference/experimental/__init__.py CHANGED Viewed

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tpu_inference/experimental/llama3_jax_stashed.py CHANGED Viewed

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 # TODO: Update documentation
 from typing import List, Optional, Tuple

tpu_inference/kernels/__init__.py CHANGED Viewed

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tpu_inference/kernels/collectives/__init__.py CHANGED Viewed

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tpu_inference/kernels/collectives/all_gather_matmul.py CHANGED Viewed

@@ -540,12 +540,16 @@ def get_vmem_estimate_bytes(
     """Returns the total vmem bytes used by the kernel."""
     m_per_device = m // tp_size
     n_per_device = n // tp_size
-    y_vmem_bytes = n_per_device * k * dtypes.bit_width(y_dtype) // 8
+    y_vmem_bytes = (n_per_device * k * (dtypes.bit_width(y_dtype) if hasattr(
+        dtypes, "bit_width") else dtypes.itemsize_bits(y_dtype)) // 8)
     total_bytes = (
-        2 * m_per_device * k * dtypes.bit_width(x_dtype) //
-        8  # x_vmem_scratch_ref
+        2 * m_per_device * k *
+        (dtypes.bit_width(x_dtype) if hasattr(dtypes, "bit_width") else
+         dtypes.itemsize_bits(x_dtype)) // 8  # x_vmem_scratch_ref
         + y_vmem_bytes  # y_vmem_scratch_ref
-        + 2 * m * bn * dtypes.bit_width(out_dtype) // 8  # o_vmem_scratch_ref
+        + 2 * m * bn *
+        (dtypes.bit_width(out_dtype) if hasattr(dtypes, "bit_width") else
+         dtypes.itemsize_bits(out_dtype)) // 8  # o_vmem_scratch_ref
         + acc_bytes  # acc_vmem_scratch_ref, jnp.float32
     )
     return total_bytes
@@ -639,8 +643,10 @@ def all_gather_matmul(
     # NOTE(chengjiyao): acc buffer is not used in the grid_k == 1 case.
     if grid_k == 1:
         acc_shape = (8, 128)
-    acc_bytes = acc_shape[0] * acc_shape[1] * dtypes.bit_width(
-        jnp.float32) // 8
+    acc_bytes = (
+        acc_shape[0] *
+        acc_shape[1] * (dtypes.bit_width(jnp.float32) if hasattr(
+            dtypes, "bit_width") else dtypes.itemsize_bits(jnp.float32)) // 8)
     y_vmem_shape = (n_per_device, k) if rhs_transpose else (k, n_per_device)
     estimated_vmem_bytes = get_vmem_estimate_bytes(
         m,

tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py CHANGED Viewed

@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 """All-gather matmul kernel's tuned block sizes."""
+import re
 import jax
 # key:
@@ -32,8 +34,11 @@ def get_tpu_version() -> int:
         return -1
     if kind.endswith(' lite'):
         kind = kind[:-len(' lite')]
-    assert kind[:-1] == 'TPU v', kind
-    return int(kind[-1])
+    # v6: "TPU v6"
+    # v7: "TPU7x"
+    assert kind[:3] == 'TPU', kind
+    return int(re.search(r'\d+', kind).group())
 def get_key(

tpu_inference/kernels/flash_attention/__init__.py CHANGED Viewed

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tpu_inference/kernels/fused_moe/__init__.py CHANGED Viewed

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tpu_inference/kernels/fused_moe/v1/__init__.py CHANGED Viewed

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tpu-inference 0.11.1.dev202511220812__py3-none-any.whl → 0.13.2.dev20251230__py3-none-any.whl

Potentially problematic release.

tpu-inference 0.11.1.dev202511220812__py3-none-any.whl → 0.13.2.dev20251230__py3-none-any.whl