tpu-inference 0.11.1.dev202511270815__py3-none-any.whl → 0.11.1.dev202512030818__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tpu-inference might be problematic.
- tests/test_envs.py +32 -11
- tests/test_utils.py +1 -2
- tpu_inference/distributed/tpu_connector.py +1 -1
- tpu_inference/envs.py +60 -7
- tpu_inference/executors/ray_distributed_executor.py +5 -1
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +72 -19
- tpu_inference/layers/common/sharding.py +3 -4
- tpu_inference/layers/vllm/quantization/mxfp4.py +2 -1
- tpu_inference/models/common/model_loader.py +3 -1
- tpu_inference/models/jax/utils/quantization/quantization_utils.py +3 -6
- tpu_inference/models/vllm/vllm_model_wrapper.py +1 -2
- tpu_inference/platforms/tpu_platform.py +13 -20
- tpu_inference/runner/compilation_manager.py +87 -27
- tpu_inference/runner/kv_cache_manager.py +8 -15
- tpu_inference/runner/persistent_batch_manager.py +40 -2
- tpu_inference/runner/tpu_runner.py +68 -45
- tpu_inference/runner/utils.py +2 -2
- tpu_inference/spec_decode/jax/eagle3.py +52 -19
- tpu_inference/utils.py +31 -9
- tpu_inference/worker/tpu_worker.py +2 -2
- {tpu_inference-0.11.1.dev202511270815.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/METADATA +1 -1
- {tpu_inference-0.11.1.dev202511270815.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/RECORD +25 -25
- {tpu_inference-0.11.1.dev202511270815.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/WHEEL +0 -0
- {tpu_inference-0.11.1.dev202511270815.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.11.1.dev202511270815.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/top_level.txt +0 -0
tests/test_envs.py
CHANGED
@@ -56,6 +56,12 @@ def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch):


 def test_boolean_env_vars(monkeypatch: pytest.MonkeyPatch):
+    # Ensure clean environment for boolean vars by setting to default "0"
+    monkeypatch.setenv("SKIP_JAX_PRECOMPILE", "0")
+    monkeypatch.setenv("VLLM_XLA_CHECK_RECOMPILATION", "0")
+    monkeypatch.setenv("NEW_MODEL_DESIGN", "0")
+    monkeypatch.setenv("USE_MOE_EP_KERNEL", "0")
+
     # Test SKIP_JAX_PRECOMPILE (default False)
     assert envs.SKIP_JAX_PRECOMPILE is False
     monkeypatch.setenv("SKIP_JAX_PRECOMPILE", "1")

@@ -63,6 +69,13 @@ def test_boolean_env_vars(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("SKIP_JAX_PRECOMPILE", "0")
     assert envs.SKIP_JAX_PRECOMPILE is False

+    # Test VLLM_XLA_CHECK_RECOMPILATION (default False)
+    assert envs.VLLM_XLA_CHECK_RECOMPILATION is False
+    monkeypatch.setenv("VLLM_XLA_CHECK_RECOMPILATION", "1")
+    assert envs.VLLM_XLA_CHECK_RECOMPILATION is True
+    monkeypatch.setenv("VLLM_XLA_CHECK_RECOMPILATION", "0")
+    assert envs.VLLM_XLA_CHECK_RECOMPILATION is False
+
     # Test NEW_MODEL_DESIGN (default False)
     assert envs.NEW_MODEL_DESIGN is False
     monkeypatch.setenv("NEW_MODEL_DESIGN", "1")

@@ -75,20 +88,32 @@ def test_boolean_env_vars(monkeypatch: pytest.MonkeyPatch):


 def test_integer_env_vars(monkeypatch: pytest.MonkeyPatch):
+    # Ensure clean environment for integer vars by setting to defaults
+    monkeypatch.setenv("PYTHON_TRACER_LEVEL", "1")
+    monkeypatch.setenv("NUM_SLICES", "1")
+
     assert envs.PYTHON_TRACER_LEVEL == 1
     monkeypatch.setenv("PYTHON_TRACER_LEVEL", "3")
     assert envs.PYTHON_TRACER_LEVEL == 3
     monkeypatch.setenv("PYTHON_TRACER_LEVEL", "0")
     assert envs.PYTHON_TRACER_LEVEL == 0

+    # Test NUM_SLICES (default 1)
+    assert envs.NUM_SLICES == 1
+    monkeypatch.setenv("NUM_SLICES", "2")
+    assert envs.NUM_SLICES == 2
+    monkeypatch.setenv("NUM_SLICES", "4")
+    assert envs.NUM_SLICES == 4

-def test_lowercase_conversion(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("TPU_MULTIHOST_BACKEND", "GRPC")
-    assert envs.TPU_MULTIHOST_BACKEND == "grpc"

-
+def test_model_impl_type_choices(monkeypatch: pytest.MonkeyPatch):
+    # Test case sensitive choices
+    monkeypatch.setenv("MODEL_IMPL_TYPE", "flax_nnx")
     assert envs.MODEL_IMPL_TYPE == "flax_nnx"

+    monkeypatch.setenv("MODEL_IMPL_TYPE", "vllm")
+    assert envs.MODEL_IMPL_TYPE == "vllm"
+

 def test_string_env_vars_defaults(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.delenv("JAX_PLATFORMS", raising=False)

@@ -117,8 +142,6 @@ def test_ray_env_vars(monkeypatch: pytest.MonkeyPatch):
     assert envs.RAY_USAGE_STATS_ENABLED == "1"

     assert envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE == "shm"
-    monkeypatch.setenv("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "nccl")
-    assert envs.VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE == "nccl"


 def test_invalid_attribute_raises_error():

@@ -134,6 +157,7 @@ def test_dir_returns_all_env_vars():
     assert "JAX_PLATFORMS" in env_vars
     assert "TPU_NAME" in env_vars
     assert "SKIP_JAX_PRECOMPILE" in env_vars
+    assert "VLLM_XLA_CHECK_RECOMPILATION" in env_vars
     assert "MODEL_IMPL_TYPE" in env_vars


@@ -141,11 +165,8 @@ def test_tpu_multihost_env_vars(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("TPU_WORKER_ID", "0")
     assert envs.TPU_WORKER_ID == "0"

-    monkeypatch.setenv("TPU_MULTIHOST_BACKEND", "
-    assert envs.TPU_MULTIHOST_BACKEND == "
-
-    monkeypatch.setenv("TPU_MULTIHOST_BACKEND", "xla")
-    assert envs.TPU_MULTIHOST_BACKEND == "xla"
+    monkeypatch.setenv("TPU_MULTIHOST_BACKEND", "ray")
+    assert envs.TPU_MULTIHOST_BACKEND == "ray"


 def test_disaggregated_serving_env_vars(monkeypatch: pytest.MonkeyPatch):
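The boolean variables exercised above are parsed with the bool(int(os.getenv(NAME) or "0")) pattern that the tpu_inference/envs.py hunks later in this diff introduce. A minimal standalone sketch of that parsing behavior (the _bool_env helper here is hypothetical, not part of the package):

import os

def _bool_env(name: str, default: str = "0") -> bool:
    # Unset and empty-string values both fall back to the default; a plain
    # os.getenv(name, default) would hand an empty string to int() and raise.
    return bool(int(os.getenv(name) or default))

os.environ["SKIP_JAX_PRECOMPILE"] = "1"
assert _bool_env("SKIP_JAX_PRECOMPILE") is True
os.environ["SKIP_JAX_PRECOMPILE"] = ""
assert _bool_env("SKIP_JAX_PRECOMPILE") is False
os.environ.pop("SKIP_JAX_PRECOMPILE", None)
assert _bool_env("SKIP_JAX_PRECOMPILE") is False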
tests/test_utils.py
CHANGED
@@ -231,6 +231,5 @@ def test_get_jax_dtype_from_str_dtype():
     assert get_jax_dtype_from_str_dtype("int8") == jnp.int8
     assert get_jax_dtype_from_str_dtype("bfloat16") == jnp.bfloat16
     assert get_jax_dtype_from_str_dtype("fp8") == jnp.float8_e4m3fn
-    assert get_jax_dtype_from_str_dtype("fp8_e4m3") == jnp.
+    assert get_jax_dtype_from_str_dtype("fp8_e4m3") == jnp.float8_e4m3fn
     assert get_jax_dtype_from_str_dtype("fp8_e5m2") == jnp.float8_e5m2
-    assert get_jax_dtype_from_str_dtype("auto") is None
tpu_inference/distributed/tpu_connector.py
CHANGED

@@ -457,7 +457,6 @@ class TPUConnectorWorker:
         self.side_channel_port = get_side_channel_port()

         self.kv_transfer_server = None
-        self._maybe_start_p2p_server()
         self.zmq_cxt = zmq.Context()
         if self.is_producer:
             ready_event = threading.Event()

@@ -499,6 +498,7 @@ class TPUConnectorWorker:
         self.shape = list(kv_layer.shape)
         self.dtype = kv_layer.dtype
         self.sharding = kv_layer.sharding
+        self._maybe_start_p2p_server()

     def _maybe_start_p2p_server(self):
         if self.kv_transfer_server is not None:
tpu_inference/envs.py
CHANGED
@@ -15,14 +15,60 @@ if TYPE_CHECKING:
     PREFILL_SLICES: str = ""
     DECODE_SLICES: str = ""
     SKIP_JAX_PRECOMPILE: bool = False
+    VLLM_XLA_CHECK_RECOMPILATION: bool = False
     MODEL_IMPL_TYPE: str = "flax_nnx"
     NEW_MODEL_DESIGN: bool = False
     PHASED_PROFILING_DIR: str = ""
     PYTHON_TRACER_LEVEL: int = 1
     USE_MOE_EP_KERNEL: bool = False
+    NUM_SLICES: int = 1
     RAY_USAGE_STATS_ENABLED: str = "0"
     VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "shm"

+
+def env_with_choices(
+    env_name: str,
+    default: str | None,
+    choices: list[str] | Callable[[], list[str]],
+    case_sensitive: bool = True,
+) -> Callable[[], str | None]:
+    """
+    Create a lambda that validates environment variable against allowed choices
+
+    Args:
+        env_name: Name of the environment variable
+        default: Default value if not set (can be None)
+        choices: List of valid string options or callable that returns list
+        case_sensitive: Whether validation should be case sensitive
+
+    Returns:
+        Lambda function for environment_variables dict
+    """
+
+    def _get_validated_env() -> str | None:
+        value = os.getenv(env_name)
+        if value is None:
+            return default
+
+        # Resolve choices if it's a callable (for lazy loading)
+        actual_choices = choices() if callable(choices) else choices
+
+        if not case_sensitive:
+            check_value = value.lower()
+            check_choices = [choice.lower() for choice in actual_choices]
+        else:
+            check_value = value
+            check_choices = actual_choices
+
+        if check_value not in check_choices:
+            raise ValueError(f"Invalid value '{value}' for {env_name}. "
+                             f"Valid options: {actual_choices}.")
+
+        return value
+
+    return _get_validated_env
+
+
 environment_variables: dict[str, Callable[[], Any]] = {
     # JAX platform selection (e.g., "tpu", "cpu", "proxy")
     "JAX_PLATFORMS":

@@ -38,7 +84,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     lambda: os.getenv("TPU_WORKER_ID", None),
     # Backend for multi-host communication on TPU
     "TPU_MULTIHOST_BACKEND":
-
+    env_with_choices("TPU_MULTIHOST_BACKEND", "", ["ray"]),
     # Slice configuration for disaggregated prefill workers
     "PREFILL_SLICES":
     lambda: os.getenv("PREFILL_SLICES", ""),

@@ -47,28 +93,35 @@ environment_variables: dict[str, Callable[[], Any]] = {
     lambda: os.getenv("DECODE_SLICES", ""),
     # Skip JAX precompilation step during initialization
     "SKIP_JAX_PRECOMPILE":
-    lambda: bool(int(os.getenv("SKIP_JAX_PRECOMPILE"
+    lambda: bool(int(os.getenv("SKIP_JAX_PRECOMPILE") or "0")),
+    # Check for XLA recompilation during execution
+    "VLLM_XLA_CHECK_RECOMPILATION":
+    lambda: bool(int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION") or "0")),
     # Model implementation type (e.g., "flax_nnx")
     "MODEL_IMPL_TYPE":
-
+    env_with_choices("MODEL_IMPL_TYPE", "flax_nnx",
+                     ["vllm", "flax_nnx", "jetpack"]),
     # Enable new experimental model design
     "NEW_MODEL_DESIGN":
-    lambda: bool(int(os.getenv("NEW_MODEL_DESIGN"
+    lambda: bool(int(os.getenv("NEW_MODEL_DESIGN") or "0")),
     # Directory to store phased profiling output
     "PHASED_PROFILING_DIR":
     lambda: os.getenv("PHASED_PROFILING_DIR", ""),
     # Python tracer level for profiling
     "PYTHON_TRACER_LEVEL":
-    lambda: int(os.getenv("PYTHON_TRACER_LEVEL"
+    lambda: int(os.getenv("PYTHON_TRACER_LEVEL") or "1"),
     # Use custom expert-parallel kernel for MoE (Mixture of Experts)
     "USE_MOE_EP_KERNEL":
-    lambda: bool(int(os.getenv("USE_MOE_EP_KERNEL"
+    lambda: bool(int(os.getenv("USE_MOE_EP_KERNEL") or "0")),
+    # Number of TPU slices for multi-slice mesh
+    "NUM_SLICES":
+    lambda: int(os.getenv("NUM_SLICES") or "1"),
     # Enable/disable Ray usage statistics collection
     "RAY_USAGE_STATS_ENABLED":
     lambda: os.getenv("RAY_USAGE_STATS_ENABLED", "0"),
     # Ray compiled DAG channel type for TPU
     "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE":
-
+    env_with_choices("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "shm", ["shm"]),
 }

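For orientation, here is how a getter produced by the new env_with_choices helper behaves once wired into environment_variables. This is a condensed standalone mirror of the helper shown above, for illustration only; the real definition lives in tpu_inference/envs.py:

import os

def env_with_choices(env_name, default, choices, case_sensitive=True):
    def _get_validated_env():
        value = os.getenv(env_name)
        if value is None:
            return default
        actual = choices() if callable(choices) else choices
        v = value if case_sensitive else value.lower()
        allowed = actual if case_sensitive else [c.lower() for c in actual]
        if v not in allowed:
            raise ValueError(f"Invalid value '{value}' for {env_name}. "
                             f"Valid options: {actual}.")
        return value
    return _get_validated_env

get_impl = env_with_choices("MODEL_IMPL_TYPE", "flax_nnx",
                            ["vllm", "flax_nnx", "jetpack"])
print(get_impl())                       # unset -> default "flax_nnx"
os.environ["MODEL_IMPL_TYPE"] = "vllm"
print(get_impl())                       # -> "vllm"
os.environ["MODEL_IMPL_TYPE"] = "pytorch"
# get_impl()                            # would raise ValueError listing the valid options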
tpu_inference/executors/ray_distributed_executor.py
CHANGED

@@ -136,10 +136,14 @@ class RayDistributedExecutor(RayDistributedExecutorV1):

         pp_size = self.parallel_config.pipeline_parallel_size
         placement_group_specs: List[Dict[str, float]] = []
+
+        ray_nodes = ray.nodes()
+        logger.info(f"RayDistributedExecutor | ray_nodes={ray_nodes}")
+
         if pp_size == 1:
             placement_group_specs = [{
                 device_str: node['Resources'][device_str]
-            } for node in
+            } for node in ray_nodes]
         else:
             num_devices_per_pp_rank = self.vllm_config.sharding_config.total_devices
             placement_group_specs = [{
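The new code snapshots ray.nodes() once and logs it before building the placement groups. A small illustration of how the pp_size == 1 branch derives one placement bundle per Ray node; the node dicts here are mocked stand-ins for what ray.nodes() returns, and the "TPU" resource name is assumed for the example:

ray_nodes = [
    {"NodeID": "node-a", "Resources": {"TPU": 4.0, "CPU": 120.0}},
    {"NodeID": "node-b", "Resources": {"TPU": 4.0, "CPU": 120.0}},
]
device_str = "TPU"  # illustrative device resource name

# One bundle per node, requesting every device the node advertises.
placement_group_specs = [{device_str: node["Resources"][device_str]}
                         for node in ray_nodes]
print(placement_group_specs)  # [{'TPU': 4.0}, {'TPU': 4.0}]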
tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py
CHANGED

@@ -352,7 +352,7 @@ def _ragged_paged_attention_kernel(
     debug_print("[RPA debug] q_len={}", q_len)
     debug_print("[RPA debug] kv_len={}", kv_len)

-    def
+    def flash_attention_step1_qk_softmax(
         q,  # [actual_bq_sz * num_q_heads_per_kv_head, actual_head_dim_x2]
         kv,  # [bkv_sz, actual_head_dim_x2]
         *,

@@ -366,7 +366,6 @@ def _ragged_paged_attention_kernel(
         assert kv.shape == (bkv_sz, actual_head_dim_x2)
         head_l_ref = l_ref.at[kv_head_idx, :q.shape[0]]
         head_m_ref = m_ref.at[kv_head_idx, :q.shape[0]]
-        head_acc_ref = acc_ref.at[kv_head_idx, :q.shape[0]]

         def load_with_init(ref, init_val):
             return jnp.where(bkv_idx == bkv_idx_start,

@@ -416,15 +415,33 @@ def _ragged_paged_attention_kernel(
         head_m_ref[...] = m_curr
         p = jnp.exp(s - broadcast_minor(m_curr, s.shape))

-        pv = jnp.einsum("nm,md->nd", p, kv, preferred_element_type=jnp.float32)
-        if v_scale is not None:
-            pv *= v_scale
-
         p_rowsum = jnp.sum(p, axis=1, keepdims=True)
         exp_m_diff = jnp.exp(m_prev - m_curr)
         l_prev = load_with_init(head_l_ref, 1.0)
         l_curr = exp_m_diff * l_prev + p_rowsum
         head_l_ref[...] = l_curr
+
+        return p, exp_m_diff
+
+    def flash_attention_step2_pv(
+        q_shape_0,
+        kv,  # [bkv_sz, actual_head_dim_x2]
+        p,  # from step1
+        exp_m_diff,  # from step1
+        *,
+        bkv_idx,
+        kv_head_idx,
+    ):
+        head_acc_ref = acc_ref.at[kv_head_idx, :q_shape_0]
+
+        def load_with_init(ref, init_val):
+            return jnp.where(bkv_idx == bkv_idx_start,
+                             jnp.full_like(ref, init_val), ref[...])
+
+        pv = jnp.einsum("nm,md->nd", p, kv, preferred_element_type=jnp.float32)
+        if v_scale is not None:
+            pv *= v_scale
+
         o_prev = load_with_init(head_acc_ref, 0.0)
         o_curr = broadcast_minor(exp_m_diff, o_prev.shape) * o_prev + pv
         head_acc_ref[...] = o_curr

@@ -835,6 +852,11 @@ def _ragged_paged_attention_kernel(
     return

     # Flash attention with cur bkv and bq
+    prev_bq_shape_0 = None
+    prev_kv_head_bkv = None
+    prev_kv_head_idx = None
+    prev_kv_head_p = None
+    prev_kv_head_exp_m_diff = None
     for kv_head_start in range(0, actual_num_kv_heads, kv_packing):
         bkv_lst = strided_load_bkv(
             bkv_sem_idx,

@@ -844,20 +866,51 @@ def _ragged_paged_attention_kernel(
         )
         assert len(bkv_lst) == kv_packing
         for i in range(kv_packing):
-
-            if
+            cur_kv_head_idx = kv_head_start + i
+            if cur_kv_head_idx >= actual_num_kv_heads:
                 break
-
-
-
-
-
-
-
-
-
-
-
+            cur_kv_head_bq = load_bq(bq_sem_idx,
+                                     cur_kv_head_idx,
+                                     actual_bq_sz=actual_bq_sz)
+            cur_kv_head__bkv = bkv_lst[i]
+            # FlashAttention is divided into `flash_attention_step1_qk_softmax`
+            # and `flash_attention_step2_pv` to pipeline the computation.
+            # `step2_pv` for the previous KV head, which depends on the softmax
+            # output, is overlapped with `step1_qk_softmax` for the current KV
+            # head, reducing overall wait times.
+            cur_kv_head_p, cur_kv_head_exp_m_diff = (
+                flash_attention_step1_qk_softmax(
+                    cur_kv_head_bq,
+                    cur_kv_head__bkv,
+                    bq_idx=bq_idx,
+                    bkv_idx=bkv_idx,
+                    kv_head_idx=cur_kv_head_idx,
+                ))
+            if prev_bq_shape_0 is not None:
+                flash_attention_step2_pv(
+                    prev_bq_shape_0,
+                    prev_kv_head_bkv,
+                    prev_kv_head_p,
+                    prev_kv_head_exp_m_diff,
+                    bkv_idx=bkv_idx,
+                    kv_head_idx=prev_kv_head_idx,
+                )
+            prev_bq_shape_0 = cur_kv_head_bq.shape[0]
+            prev_kv_head_bkv = cur_kv_head__bkv
+            prev_kv_head_p = cur_kv_head_p
+            prev_kv_head_exp_m_diff = cur_kv_head_exp_m_diff
+            prev_kv_head_idx = cur_kv_head_idx
+
+    # Execute pv of last attention head.
+    assert prev_bq_shape_0 is not None
+    flash_attention_step2_pv(
+        prev_bq_shape_0,
+        prev_kv_head_bkv,
+        prev_kv_head_p,
+        prev_kv_head_exp_m_diff,
+        bkv_idx=bkv_idx,
+        kv_head_idx=prev_kv_head_idx,
+    )

     lax.fori_loop(bkv_idx_start,
                   num_bkv,
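The restructuring above splits FlashAttention into a qk+softmax stage and a pv stage so that the pv work for one KV head overlaps the softmax work for the next. A language-level sketch of that software-pipelining pattern (plain Python, not the Pallas kernel itself):

def pipelined_attention(kv_heads, step1_qk_softmax, step2_pv):
    prev = None
    for head in kv_heads:
        p = step1_qk_softmax(head)      # stage 1 for the current head
        if prev is not None:
            step2_pv(*prev)             # stage 2 for the previous head overlaps stage 1 above
        prev = (head, p)
    if prev is not None:
        step2_pv(*prev)                 # flush: stage 2 for the last head

pipelined_attention(
    range(4),
    step1_qk_softmax=lambda h: f"softmax_{h}",
    step2_pv=lambda h, p: print(f"pv for head {h} using {p}"),
)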
tpu_inference/layers/common/sharding.py
CHANGED

@@ -1,6 +1,5 @@
 import json
 import math
-import os
 from dataclasses import asdict, dataclass
 from typing import TYPE_CHECKING, List, Optional

@@ -8,7 +7,7 @@ import jax.numpy as jnp
 import numpy as np
 from jax.sharding import Mesh

-from tpu_inference import utils
+from tpu_inference import envs, utils

 if TYPE_CHECKING:
     from vllm.v1.configs.vllm_config import VllmConfig

@@ -48,7 +47,7 @@ class ShardingAxisName2D:


 try:
-    _use_base_sharding =
+    _use_base_sharding = envs.NEW_MODEL_DESIGN
     if _use_base_sharding:
         ShardingAxisName = ShardingAxisNameBase
     else:

@@ -167,7 +166,7 @@ class ShardingConfigManager:
                 f"(DP size: {total_dp_size}). Please disable LoRA or "
                 f"set data parallelism to 1.")
         if sharding_strategy.attention_data_parallelism > 1:
-            if not
+            if not envs.NEW_MODEL_DESIGN:
                 raise ValueError(
                     "Must run Attention DP with NEW_MODEL_DESIGN enabled. Please set the "
                     "NEW_MODEL_DESIGN=True.")
tpu_inference/layers/vllm/quantization/mxfp4.py
CHANGED

@@ -95,7 +95,8 @@ class VllmMxfp4Config(Mxfp4Config, JaxCommonConfig):
                 "UnquantizedLinearMethod.")
             return VllmUnquantizedLinearMethod(linear_config)
         elif isinstance(layer, FusedMoE):
-
+            moe_config = self.get_moe_config(layer)
+            return VllmMxfp4MoEMethod(moe_config, self.mesh)
         elif isinstance(layer, Attention):
             # TODO: Add support for MXFP4 Attention.
             logger.warning_once("MXFP4 attention layer is not implemented. "
tpu_inference/models/common/model_loader.py
CHANGED

@@ -236,7 +236,9 @@ def get_flax_model(
             hidden_states_sharding,  # aux hidden states
         ),
         donate_argnums=2,  # 0 is graphdef, 1 is state, 2 is kv_cache
-        static_argnums=
+        static_argnums=(
+            7, 10, 11
+        ),  # 7 is layer_name_to_kvcache_index, 10 is is_first_rank, 11 is is_last_rank
     )
     def run_model(graphdef, state, *args):
         model = nnx.merge(graphdef, state)
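For context on the static_argnums change: jax.jit treats static arguments as compile-time constants, so values such as is_first_rank and is_last_rank select a specialized compiled program rather than being traced as array inputs. A minimal, generic example of the mechanism (not the model loader itself; the function and argument names are illustrative):

import functools
import jax
import jax.numpy as jnp

@functools.partial(jax.jit, static_argnums=(1,))
def maybe_scale(x, apply_scale):
    # `apply_scale` is a static Python bool: each value compiles its own branch.
    return x * 2.0 if apply_scale else x

x = jnp.arange(4.0)
print(maybe_scale(x, True))   # [0. 2. 4. 6.]
print(maybe_scale(x, False))  # [0. 1. 2. 3.]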
tpu_inference/models/jax/utils/quantization/quantization_utils.py
CHANGED

@@ -154,12 +154,9 @@ def qwix_quantize_nnx_model(model: nnx.Module, qwix_config: List[dict],
     logger.info(f"Memory usage before applying quantization of params: "
                 f"hbm={utils.hbm_usage_gb(jax.local_devices())}Gb")

-
-
-
-    # Handle the case where kv_cache_dtype is "auto"
-    if kv_cache_jnp_dtype is None:
-        assert kv_cache_dtype == "auto", "kv_cache_dtype must be 'auto' if kv_cache_jnp_dtype is None"
+    if kv_cache_dtype != "auto":
+        kv_cache_jnp_dtype = utils.to_jax_dtype(kv_cache_dtype)
+    else:
         kv_cache_jnp_dtype = DEFAULT_KV_CACHE_DTYPE

     kv_caches = create_kv_caches(
tpu_inference/models/vllm/vllm_model_wrapper.py
CHANGED

@@ -221,7 +221,7 @@ class VllmModelWrapper:
         @functools.partial(
             jax.jit,
             out_shardings=(NamedSharding(self.mesh,
-                                         PartitionSpec(
+                                         PartitionSpec("data", "model"))),
         )
         def compute_logits_func(
             params_and_buffers: Any,

@@ -263,7 +263,6 @@ def load_lora_model(model: torch.nn.Module, vllm_config: VllmConfig,
         vllm_config,
         device,
         model.embedding_modules,
-        model.embedding_padding_modules,
     )
     return lora_manager, lora_manager.create_lora_manager(model)

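The logits out_shardings now names both mesh axes explicitly. A small hedged example of what NamedSharding(mesh, PartitionSpec("data", "model")) denotes on a toy mesh; the axis sizes and the (batch, vocab) interpretation of the logits array are illustrative assumptions, not the runner's real mesh:

import jax
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec

# Toy mesh: whatever devices are available, arranged as (data, model) = (n, 1).
devices = np.array(jax.devices()).reshape(-1, 1)
mesh = Mesh(devices, axis_names=("data", "model"))

# Dim 0 of a 2-D logits array is split over "data", dim 1 over "model".
logits_sharding = NamedSharding(mesh, PartitionSpec("data", "model"))
print(logits_sharding)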
tpu_inference/platforms/tpu_platform.py
CHANGED

@@ -5,7 +5,6 @@ from typing import TYPE_CHECKING, Any, Optional, Tuple, Union, cast
 import jax.numpy as jnp
 import torch
 import vllm.envs as vllm_envs
-from torchax.ops.mappings import j2t_dtype
 from tpu_info import device
 from vllm.inputs import ProcessorInputs, PromptType
 from vllm.platforms.interface import Platform, PlatformEnum

@@ -14,6 +13,7 @@ from vllm.sampling_params import SamplingParams, SamplingType
 from tpu_inference import envs
 from tpu_inference.layers.common.sharding import ShardingConfigManager
 from tpu_inference.logger import init_logger
+from tpu_inference.utils import to_jax_dtype, to_torch_dtype

 if TYPE_CHECKING:
     from vllm.attention.backends.registry import _Backend

@@ -28,12 +28,6 @@ else:

 logger = init_logger(__name__)

-_DTYPE: dict[str, jnp.dtype] = {
-    "bfloat16": jnp.bfloat16,
-    "float": jnp.float32,
-    "float32": jnp.float32,
-}
-

 class TpuPlatform(Platform):
     _enum = PlatformEnum.TPU

@@ -158,20 +152,19 @@ class TpuPlatform(Platform):
         # NOTE(xiang): convert dtype to jnp.dtype
         # NOTE(wenlong): skip this logic for mm model preprocessing
         # For mm model preprocessors, it may need the output dtype to be torch.
-        # In order to avoid a PR to vLLM, we postpone the dtype checking during
+        # In order to avoid a PR to vLLM, we postpone the dtype checking during
+        # tpu_worker initialization
         if not vllm_config.scheduler_config.is_multimodal_model or impl == "vllm":
-
-
-
-
-
-
-
-
-
-
-            vllm_config.model_config.dtype = j2t_dtype(
-                vllm_config.model_config.dtype.dtype)
+            model_dtype = vllm_config.model_config.dtype
+            try:
+                dtype = to_jax_dtype(model_dtype)
+            except ValueError:
+                logger.warning(f"{model_dtype=} is not supported. "
+                               "Falling back to jnp.bfloat16")
+                dtype = jnp.bfloat16
+            if impl == "vllm":
+                dtype = to_torch_dtype(dtype)
+            vllm_config.model_config.dtype = dtype

         # TODO(cuiq): remove this dependency.
         from vllm.v1.attention.backends.pallas import PallasAttentionBackend
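To summarize the new dtype handling in TpuPlatform: unsupported model dtypes now fall back to jnp.bfloat16 with a warning instead of going through torchax's j2t_dtype, and only the vLLM (torch) implementation converts the result back to a torch dtype. A simplified standalone sketch of that control flow; to_jax_dtype_sketch and its mapping are illustrative stand-ins for tpu_inference.utils.to_jax_dtype, and the torch conversion step is only described in a comment:

import jax.numpy as jnp

# Illustrative stand-in for tpu_inference.utils.to_jax_dtype.
_SUPPORTED = {"bfloat16": jnp.bfloat16, "float32": jnp.float32, "float": jnp.float32}

def to_jax_dtype_sketch(name: str):
    if name not in _SUPPORTED:
        raise ValueError(f"Unsupported dtype: {name}")
    return _SUPPORTED[name]

def normalize_model_dtype(model_dtype: str, impl: str):
    try:
        dtype = to_jax_dtype_sketch(model_dtype)
    except ValueError:
        dtype = jnp.bfloat16  # fall back instead of failing, as in the hunk above
    # In the real code, impl == "vllm" additionally maps the JAX dtype back to a
    # torch dtype via to_torch_dtype; omitted here to keep the sketch torch-free.
    return dtype

print(normalize_model_dtype("float32", impl="flax_nnx"))       # jnp.float32
print(normalize_model_dtype("not_a_dtype", impl="flax_nnx"))   # jnp.bfloat16 fallback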