tpu-inference 0.0.1rc1__py3-none-any.whl → 0.11.1.dev202511180814__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of tpu-inference might be problematic.
- tests/kernels/fused_moe_v1_test.py +34 -303
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +2 -2
- tests/lora/test_layers.py +6 -0
- tests/lora/utils.py +8 -0
- tests/test_envs.py +11 -32
- tests/test_utils.py +2 -1
- tpu_inference/__init__.py +3 -22
- tpu_inference/core/disagg_utils.py +8 -6
- tpu_inference/distributed/tpu_connector.py +4 -3
- tpu_inference/distributed/utils.py +2 -3
- tpu_inference/envs.py +8 -61
- tpu_inference/executors/ray_distributed_executor.py +2 -9
- tpu_inference/kernels/fused_moe/v1/kernel.py +110 -641
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +54 -77
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +145 -266
- tpu_inference/layers/common/attention_interface.py +1 -7
- tpu_inference/layers/common/sharding.py +5 -5
- tpu_inference/layers/vllm/fused_moe.py +208 -170
- tpu_inference/layers/vllm/quantization/common.py +1 -6
- tpu_inference/layers/vllm/quantization/mxfp4.py +73 -138
- tpu_inference/layers/vllm/quantization/unquantized.py +64 -58
- tpu_inference/layers/vllm/sharding.py +2 -2
- tpu_inference/lora/torch_punica_tpu.py +2 -1
- tpu_inference/mock/__init__.py +0 -0
- tpu_inference/mock/vllm_config_utils.py +28 -0
- tpu_inference/mock/vllm_envs.py +1219 -0
- tpu_inference/mock/vllm_logger.py +212 -0
- tpu_inference/mock/vllm_logging_utils.py +15 -0
- tpu_inference/models/common/model_loader.py +10 -43
- tpu_inference/models/jax/llama3.py +1 -2
- tpu_inference/models/jax/llama_eagle3.py +5 -8
- tpu_inference/models/jax/phi3.py +376 -0
- tpu_inference/models/jax/qwen2.py +1 -2
- tpu_inference/models/jax/qwen2_5_vl.py +48 -163
- tpu_inference/models/jax/qwen3.py +1 -2
- tpu_inference/models/jax/utils/quantization/quantization_utils.py +6 -3
- tpu_inference/models/jax/utils/weight_utils.py +143 -198
- tpu_inference/models/vllm/vllm_model_wrapper.py +8 -14
- tpu_inference/platforms/tpu_platform.py +31 -37
- tpu_inference/runner/compilation_manager.py +58 -141
- tpu_inference/runner/kv_cache.py +1 -1
- tpu_inference/runner/kv_cache_manager.py +18 -17
- tpu_inference/runner/persistent_batch_manager.py +2 -40
- tpu_inference/runner/structured_decoding_manager.py +3 -2
- tpu_inference/runner/tpu_runner.py +147 -271
- tpu_inference/runner/utils.py +2 -2
- tpu_inference/spec_decode/jax/eagle3.py +21 -71
- tpu_inference/tpu_info.py +3 -4
- tpu_inference/utils.py +13 -36
- tpu_inference/worker/tpu_worker.py +25 -162
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/METADATA +3 -4
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/RECORD +55 -50
- tpu_inference/models/jax/llama_guard_4.py +0 -361
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/WHEEL +0 -0
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/top_level.txt +0 -0
```diff
--- a/tpu_inference/models/jax/utils/weight_utils.py
+++ b/tpu_inference/models/jax/utils/weight_utils.py
@@ -13,14 +13,12 @@ from typing import Any, Optional
 import jax
 import jax.numpy as jnp
 import torch
-import torchax
 from flax import nnx
 from jax.sharding import Mesh, NamedSharding
 from jax.sharding import PartitionSpec as P
 from safetensors import safe_open
-from vllm.config import VllmConfig

-from tpu_inference import
+from tpu_inference import utils
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.utils import file_utils

```
```diff
@@ -199,11 +197,12 @@ def shard_put(x: jax.Array, shardings, mesh: jax.sharding.Mesh) -> jax.Array:
     return jax.device_put(x, shardings)


-def get_default_maps(model_config, mesh: Mesh,
+def get_default_maps(vllm_config, mesh: Mesh,
                      name_map: dict[str, str]) -> MetadataMap:
     """Load weights from one model weights file to the model, run on single thread."""
     sharding_size = mesh.shape["model"]

+    model_config = vllm_config.model_config
     hf_config = model_config.hf_config

     num_heads = hf_config.num_attention_heads
```
```diff
@@ -267,15 +266,14 @@ def get_default_maps(model_config, mesh: Mesh,
                         bias_pad_map=bias_pad_keys)


-def _load_and_shard_weight(vllm_config,
-                           params,
-                           shardings,
-                           metadata_map,
-                           mesh,
-                           hf_key,
-                           hf_weight,
-                           keep_original_dtype_keys_regex: list[str]
-                           | None = None):
+def _load_hf_weights_on_thread(vllm_config,
+                               params: nnx.State,
+                               metadata_map: MetadataMap,
+                               mesh: Mesh,
+                               weights_file: str,
+                               filter_regex: str | None = None,
+                               keep_original_dtype_keys_regex: list[str]
+                               | None = None):
     name_map = metadata_map.name_map
     reshape_keys = metadata_map.reshape_map
     bias_reshape_keys = metadata_map.bias_reshape_map
```
```diff
@@ -292,118 +290,6 @@ def _load_and_shard_weight(vllm_config,
     head_dim = utils.get_padded_head_dim(head_dim_original)
     head_dim_pad = head_dim - head_dim_original

-    # Check if the key should retain its original dtype
-    keep_original_dtype = False
-    if keep_original_dtype_keys_regex:
-        for pattern in keep_original_dtype_keys_regex:
-            if re.match(pattern, hf_key):
-                keep_original_dtype = True
-                break
-
-    # Converting to config's dtype
-    if not keep_original_dtype and hf_weight.dtype != model_config.dtype:
-        logger.warning(
-            f"Converting dtype for {hf_key} from {hf_weight.dtype} to {model_config.dtype}"
-        )
-        hf_weight = hf_weight.astype(model_config.dtype)
-
-    if hf_key.endswith(".weight"):
-        hf_key = hf_key.removesuffix(".weight")
-
-    # Find the corresponding model key using the HF key
-    if "layers" in hf_key:
-        layer_num = re.search(r"layers\.(\d+)", hf_key).group(1)
-        layer_key = re.sub(r"layers\.\d+", "layers.*", hf_key)
-        model_key = name_map[layer_key]
-        model_key = re.sub(r"layers\.\*", f"layers.{layer_num}", model_key)
-    elif "blocks" in hf_key:
-        layer_num = re.search(r"blocks\.(\d+)", hf_key).group(1)
-        layer_key = re.sub(r"blocks\.\d+", "blocks.*", hf_key)
-        model_key = name_map[layer_key]
-        model_key = re.sub(r"blocks\.\*", f"blocks.{layer_num}", model_key)
-    else:
-        if hf_key not in name_map and hf_key == "lm_head":
-            logger.warning(f"Skip loading {hf_key} due to tie_word_embeddings")
-            return
-        if hf_key not in name_map and "t2d" in hf_key:
-            logger.warning(
-                f"Skip loading {hf_key} as it's not used in eagle-3 for now")
-            return
-        model_key = name_map.get(hf_key, hf_key)
-
-    model_weight, model_sharding = get_param_and_sharding(
-        params, shardings, model_key)
-
-    logger.debug(
-        "before transform | "
-        f"{hf_key}: {hf_weight.shape} --> {model_key}: {model_weight.value.shape} {model_sharding}"
-    )
-
-    if hf_key.endswith(".bias"):
-        for key in bias_reshape_keys:
-            if key in hf_key:
-                hf_weight = jnp.reshape(hf_weight, bias_reshape_keys[key])
-                if head_dim_pad > 0:
-                    hf_weight = jnp.pad(hf_weight, ((0, 0), (0, head_dim_pad)))
-                break
-    else:
-        for key in reshape_keys:
-            if key in hf_key:
-                hf_weight = jnp.reshape(hf_weight, reshape_keys[key])
-                if head_dim_pad > 0:
-                    if "o_proj" in key:
-                        hf_weight = jnp.pad(hf_weight, ((0, 0), (0, 0),
-                                                        (0, head_dim_pad)))
-                    else:
-                        hf_weight = jnp.pad(hf_weight,
-                                            ((0, 0), (0, head_dim_pad),
-                                             (0, 0)))
-                break
-    for key in transpose_keys:
-        if key in hf_key:
-            hf_weight = jnp.transpose(hf_weight, transpose_keys[key])
-            break
-
-    # Pad num-kv-heads
-    if hf_key.endswith(".bias"):
-        for key, value in bias_pad_keys.items():
-            dim = value[0]
-            dim_size = value[1]
-            if key in hf_key and dim_size != 0:
-                hf_weight = jnp.repeat(hf_weight, dim_size, axis=dim)
-                break
-    else:
-        for key, value in pad_keys.items():
-            dim = value[0]
-            dim_size = value[1]
-            if key in hf_key and dim_size != 0:
-                hf_weight = jnp.repeat(hf_weight, dim_size, axis=dim)
-                break
-
-    logger.debug(
-        "after transform | "
-        f"{hf_key}: {hf_weight.shape} --> {model_key}: {model_weight.value.shape} {model_sharding}"
-    )
-
-    if head_dim_pad == 0:
-        assert model_weight.value.shape == hf_weight.shape, f"{hf_key}: {model_weight.value.shape} != {hf_weight.shape}"
-
-    # Update the model weight
-    spec = model_weight.sharding.spec if isinstance(
-        model_weight.sharding, NamedSharding) else model_weight.sharding
-    model_weight.value = shard(hf_weight, spec)
-
-
-def _load_hf_weights_on_thread(
-    vllm_config: VllmConfig,
-    params: nnx.State,
-    metadata_map: "MetadataMap",
-    mesh: Mesh,
-    weights_file: str,
-    filter_regex: Optional[str] = None,
-    keep_original_dtype_keys_regex: Optional[list[str]] = None,
-):
-    """Loads weights from a single weights file."""
     try:
         shardings = nnx.get_named_sharding(params, mesh)
     except TypeError:
```
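Both the removed helper above and the inlined loop in the next hunk resolve a HuggingFace checkpoint key to a model parameter key the same way: the layer index is templated out, the wildcard form is looked up in `name_map`, and the index is substituted back. A minimal, self-contained sketch of that step; `hf_key_to_model_key` and the sample `name_map` entry are illustrative, not part of the package:

```python
import re

def hf_key_to_model_key(hf_key: str, name_map: dict[str, str]) -> str:
    """Map an HF checkpoint key to a model key via a layer-wildcard template."""
    if "layers" in hf_key:
        layer_num = re.search(r"layers\.(\d+)", hf_key).group(1)
        layer_key = re.sub(r"layers\.\d+", "layers.*", hf_key)
        model_key = name_map[layer_key]
        return re.sub(r"layers\.\*", f"layers.{layer_num}", model_key)
    return name_map.get(hf_key, hf_key)

name_map = {"model.layers.*.self_attn.q_proj": "layers.*.attn.q_proj"}
print(hf_key_to_model_key("model.layers.7.self_attn.q_proj", name_map))
# -> layers.7.attn.q_proj
```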
```diff
@@ -411,88 +297,147 @@ def _load_hf_weights_on_thread(

     for hf_key, hf_weight in model_weights_single_file_generator(
             weights_file, framework="flax", filter_regex=filter_regex):
-        _load_and_shard_weight(
-            vllm_config,
-            params,
-            shardings,
-            metadata_map,
-            mesh,
-            hf_key,
-            hf_weight,
-            keep_original_dtype_keys_regex,
-        )

+        # Check if the key should retain its original dtype
+        keep_original_dtype = False
+        if keep_original_dtype_keys_regex:
+            for pattern in keep_original_dtype_keys_regex:
+                if re.match(pattern, hf_key):
+                    keep_original_dtype = True
+                    break

-
-
-
-
-
-
-
-
-)
-
-
-
-
-
-
-
-
-
-
-
-
-
-    if
+        # Converting to config's dtype
+        if not keep_original_dtype and hf_weight.dtype != model_config.dtype:
+            logger.warning(
+                f"Converting dtype for {hf_key} from {hf_weight.dtype} to {model_config.dtype}"
+            )
+            hf_weight = hf_weight.astype(model_config.dtype)
+
+        if hf_key.endswith(".weight"):
+            hf_key = hf_key.removesuffix(".weight")
+
+        # Find the corresponding model key using the HF key
+        if "layers" in hf_key:
+            layer_num = re.search(r"layers\.(\d+)", hf_key).group(1)
+            layer_key = re.sub(r"layers\.\d+", "layers.*", hf_key)
+            model_key = name_map[layer_key]
+            model_key = re.sub(r"layers\.\*", f"layers.{layer_num}", model_key)
+        elif "blocks" in hf_key:
+            layer_num = re.search(r"blocks\.(\d+)", hf_key).group(1)
+            layer_key = re.sub(r"blocks\.\d+", "blocks.*", hf_key)
+            model_key = name_map[layer_key]
+            model_key = re.sub(r"blocks\.\*", f"blocks.{layer_num}", model_key)
+        else:
+            if hf_key not in name_map and hf_key == "lm_head":
+                logger.warning(
+                    f"Skip loading {hf_key} due to tie_word_embeddings")
+                continue
+            if hf_key not in name_map and "t2d" in hf_key:
+                logger.warning(
+                    f"Skip loading {hf_key} as it's not used in eagle-3 for now"
+                )
                 continue
+            model_key = name_map.get(hf_key, hf_key)
+        model_weight, model_sharding = get_param_and_sharding(
+            params, shardings, model_key)

-
-
-
+        logger.debug(
+            "before transform | "
+            f"{hf_key}: {hf_weight.shape} --> {model_key}: {model_weight.value.shape} {model_sharding}"
+        )
+
+        if hf_key.endswith(".bias"):
+            for key in bias_reshape_keys:
+                if key in hf_key:
+                    hf_weight = jnp.reshape(hf_weight, bias_reshape_keys[key])
+                    if head_dim_pad > 0:
+                        hf_weight = jnp.pad(hf_weight,
+                                            ((0, 0), (0, head_dim_pad)))
+                    break
+        else:
+            for key in reshape_keys:
+                if key in hf_key:
+                    hf_weight = jnp.reshape(hf_weight, reshape_keys[key])
+                    if head_dim_pad > 0:
+                        if "o_proj" in key:
+                            hf_weight = jnp.pad(hf_weight, ((0, 0), (0, 0),
+                                                            (0, head_dim_pad)))
+                        else:
+                            hf_weight = jnp.pad(hf_weight,
+                                                ((0, 0), (0, head_dim_pad),
+                                                 (0, 0)))
+                    break
+        for key in transpose_keys:
+            if key in hf_key:
+                hf_weight = jnp.transpose(hf_weight, transpose_keys[key])
+                break
+
+        # Pad num-kv-heads
+        if hf_key.endswith(".bias"):
+            for key, value in bias_pad_keys.items():
+                dim = value[0]
+                dim_size = value[1]
+                if key in hf_key and dim_size != 0:
+                    hf_weight = jnp.repeat(hf_weight, dim_size, axis=dim)
+                    break
+        else:
+            for key, value in pad_keys.items():
+                dim = value[0]
+                dim_size = value[1]
+                if key in hf_key and dim_size != 0:
+                    hf_weight = jnp.repeat(hf_weight, dim_size, axis=dim)
+                    break
+
+        logger.debug(
+            "after transform | "
+            f"{hf_key}: {hf_weight.shape} --> {model_key}: {model_weight.value.shape} {model_sharding}"
+        )

-
+        if head_dim_pad == 0:
+            assert model_weight.value.shape == hf_weight.shape, f"{hf_key}: {model_weight.value.shape} != {hf_weight.shape}"
+
+        # Update the model weight
+        spec = model_weight.sharding.spec if isinstance(
+            model_weight.sharding, NamedSharding) else model_weight.sharding
+        model_weight.value = shard(hf_weight, spec)
+
+
+def load_hf_weights(vllm_config,
+                    model: nnx.Module,
+                    metadata_map: MetadataMap,
+                    mesh: Mesh,
+                    filter_regex: str | None = None,
+                    is_draft_model: bool = False,
+                    keep_original_dtype_keys_regex: list[str] | None = None):
+    """Load weights from all model weights files to the model, run in multi threads."""
+    if is_draft_model:
+        model_path = vllm_config.speculative_config.draft_model_config.model
+    else:
+        model_path = vllm_config.model_config.model
+    weights_files = get_model_weights_files(
+        model_path, vllm_config.load_config.download_dir)
+    params = nnx.state(model)
+    max_workers = min(64, len(weights_files))
+    # NOTE(xiang): Disable multi-threading mode if running on multi-host.
+    # Because multi-threading would cause different JAX processes to load
+    # different weights at the same time.
+    if os.environ.get("TPU_MULTIHOST_BACKEND", "").lower() == "ray":
+        max_workers = 1
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = [
+            executor.submit(
+                _load_hf_weights_on_thread,
                 vllm_config,
                 params,
-                shardings,
                 metadata_map,
                 mesh,
-
-
-                keep_original_dtype_keys_regex
-
-
-
-
-        model_path = vllm_config.speculative_config.draft_model_config.model
-    else:
-        model_path = vllm_config.model_config.model
-    weights_files = get_model_weights_files(
-        model_path, vllm_config.load_config.download_dir)
-    max_workers = min(64, len(weights_files))
-    # NOTE(xiang): Disable multi-threading mode if running on multi-host.
-    # Because multi-threading would cause different JAX processes to load
-    # different weights at the same time.
-    if envs.TPU_MULTIHOST_BACKEND == "ray":
-        max_workers = 1
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        futures = [
-            executor.submit(
-                _load_hf_weights_on_thread,
-                vllm_config,
-                params,
-                metadata_map,
-                mesh,
-                weights_file,
-                filter_regex=filter_regex,
-                keep_original_dtype_keys_regex=
-                keep_original_dtype_keys_regex,
-            ) for weights_file in weights_files
-        ]
-        for future in futures:
-            future.result()
-
+                weights_file,
+                filter_regex=filter_regex,
+                keep_original_dtype_keys_regex=keep_original_dtype_keys_regex)
+            for weights_file in weights_files
+        ]
+        for future in futures:
+            future.result()
     check_all_loaded(params)
     nnx.update(model, params)

```
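The rewritten `load_hf_weights` keeps the earlier fan-out/join structure: one worker thread per weights file, joined through `future.result()` so loader exceptions propagate, and a single-worker fallback on Ray-based multihost deployments where every JAX process must load the same file at the same time. A minimal sketch of the pattern, with `load_one_file` as a hypothetical stand-in for `_load_hf_weights_on_thread`:

```python
import os
from concurrent.futures import ThreadPoolExecutor

def load_one_file(path: str) -> None:
    # Stand-in for _load_hf_weights_on_thread: parse one safetensors
    # file and copy its tensors into the shared params state.
    print(f"loading {path}")

def load_all(weights_files: list[str]) -> None:
    max_workers = min(64, len(weights_files))
    # Multi-host Ray deployments must keep all JAX processes in lockstep,
    # so the fan-out degenerates to sequential loading there.
    if os.environ.get("TPU_MULTIHOST_BACKEND", "").lower() == "ray":
        max_workers = 1
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(load_one_file, f) for f in weights_files]
        for future in futures:
            future.result()  # re-raises any loader exception

load_all(["model-00001.safetensors", "model-00002.safetensors"])
```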
```diff
--- a/tpu_inference/models/vllm/vllm_model_wrapper.py
+++ b/tpu_inference/models/vllm/vllm_model_wrapper.py
@@ -9,7 +9,6 @@ import jax
 import torch
 import torch.nn
 import torchax
-import vllm.envs as vllm_envs
 from flax.typing import PRNGKey
 from jax.sharding import Mesh, NamedSharding, PartitionSpec
 from torchax.interop import jax_view, torch_view
```
```diff
@@ -119,16 +118,10 @@ class VllmModelWrapper:
             "torch._sync",
             return_value=None) if use_random_weights else nullcontext()

-        # By default load weights to the CPU device first. If we are running
-        # under Pathways, this would cause weights to be loaded on a CPU-only
-        # node, so we'll need to remove this context.
-        jax_context = jax.default_device(
-            jax.devices("cpu")
-            [0]) if not vllm_envs.VLLM_TPU_USING_PATHWAYS else nullcontext()
-
         # Load the vLLM model and wrap it into a new model whose forward
         # function can calculate the hidden_state and logits.
-        with load_context, jax_context:
+        available_devices = self.mesh.devices.flatten()
+        with load_context, jax.default_device(available_devices[0]):
             vllm_model = vllm_get_model(vllm_config=vllm_config_for_load)
         lora_manager = None
         if vllm_config_for_load.lora_config is not None:
```
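The change above drops the CPU/Pathways special case and instead pins weight initialization to the first device of the runner's mesh. `jax.default_device` is a context manager: arrays created inside it are committed to the given device. A minimal sketch, using the first addressable device in place of `self.mesh.devices.flatten()[0]`:

```python
import jax
import jax.numpy as jnp

# Arrays created inside jax.default_device(...) land on that device
# instead of the process-wide default.
first_device = jax.devices()[0]
with jax.default_device(first_device):
    w = jnp.zeros((1024, 1024))
print(w.devices())  # {first_device}
```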
```diff
@@ -169,7 +162,6 @@ class VllmModelWrapper:
         input_ids: jax.Array,
         attn_metadata: AttentionMetadata,
         input_embeds: jax.Array,
-        input_positions: jax.Array,
         layer_name_to_kvcache_index: Sequence[Tuple[str, int]],
         lora_metadata,
         intermediate_tensors: JaxIntermediateTensors = None,
@@ -196,8 +188,8 @@ class VllmModelWrapper:
             torch_view(params_and_buffers),
             kwargs={
                 "input_ids": torch_view(input_ids),
-                "positions": torch_view(input_positions),
-                "intermediate_tensors":
+                "positions": torch_view(attn_metadata.input_positions),
+                "intermediate_tensors": intermediate_tensors,
                 "inputs_embeds": None,
             },
             tie_weights=False,
```
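The kwargs above drive a stateless call of the wrapped torch model: parameters and buffers are supplied explicitly instead of being read off the module, which is what allows the wrapper to hand in torchax views of JAX arrays. A minimal plain-PyTorch sketch of the same calling convention using `torch.func.functional_call` (the actual wrapper routes tensors through `torch_view`/`jax_view`):

```python
import torch
from torch.func import functional_call

model = torch.nn.Linear(4, 2)
# Parameters and buffers are passed explicitly, mirroring how the wrapper
# supplies torch_view(params_and_buffers) to the traced forward.
params_and_buffers = dict(model.named_parameters()) | dict(model.named_buffers())
out = functional_call(
    model,
    params_and_buffers,
    args=(),
    kwargs={"input": torch.randn(3, 4)},
    tie_weights=False,
)
print(out.shape)  # torch.Size([3, 2])
```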
```diff
@@ -221,7 +213,7 @@ class VllmModelWrapper:
         @functools.partial(
             jax.jit,
             out_shardings=(NamedSharding(self.mesh,
-                                         PartitionSpec(
+                                         PartitionSpec(None, "model"))),
         )
         def compute_logits_func(
             params_and_buffers: Any,
```
```diff
@@ -263,6 +255,7 @@ def load_lora_model(model: torch.nn.Module, vllm_config: VllmConfig,
         vllm_config,
         device,
         model.embedding_modules,
+        model.embedding_padding_modules,
     )
     return lora_manager, lora_manager.create_lora_manager(model)

@@ -276,9 +269,10 @@ def replace_set_lora(model):
         index: int,
         lora_a: torch.Tensor,
         lora_b: torch.Tensor,
+        embeddings_tensor: Optional[torch.Tensor],
     ):
         with torchax.default_env():
-            self._original_set_lora(index, lora_a, lora_b)
+            self._original_set_lora(index, lora_a, lora_b, embeddings_tensor)

     def _tpu_reset_lora(self, index: int):
         with torchax.default_env():
```
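`replace_set_lora` swaps a layer's bound `set_lora` for a wrapper that enters `torchax.default_env()` and, after this change, forwards the new `embeddings_tensor` argument. A generic sketch of that monkey-patching pattern; `runtime_env` is a hypothetical stand-in for `torchax.default_env`, and `wrap_set_lora` is an illustrative helper name:

```python
import contextlib
import types

@contextlib.contextmanager
def runtime_env():
    # Stand-in for torchax.default_env(): whatever environment the
    # original method must run under.
    yield

def wrap_set_lora(layer) -> None:
    # Keep the original bound method, then install a wrapper with the
    # extended signature in its place.
    layer._original_set_lora = layer.set_lora

    def _tpu_set_lora(self, index, lora_a, lora_b, embeddings_tensor=None):
        with runtime_env():
            self._original_set_lora(index, lora_a, lora_b, embeddings_tensor)

    layer.set_lora = types.MethodType(_tpu_set_lora, layer)
```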
```diff
--- a/tpu_inference/platforms/tpu_platform.py
+++ b/tpu_inference/platforms/tpu_platform.py
@@ -1,10 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0

+import os
 from typing import TYPE_CHECKING, Any, Optional, Tuple, Union, cast

 import jax.numpy as jnp
-import torch
 import vllm.envs as vllm_envs
+from torchax.ops.mappings import j2t_dtype
 from tpu_info import device
 from vllm.inputs import ProcessorInputs, PromptType
 from vllm.platforms.interface import Platform, PlatformEnum
```
```diff
@@ -13,10 +14,9 @@ from vllm.sampling_params import SamplingParams, SamplingType
 from tpu_inference import envs
 from tpu_inference.layers.common.sharding import ShardingConfigManager
 from tpu_inference.logger import init_logger
-from tpu_inference.utils import to_jax_dtype, to_torch_dtype

 if TYPE_CHECKING:
-    from vllm.attention.backends.registry import
+    from vllm.attention.backends.registry import _Backend
     from vllm.config import BlockSize, ModelConfig, VllmConfig
     from vllm.pooling_params import PoolingParams
 else:
```
```diff
@@ -24,10 +24,16 @@ else:
     ModelConfig = None
     VllmConfig = None
     PoolingParams = None
-
+    _Backend = None

 logger = init_logger(__name__)

+_DTYPE: dict[str, jnp.dtype] = {
+    "bfloat16": jnp.bfloat16,
+    "float": jnp.float32,
+    "float32": jnp.float32,
+}
+

 class TpuPlatform(Platform):
     _enum = PlatformEnum.TPU
```
```diff
@@ -48,13 +54,13 @@
     ]

     @classmethod
-    def get_attn_backend_cls(cls, selected_backend: "
-
-
-
-
-        from vllm.attention.backends.registry import
-        if selected_backend !=
+    def get_attn_backend_cls(cls, selected_backend: "_Backend", head_size: int,
+                             dtype: jnp.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool, use_mla: bool,
+                             has_sink: bool, use_sparse: bool,
+                             attn_type: Any) -> str:
+        from vllm.attention.backends.registry import _Backend
+        if selected_backend != _Backend.PALLAS:
             logger.info("Cannot use %s backend on TPU.", selected_backend)

         if use_v1:
```
```diff
@@ -77,14 +83,6 @@
             logger.warning(f"Error getting device name: {e}")
         return 'TPU'

-    @classmethod
-    def fp8_dtype(cls) -> torch.dtype:
-        if cls.get_device_name().lower() == "tpu v6e":
-            logger.info(
-                "Automatically using fp8_e5m2 for FP8 KV cache on TPU v6e.")
-            return torch.float8_e5m2
-        return torch.float8_e4m3fn
-
     @classmethod
     def get_device_total_memory(cls, device_id: int = 0) -> int:
         raise NotImplementedError
```
```diff
@@ -135,7 +133,6 @@
         # For v0, the default block size is 16.
         if cache_config and cache_config.block_size is None:
             cache_config.block_size = cast(BlockSize, 16)
-
         compilation_config = vllm_config.compilation_config

         # TPU only supports DYNAMO_TRACE_ONCE compilation level
```
```diff
@@ -152,19 +149,20 @@
         # NOTE(xiang): convert dtype to jnp.dtype
         # NOTE(wenlong): skip this logic for mm model preprocessing
         # For mm model preprocessors, it may need the output dtype to be torch.
-        # In order to avoid a PR to vLLM, we postpone the dtype checking during
-        # tpu_worker initialization
+        # In order to avoid a PR to vLLM, we postpone the dtype checking during tpu_worker initialization
         if not vllm_config.scheduler_config.is_multimodal_model or impl == "vllm":
-
-
-
-
-
-
-            dtype =
-
-
-
+            if not isinstance(vllm_config.model_config.dtype, str):
+                logger.warning(
+                    "The model dtype is not properly set for JAX backend. "
+                    "Overwriting it to jnp.bfloat16")
+                vllm_config.model_config.dtype = jnp.bfloat16
+            else:
+                vllm_config.model_config.dtype = _DTYPE.get(
+                    vllm_config.model_config.dtype, jnp.bfloat16)
+
+            if impl == "vllm":
+                vllm_config.model_config.dtype = j2t_dtype(
+                    vllm_config.model_config.dtype.dtype)

         # TODO(cuiq): remove this dependency.
         from vllm.v1.attention.backends.pallas import PallasAttentionBackend
```
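`check_and_update_config` now resolves the configured dtype itself: string names go through the `_DTYPE` table with `jnp.bfloat16` as the fallback, non-string values are overwritten to `jnp.bfloat16`, and the vLLM-native path converts the result back to a torch dtype with `j2t_dtype`. A minimal sketch of the lookup; `normalize_dtype` is an illustrative helper, not part of the module:

```python
import jax.numpy as jnp

_DTYPE = {"bfloat16": jnp.bfloat16, "float": jnp.float32, "float32": jnp.float32}

def normalize_dtype(dtype):
    # Anything that is not a recognized string collapses to bfloat16,
    # matching the warning-and-overwrite branch above.
    if not isinstance(dtype, str):
        return jnp.bfloat16
    return _DTYPE.get(dtype, jnp.bfloat16)

print(normalize_dtype("float32"))  # jnp.float32
print(normalize_dtype("float16"))  # falls back to jnp.bfloat16
```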
```diff
@@ -185,7 +183,7 @@
         parallel_config.worker_cls = \
             "tpu_inference.worker.tpu_worker.TPUWorker"

-        multihost_backend =
+        multihost_backend = os.environ.get("TPU_MULTIHOST_BACKEND", "").lower()
         if not multihost_backend:  # Single host
             if parallel_config.pipeline_parallel_size == 1:
                 logger.info("Force using UniProcExecutor for JAX on \
```
```diff
@@ -269,7 +267,3 @@
         Returns if the current platform needs to sync weight loader.
         """
         return True
-
-    @classmethod
-    def support_hybrid_kv_cache(cls) -> bool:
-        return True
```