tpu-inference 0.0.1rc1__py3-none-any.whl → 0.11.1.dev202511130813__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of tpu-inference might be problematic.
- tests/kernels/fused_moe_v1_test.py +34 -303
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +2 -2
- tests/lora/test_layers.py +6 -0
- tests/lora/utils.py +8 -0
- tests/test_utils.py +16 -24
- tpu_inference/__init__.py +3 -22
- tpu_inference/core/core_tpu.py +9 -17
- tpu_inference/core/disagg_utils.py +8 -6
- tpu_inference/distributed/tpu_connector.py +4 -3
- tpu_inference/distributed/utils.py +2 -3
- tpu_inference/envs.py +8 -61
- tpu_inference/executors/ray_distributed_executor.py +11 -31
- tpu_inference/kernels/fused_moe/v1/kernel.py +110 -641
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +54 -77
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +143 -287
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +0 -7
- tpu_inference/layers/jax/attention/attention.py +1 -1
- tpu_inference/layers/{common → jax}/attention_interface.py +2 -8
- tpu_inference/layers/jax/sample/rejection_sampler.py +1 -1
- tpu_inference/layers/jax/sample/sampling.py +2 -2
- tpu_inference/layers/{common → jax}/sharding.py +5 -5
- tpu_inference/layers/vllm/attention.py +1 -1
- tpu_inference/layers/vllm/fused_moe.py +208 -170
- tpu_inference/layers/vllm/quantization/__init__.py +3 -7
- tpu_inference/layers/vllm/quantization/awq.py +3 -4
- tpu_inference/layers/vllm/quantization/common.py +1 -6
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +2 -4
- tpu_inference/layers/vllm/quantization/unquantized.py +67 -62
- tpu_inference/layers/vllm/sharding.py +2 -2
- tpu_inference/lora/torch_punica_tpu.py +2 -1
- tpu_inference/mock/__init__.py +0 -0
- tpu_inference/mock/vllm_config_utils.py +28 -0
- tpu_inference/mock/vllm_envs.py +1219 -0
- tpu_inference/mock/vllm_logger.py +212 -0
- tpu_inference/mock/vllm_logging_utils.py +15 -0
- tpu_inference/models/common/model_loader.py +12 -46
- tpu_inference/models/jax/llama3.py +3 -4
- tpu_inference/models/jax/llama_eagle3.py +5 -8
- tpu_inference/models/jax/phi3.py +376 -0
- tpu_inference/models/jax/qwen2.py +2 -3
- tpu_inference/models/jax/qwen2_5_vl.py +50 -165
- tpu_inference/models/jax/qwen3.py +2 -3
- tpu_inference/models/jax/utils/quantization/quantization_utils.py +6 -3
- tpu_inference/models/jax/utils/weight_utils.py +143 -198
- tpu_inference/models/vllm/vllm_model_wrapper.py +14 -32
- tpu_inference/platforms/tpu_platform.py +34 -47
- tpu_inference/runner/compilation_manager.py +60 -145
- tpu_inference/runner/kv_cache.py +2 -2
- tpu_inference/runner/kv_cache_manager.py +18 -17
- tpu_inference/runner/persistent_batch_manager.py +2 -40
- tpu_inference/runner/structured_decoding_manager.py +3 -2
- tpu_inference/runner/tpu_runner.py +135 -283
- tpu_inference/runner/utils.py +2 -2
- tpu_inference/spec_decode/jax/eagle3.py +21 -71
- tpu_inference/tpu_info.py +3 -4
- tpu_inference/utils.py +15 -38
- tpu_inference/worker/tpu_worker.py +26 -163
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/METADATA +3 -4
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/RECORD +63 -61
- tests/test_envs.py +0 -203
- tpu_inference/layers/common/quant_methods.py +0 -8
- tpu_inference/layers/vllm/quantization/mxfp4.py +0 -331
- tpu_inference/models/jax/llama_guard_4.py +0 -361
- /tpu_inference/layers/{common → jax}/binary_search.py +0 -0
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/WHEEL +0 -0
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/top_level.txt +0 -0
tpu_inference/mock/vllm_logger.py
@@ -0,0 +1,212 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Logging configuration for vLLM."""
+import datetime
+import json
+import logging
+import sys
+from collections.abc import Hashable
+from functools import lru_cache, partial
+from logging import Logger
+from logging.config import dictConfig
+from os import path
+from types import MethodType
+from typing import Any, cast
+
+import tpu_inference.mock.vllm_envs as envs
+
+VLLM_CONFIGURE_LOGGING = envs.VLLM_CONFIGURE_LOGGING
+VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH
+VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL
+VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX
+
+_FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s "
+           "[%(filename)s:%(lineno)d] %(message)s")
+_DATE_FORMAT = "%m-%d %H:%M:%S"
+
+DEFAULT_LOGGING_CONFIG = {
+    "formatters": {
+        "vllm": {
+            "class": "tpu_inference.vllm_logging_utils.NewLineFormatter",
+            "datefmt": _DATE_FORMAT,
+            "format": _FORMAT,
+        },
+    },
+    "handlers": {
+        "vllm": {
+            "class": "logging.StreamHandler",
+            "formatter": "vllm",
+            "level": VLLM_LOGGING_LEVEL,
+            "stream": "ext://sys.stdout",
+        },
+    },
+    "loggers": {
+        "vllm": {
+            "handlers": ["vllm"],
+            "level": "DEBUG",
+            "propagate": False,
+        },
+    },
+    "version": 1,
+    "disable_existing_loggers": False
+}
+
+
+@lru_cache
+def _print_debug_once(logger: Logger, msg: str, *args: Hashable) -> None:
+    # Set the stacklevel to 2 to print the original caller's line info
+    logger.debug(msg, *args, stacklevel=2)
+
+
+@lru_cache
+def _print_info_once(logger: Logger, msg: str, *args: Hashable) -> None:
+    # Set the stacklevel to 2 to print the original caller's line info
+    logger.info(msg, *args, stacklevel=2)
+
+
+@lru_cache
+def _print_warning_once(logger: Logger, msg: str, *args: Hashable) -> None:
+    # Set the stacklevel to 2 to print the original caller's line info
+    logger.warning(msg, *args, stacklevel=2)
+
+
+class _VllmLogger(Logger):
+    """
+    Note:
+        This class is just to provide type information.
+        We actually patch the methods directly on the [`logging.Logger`][]
+        instance to avoid conflicting with other libraries such as
+        `intel_extension_for_pytorch.utils._logger`.
+    """
+
+    def debug_once(self, msg: str, *args: Hashable) -> None:
+        """
+        As [`debug`][logging.Logger.debug], but subsequent calls with
+        the same message are silently dropped.
+        """
+        _print_debug_once(self, msg, *args)
+
+    def info_once(self, msg: str, *args: Hashable) -> None:
+        """
+        As [`info`][logging.Logger.info], but subsequent calls with
+        the same message are silently dropped.
+        """
+        _print_info_once(self, msg, *args)
+
+    def warning_once(self, msg: str, *args: Hashable) -> None:
+        """
+        As [`warning`][logging.Logger.warning], but subsequent calls with
+        the same message are silently dropped.
+        """
+        _print_warning_once(self, msg, *args)
+
+
+# Pre-defined methods mapping to avoid repeated dictionary creation
+_METHODS_TO_PATCH = {
+    "debug_once": _print_debug_once,
+    "info_once": _print_info_once,
+    "warning_once": _print_warning_once,
+}
+
+
+def _configure_vllm_root_logger() -> None:
+    logging_config = dict[str, Any]()
+
+    if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH:
+        raise RuntimeError(
+            "VLLM_CONFIGURE_LOGGING evaluated to false, but "
+            "VLLM_LOGGING_CONFIG_PATH was given. VLLM_LOGGING_CONFIG_PATH "
+            "implies VLLM_CONFIGURE_LOGGING. Please enable "
+            "VLLM_CONFIGURE_LOGGING or unset VLLM_LOGGING_CONFIG_PATH.")
+
+    if VLLM_CONFIGURE_LOGGING:
+        logging_config = DEFAULT_LOGGING_CONFIG
+
+    if VLLM_LOGGING_CONFIG_PATH:
+        if not path.exists(VLLM_LOGGING_CONFIG_PATH):
+            raise RuntimeError(
+                "Could not load logging config. File does not exist: %s",
+                VLLM_LOGGING_CONFIG_PATH)
+        with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
+            custom_config = json.loads(file.read())
+
+        if not isinstance(custom_config, dict):
+            raise ValueError("Invalid logging config. Expected dict, got %s.",
+                             type(custom_config).__name__)
+        logging_config = custom_config
+
+    for formatter in logging_config.get("formatters", {}).values():
+        # This provides backwards compatibility after #10134.
+        if formatter.get(
+                "class"
+        ) == "tpu_inference.vllm_logging_utils.NewLineFormatter":
+            formatter[
+                "class"] = "tpu_inference.mock.vllm_logging_utils.NewLineFormatter"
+
+    if logging_config:
+        dictConfig(logging_config)
+
+
+# The root logger is initialized when the module is imported.
+# This is thread-safe as the module is only imported once,
+# guaranteed by the Python GIL.
+_configure_vllm_root_logger()
+
+
+def init_logger(name: str) -> _VllmLogger:
+    """The main purpose of this function is to ensure that loggers are
+    retrieved in such a way that we can be sure the root vllm logger has
+    already been configured."""
+
+    logger = logging.getLogger(name)
+
+    for method_name, method in _METHODS_TO_PATCH.items():
+        setattr(logger, method_name, MethodType(method, logger))
+
+    return cast(_VllmLogger, logger)
+
+
+logger = init_logger(__name__)
+
+
+def _trace_calls(log_path, root_dir, frame, event, arg=None):
+    if event in ['call', 'return']:
+        # Extract the filename, line number, function name, and the code object
+        filename = frame.f_code.co_filename
+        lineno = frame.f_lineno
+        func_name = frame.f_code.co_name
+        if not filename.startswith(root_dir):
+            # only log the functions in the vllm root_dir
+            return
+        # Log every function call or return
+        try:
+            last_frame = frame.f_back
+            if last_frame is not None:
+                last_filename = last_frame.f_code.co_filename
+                last_lineno = last_frame.f_lineno
+                last_func_name = last_frame.f_code.co_name
+            else:
+                # initial frame
+                last_filename = ""
+                last_lineno = 0
+                last_func_name = ""
+            with open(log_path, 'a') as f:
+                ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
+                if event == 'call':
+                    f.write(f"{ts} Call to"
+                            f" {func_name} in {filename}:{lineno}"
+                            f" from {last_func_name} in {last_filename}:"
+                            f"{last_lineno}\n")
+                else:
+                    f.write(f"{ts} Return from"
+                            f" {func_name} in {filename}:{lineno}"
+                            f" to {last_func_name} in {last_filename}:"
+                            f"{last_lineno}\n")
+        except NameError:
+            # modules are deleted during shutdown
+            pass
+    return partial(_trace_calls, log_path, root_dir)
+
+
+def enable_trace_function_call(log_file_path: str, root_dir: str):
+    sys.settrace(partial(_trace_calls, log_file_path, root_dir))
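Note: this mock module mirrors vLLM's logger so tpu-inference can log without importing vLLM itself. A minimal usage sketch (assuming this wheel is installed; the logger name "my_module" is illustrative):

    from tpu_inference.mock.vllm_logger import init_logger

    logger = init_logger("my_module")
    logger.info("logged on every call")
    logger.info_once("logged the first time only")
    logger.info_once("logged the first time only")  # deduplicated via lru_cache, silently dropped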
tpu_inference/mock/vllm_logging_utils.py
@@ -0,0 +1,15 @@
+import logging
+
+
+class NewLineFormatter(logging.Formatter):
+    """Adds logging prefix to newlines to align multi-line messages."""
+
+    def __init__(self, fmt, datefmt=None, style="%"):
+        logging.Formatter.__init__(self, fmt, datefmt, style)
+
+    def format(self, record):
+        msg = logging.Formatter.format(self, record)
+        if record.message != "":
+            parts = msg.split(record.message)
+            msg = msg.replace("\n", "\r\n" + parts[0])
+        return msg
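Note: NewLineFormatter re-applies the computed log prefix after every newline, so multi-line messages stay aligned. A small self-contained sketch (the handler setup is illustrative, not from this package):

    import logging

    from tpu_inference.mock.vllm_logging_utils import NewLineFormatter

    handler = logging.StreamHandler()
    handler.setFormatter(NewLineFormatter("%(levelname)s %(message)s"))
    demo = logging.getLogger("demo")
    demo.addHandler(handler)
    demo.warning("first line\nsecond line")
    # Prints (prefix repeated after the newline):
    # WARNING first line
    # WARNING second line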
tpu_inference/models/common/model_loader.py
@@ -8,13 +8,10 @@ from jax.sharding import Mesh, NamedSharding, PartitionSpec
 from torchax.ops.mappings import j2t_dtype
 from transformers import PretrainedConfig
 from vllm.config import VllmConfig
-from vllm.model_executor.model_loader import get_model_loader
-from vllm.model_executor.model_loader.runai_streamer_loader import \
-    RunaiModelStreamerLoader
 from vllm.utils.func_utils import supports_kw
 
 from tpu_inference import envs
-from tpu_inference.layers.common.sharding import ShardingAxisName
+from tpu_inference.layers.jax.sharding import ShardingAxisName
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.utils.quantization.quantization_utils import (
     apply_qwix_on_abstract_model, apply_qwix_quantization,
@@ -39,17 +36,19 @@ def _get_model_architecture(config: PretrainedConfig) -> nnx.Module:
     from tpu_inference.models.jax.llama3 import LlamaForCausalLM
     from tpu_inference.models.jax.llama4 import Llama4ForCausalLM
     from tpu_inference.models.jax.llama_eagle3 import EagleLlama3ForCausalLM
-    from tpu_inference.models.jax.
+    from tpu_inference.models.jax.phi3 import Phi3ForCausalLM
+    from tpu_inference.models.jax.qwen2 import Qwen2ForCausalLM
     from tpu_inference.models.jax.qwen2_5_vl import \
         Qwen2_5_VLForConditionalGeneration
     from tpu_inference.models.jax.qwen3 import Qwen3ForCausalLM
     _MODEL_REGISTRY["Llama4ForCausalLM"] = Llama4ForCausalLM
     _MODEL_REGISTRY["DeepseekV3ForCausalLM"] = DeepSeekV3
     _MODEL_REGISTRY["LlamaForCausalLM"] = LlamaForCausalLM
-    _MODEL_REGISTRY["
+    _MODEL_REGISTRY["Qwen2ForCausalLM"] = Qwen2ForCausalLM
     _MODEL_REGISTRY["Qwen3ForCausalLM"] = Qwen3ForCausalLM
     _MODEL_REGISTRY[
         "Qwen2_5_VLForConditionalGeneration"] = Qwen2_5_VLForConditionalGeneration
+    _MODEL_REGISTRY["Phi3ForCausalLM"] = Phi3ForCausalLM
     _MODEL_REGISTRY["Eagle3LlamaForCausalLM"] = EagleLlama3ForCausalLM
     _MODEL_REGISTRY["GptOssForCausalLM"] = GptOss
 
@@ -58,10 +57,8 @@ def _get_model_architecture(config: PretrainedConfig) -> nnx.Module:
     if arch in _MODEL_REGISTRY:
         return _MODEL_REGISTRY[arch]
     raise UnsupportedArchitectureError(
-        f"Model architectures {architectures} not "
-        "
-        f"Pytorch definition. JAX-native architectures: {list(_MODEL_REGISTRY.keys())}"
-    )
+        f"Model architectures {architectures} are not supported for now. "
+        f"Supported architectures: {list(_MODEL_REGISTRY.keys())}")
 
 
 def _get_nnx_model(
@@ -180,23 +177,7 @@ def _get_nnx_model(
     # the model creation again, otherwise the model forward will have
     # non-trivial overhead in PjitFunction.
     with mesh:
-
-        if isinstance(loader, RunaiModelStreamerLoader):
-            model_weights = vllm_config.model_config.model
-            if hasattr(vllm_config.model_config, "model_weights"):
-                model_weights = vllm_config.model_config.model_weights
-            weights_iterator = loader._get_weights_iterator(
-                model_weights, vllm_config.model_config.revision)
-            # We set the weights iterator at runtime, to prevent having to change
-            # every model's load_weights signature. This also prevents us from hitting
-            # a TypeError at runtime if you use the RunaiModelStreamerLoader with any
-            # flax_nnx model whose load_weights function does not accept the
-            # weights_iterator keyword argument.
-            vllm_config.model_config.model_weights_iterator = weights_iterator
-            model.load_weights(rng)
-            del vllm_config.model_config.model_weights_iterator
-        else:
-            model.load_weights(rng)
+        model.load_weights(rng)
     jit_model = create_jit_model(
         model,
         use_qwix_on_abstract_model=should_apply_qwix_on_abstract_model)
@@ -236,9 +217,7 @@ def get_flax_model(
             hidden_states_sharding,  # aux hidden states
         ),
         donate_argnums=2,  # 0 is graphdef, 1 is state, 2 is kv_cache
-        static_argnums=(
-            7, 10, 11
-        ),  #7 is layer_name_to_kvcache_index, 10 is is_first_rank, 11 is is_last_rank
+        static_argnums=6,  #6 is layer_name_to_kvcache_index
     )
     def run_model(graphdef, state, *args):
         model = nnx.merge(graphdef, state)
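Note: `static_argnums` in `jax.jit` marks positional arguments as compile-time constants, so the change above (three static arguments down to one) shrinks the set of values whose change forces a retrace. A generic illustration (toy function, not from this package):

    import functools

    import jax
    import jax.numpy as jnp

    @functools.partial(jax.jit, static_argnums=1)
    def scale(x, factor):
        # `factor` is baked into the compiled program as a constant.
        return x * factor

    scale(jnp.ones(4), 2)  # traces and compiles for factor=2
    scale(jnp.ones(4), 3)  # new static value, so JAX retraces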
@@ -263,11 +242,10 @@
         model = nnx.merge(graphdef, state)
         return model.get_multimodal_embeddings(image_grid_thw, **kwargs)
 
-    embed_sharding = NamedSharding(mesh, PartitionSpec(None))
     # This function will calculates the embeddings of input texts and then merge with the image embeddings
     @functools.partial(
         jax.jit,
-        out_shardings=(embed_sharding),
+        out_shardings=(logits_sharding),
     )
     def run_get_input_embeddings(graphdef, state, *args, **kwargs):
         model = nnx.merge(graphdef, state)
@@ -347,8 +325,8 @@ def get_model(
         # Convert the error message to a string to check its contents
         error_msg = str(e)
 
-        logger.warning(error_msg
-
+        logger.warning(f"Flax model failed with: '{error_msg}'. "
+                       "Falling back to vLLM implementation.")
         # Fall back to the vLLM model and updating the dtype accordingly
         vllm_config.model_config.dtype = j2t_dtype(
             vllm_config.model_config.dtype.dtype)
@@ -442,17 +420,6 @@ def register_model(arch: str, model: Any) -> None:
             "This is a JAX model and does not implement the PyTorch forward method."
         )
 
-    # Same as `forward`, this is a dummy method to satisfy vLLM's type checks.
-    def unimplemented_get_input_embeddings(
-        self,
-        input_ids: "torch.Tensor",
-        positions: "torch.Tensor",
-        inputs_embeds: Optional["torch.Tensor"] = None,
-    ) -> "torch.Tensor":
-        raise NotImplementedError(
-            "This is a JAX model and does not implement the PyTorch get_input_embeddings method."
-        )
-
     # We need a custom __init__ that only calls torch.nn.Module's init,
     # to avoid triggering JAX logic when vLLM inspects the class.
     def wrapper_init(self, *args, **kwargs):
@@ -466,7 +433,6 @@ def register_model(arch: str, model: Any) -> None:
         {
             "__init__": wrapper_init,
             "forward": unimplemented_forward,
-            "get_input_embeddings": unimplemented_get_input_embeddings,
             # Prevent vLLM from trying to load weights into this dummy class.
             "load_weights": lambda self, *args, **kwargs: None,
         })
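Note: `register_model` pairs a HuggingFace architecture string with a JAX model class, while the generated wrapper satisfies vLLM's PyTorch-oriented type checks (hence the dummy `forward` and no-op `load_weights`). A hedged sketch of registering an architecture (the alias name is illustrative):

    from tpu_inference.models.common.model_loader import register_model
    from tpu_inference.models.jax.phi3 import Phi3ForCausalLM

    # Register the JAX Phi-3 implementation under a custom architecture name.
    register_model("MyPhi3Variant", Phi3ForCausalLM)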
tpu_inference/models/jax/llama3.py
@@ -8,10 +8,10 @@ from transformers import LlamaConfig, modeling_flax_utils
 from vllm.config import VllmConfig
 
 from tpu_inference import utils
-from tpu_inference.layers.common.attention_interface import attention
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
-from tpu_inference.layers.common.sharding import ShardingAxisName
+from tpu_inference.layers.jax.attention_interface import attention
 from tpu_inference.layers.jax.rope_interface import apply_rope
+from tpu_inference.layers.jax.sharding import ShardingAxisName
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.utils.weight_utils import (get_default_maps,
                                                          load_hf_weights)
@@ -368,8 +368,7 @@ class LlamaForCausalLM(nnx.Module):
             "lm_head": "model.lm_head",
         })
 
-        metadata_map = get_default_maps(self.vllm_config.model_config,
-                                        self.mesh, mappings)
+        metadata_map = get_default_maps(self.vllm_config, self.mesh, mappings)
         load_hf_weights(vllm_config=self.vllm_config,
                         model=self,
                         metadata_map=metadata_map,
tpu_inference/models/jax/llama_eagle3.py
@@ -194,12 +194,13 @@ class Eagle3LlamaModel(nnx.Module):
 
 def update_reshape_map_for_eagle3(vllm_config: VllmConfig,
                                   metadata_map: MetadataMap):
-    model_config = vllm_config.speculative_config.draft_model_config
+    model_config = vllm_config.model_config
     hf_config = model_config.hf_config
 
     num_heads = hf_config.num_attention_heads
     num_kv_heads = hf_config.num_key_value_heads
-    hidden_size = hf_config.hidden_size
+    hidden_size = model_config.get_hidden_size()
+
     head_dim_original = model_config.get_head_size()
 
     metadata_map.reshape_map.update({
@@ -304,8 +305,6 @@ class EagleLlama3ForCausalLM(nnx.Module):
             "fc": "model.fc.kernel",
             "lm_head": "lm_head.kernel",
             "d2t": "draft_id_to_target_id",
-            "embed_tokens":
-            "model.embed_tokens.embedding",  # Some checkpoints need this
         }
 
         # Define keys to keep in original dtype (e.g., float32 for stability)
@@ -313,9 +312,7 @@ class EagleLlama3ForCausalLM(nnx.Module):
             r".*d2t.*",
         ]
 
-        metadata_map = get_default_maps(
-            self.vllm_config.speculative_config.draft_model_config, self.mesh,
-            mappings)
+        metadata_map = get_default_maps(self.vllm_config, self.mesh, mappings)
 
         update_reshape_map_for_eagle3(self.vllm_config, metadata_map)
 
@@ -327,7 +324,7 @@ class EagleLlama3ForCausalLM(nnx.Module):
             is_draft_model=True,
             keep_original_dtype_keys_regex=keep_original_dtype_keys_regex)
 
-        # If the embedding is not initialized, initialize it with a
+        # If the embedding is not initialized, initialize it with a dummpy array here to pass jit compilation. The real weights will be shared from the target model in eagle3 class.
         if isinstance(self.model.embed_tokens.embedding.value,
                       jax.ShapeDtypeStruct):
             self.model.embed_tokens.embedding.value = jnp.zeros(