PyPI - tpu-inference - Versions diffs - 0.11.1__py3-none-any.whl - Mend

tpu-inference 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tpu-inference might be problematic. Click here for more details.

Files changed (168) hide show

tests/__init__.py +0 -0
tests/core/__init__.py +0 -0
tests/core/test_adapters.py +83 -0
tests/core/test_core_tpu.py +523 -0
tests/core/test_disagg_executor.py +60 -0
tests/core/test_disagg_utils.py +53 -0
tests/core/test_init.py +49 -0
tests/kernels/__init__.py +0 -0
tests/kernels/quantized_matmul_kernel_test.py +191 -0
tests/kernels/ragged_kv_cache_update_v2_test.py +234 -0
tests/kernels/ragged_paged_attention_kernel_v2_test.py +400 -0
tests/kernels/ragged_paged_attention_kernel_v3_test.py +504 -0
tests/lora/__init__.py +0 -0
tests/lora/test_lora.py +123 -0
tests/test_base.py +201 -0
tests/test_quantization.py +836 -0
tests/test_tpu_info.py +120 -0
tests/test_utils.py +218 -0
tests/tpu_backend_test.py +59 -0
tpu_inference/__init__.py +30 -0
tpu_inference/adapters/__init__.py +0 -0
tpu_inference/adapters/vllm_adapters.py +42 -0
tpu_inference/adapters/vllm_config_adapters.py +134 -0
tpu_inference/backend.py +69 -0
tpu_inference/core/__init__.py +0 -0
tpu_inference/core/adapters.py +153 -0
tpu_inference/core/core_tpu.py +776 -0
tpu_inference/core/disagg_executor.py +117 -0
tpu_inference/core/disagg_utils.py +51 -0
tpu_inference/di/__init__.py +0 -0
tpu_inference/di/abstracts.py +28 -0
tpu_inference/di/host.py +76 -0
tpu_inference/di/interfaces.py +51 -0
tpu_inference/distributed/__init__.py +0 -0
tpu_inference/distributed/tpu_connector.py +699 -0
tpu_inference/distributed/utils.py +59 -0
tpu_inference/executors/__init__.py +0 -0
tpu_inference/executors/ray_distributed_executor.py +346 -0
tpu_inference/experimental/__init__.py +0 -0
tpu_inference/experimental/llama3_jax_stashed.py +258 -0
tpu_inference/interfaces/__init__.py +0 -0
tpu_inference/interfaces/cache.py +31 -0
tpu_inference/interfaces/config.py +47 -0
tpu_inference/interfaces/config_parts.py +117 -0
tpu_inference/interfaces/engine.py +51 -0
tpu_inference/interfaces/outputs.py +22 -0
tpu_inference/interfaces/params.py +21 -0
tpu_inference/interfaces/platform.py +74 -0
tpu_inference/interfaces/request.py +39 -0
tpu_inference/interfaces/scheduler.py +31 -0
tpu_inference/kernels/__init__.py +0 -0
tpu_inference/kernels/collectives/__init__.py +0 -0
tpu_inference/kernels/collectives/all_gather_matmul.py +735 -0
tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +60 -0
tpu_inference/kernels/collectives/util.py +47 -0
tpu_inference/kernels/flash_attention/__init__.py +0 -0
tpu_inference/kernels/flash_attention/kernel.py +772 -0
tpu_inference/kernels/quantized_matmul/__init__.py +0 -0
tpu_inference/kernels/quantized_matmul/kernel.py +395 -0
tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
tpu_inference/kernels/quantized_matmul/util.py +58 -0
tpu_inference/kernels/ragged_paged_attention/__init__.py +0 -0
tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +0 -0
tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +875 -0
tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +287 -0
tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +0 -0
tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1447 -0
tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3834 -0
tpu_inference/kernels/ragged_paged_attention/v3/util.py +47 -0
tpu_inference/layers/__init__.py +0 -0
tpu_inference/layers/common/__init__.py +0 -0
tpu_inference/layers/common/attention_metadata.py +34 -0
tpu_inference/layers/jax/__init__.py +0 -0
tpu_inference/layers/jax/attention/__init__.py +0 -0
tpu_inference/layers/jax/attention/attention.py +254 -0
tpu_inference/layers/jax/attention/deepseek_v3_attention.py +354 -0
tpu_inference/layers/jax/attention/llama4_attention.py +153 -0
tpu_inference/layers/jax/attention_interface.py +356 -0
tpu_inference/layers/jax/base.py +151 -0
tpu_inference/layers/jax/binary_search.py +295 -0
tpu_inference/layers/jax/constants.py +88 -0
tpu_inference/layers/jax/layers.py +301 -0
tpu_inference/layers/jax/misc.py +16 -0
tpu_inference/layers/jax/moe/__init__.py +0 -0
tpu_inference/layers/jax/moe/deepseek_v3_moe.py +608 -0
tpu_inference/layers/jax/moe/moe.py +209 -0
tpu_inference/layers/jax/rope.py +172 -0
tpu_inference/layers/jax/rope_interface.py +214 -0
tpu_inference/layers/jax/sample/__init__.py +0 -0
tpu_inference/layers/jax/sample/rejection_sampler.py +515 -0
tpu_inference/layers/jax/sample/sampling.py +95 -0
tpu_inference/layers/jax/sample/sampling_metadata.py +69 -0
tpu_inference/layers/jax/sharding.py +406 -0
tpu_inference/layers/jax/transformer_block.py +76 -0
tpu_inference/layers/vllm/__init__.py +0 -0
tpu_inference/layers/vllm/attention.py +184 -0
tpu_inference/layers/vllm/fused_moe.py +399 -0
tpu_inference/layers/vllm/linear_common.py +186 -0
tpu_inference/layers/vllm/quantization/__init__.py +34 -0
tpu_inference/layers/vllm/quantization/awq.py +207 -0
tpu_inference/layers/vllm/quantization/common.py +105 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +0 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +121 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +0 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +208 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +136 -0
tpu_inference/layers/vllm/quantization/unquantized.py +263 -0
tpu_inference/layers/vllm/sharding.py +151 -0
tpu_inference/logger.py +10 -0
tpu_inference/lora/__init__.py +0 -0
tpu_inference/lora/torch_lora_ops.py +103 -0
tpu_inference/lora/torch_punica_tpu.py +308 -0
tpu_inference/mock/__init__.py +0 -0
tpu_inference/mock/vllm_config_utils.py +28 -0
tpu_inference/mock/vllm_envs.py +1233 -0
tpu_inference/mock/vllm_logger.py +212 -0
tpu_inference/mock/vllm_logging_utils.py +15 -0
tpu_inference/models/__init__.py +0 -0
tpu_inference/models/common/__init__.py +0 -0
tpu_inference/models/common/model_loader.py +433 -0
tpu_inference/models/jax/__init__.py +0 -0
tpu_inference/models/jax/deepseek_v3.py +868 -0
tpu_inference/models/jax/llama3.py +366 -0
tpu_inference/models/jax/llama4.py +473 -0
tpu_inference/models/jax/llama_eagle3.py +333 -0
tpu_inference/models/jax/phi3.py +376 -0
tpu_inference/models/jax/qwen2.py +375 -0
tpu_inference/models/jax/qwen2_5_vl.py +976 -0
tpu_inference/models/jax/qwen3.py +302 -0
tpu_inference/models/jax/utils/__init__.py +0 -0
tpu_inference/models/jax/utils/file_utils.py +96 -0
tpu_inference/models/jax/utils/multi_modal_utils.py +164 -0
tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
tpu_inference/models/jax/utils/quantization/quantization_utils.py +588 -0
tpu_inference/models/jax/utils/weight_utils.py +510 -0
tpu_inference/models/vllm/__init__.py +0 -0
tpu_inference/models/vllm/vllm_model_wrapper.py +272 -0
tpu_inference/models/vllm/vllm_model_wrapper_context.py +45 -0
tpu_inference/platforms/__init__.py +2 -0
tpu_inference/platforms/tpu_jax.py +257 -0
tpu_inference/runner/__init__.py +0 -0
tpu_inference/runner/block_table_jax.py +122 -0
tpu_inference/runner/compilation_manager.py +672 -0
tpu_inference/runner/input_batch_jax.py +435 -0
tpu_inference/runner/kv_cache.py +119 -0
tpu_inference/runner/kv_cache_manager.py +460 -0
tpu_inference/runner/lora_utils.py +92 -0
tpu_inference/runner/multimodal_manager.py +208 -0
tpu_inference/runner/persistent_batch_manager.py +244 -0
tpu_inference/runner/speculative_decoding_manager.py +250 -0
tpu_inference/runner/structured_decoding_manager.py +89 -0
tpu_inference/runner/tpu_jax_runner.py +771 -0
tpu_inference/runner/utils.py +426 -0
tpu_inference/spec_decode/__init__.py +0 -0
tpu_inference/spec_decode/jax/__init__.py +0 -0
tpu_inference/spec_decode/jax/eagle3.py +334 -0
tpu_inference/tpu_info.py +77 -0
tpu_inference/utils.py +294 -0
tpu_inference/worker/__init__.py +0 -0
tpu_inference/worker/_temporary_vllm_compat.py +129 -0
tpu_inference/worker/base.py +100 -0
tpu_inference/worker/tpu_worker_jax.py +321 -0
tpu_inference-0.11.1.dist-info/METADATA +101 -0
tpu_inference-0.11.1.dist-info/RECORD +168 -0
tpu_inference-0.11.1.dist-info/WHEEL +5 -0
tpu_inference-0.11.1.dist-info/licenses/LICENSE +201 -0
tpu_inference-0.11.1.dist-info/top_level.txt +2 -0

tpu_inference/mock/vllm_logger.py ADDED Viewed

@@ -0,0 +1,212 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Logging configuration for vLLM."""
+import datetime
+import json
+import logging
+import sys
+from collections.abc import Hashable
+from functools import lru_cache, partial
+from logging import Logger
+from logging.config import dictConfig
+from os import path
+from types import MethodType
+from typing import Any, cast
+import tpu_inference.mock.vllm_envs as envs
+# Logging settings are read once from the mock envs module when this module is
+# imported; later changes to `envs` are not picked up by these names.
+VLLM_CONFIGURE_LOGGING = envs.VLLM_CONFIGURE_LOGGING
+VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH
+VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL
+VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX
+# Record layout: <prefix><level> <timestamp> [<file>:<line>] <message>.
+_FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s "
+           "[%(filename)s:%(lineno)d] %(message)s")
+# Month-day and wall-clock time; the year is not printed.
+_DATE_FORMAT = "%m-%d %H:%M:%S"
+# Default configuration in the `logging.config.dictConfig` schema. It is used by
+# `_configure_vllm_root_logger` when VLLM_CONFIGURE_LOGGING is truthy and no
+# custom config file is given.
+DEFAULT_LOGGING_CONFIG = {
+    "formatters": {
+        "vllm": {
+            # Legacy dotted path: `_configure_vllm_root_logger` rewrites it to
+            # "tpu_inference.mock.vllm_logging_utils.NewLineFormatter" before
+            # the config is handed to dictConfig.
+            "class": "tpu_inference.vllm_logging_utils.NewLineFormatter",
+            "datefmt": _DATE_FORMAT,
+            "format": _FORMAT,
+        },
+    },
+    "handlers": {
+        "vllm": {
+            "class": "logging.StreamHandler",
+            "formatter": "vllm",
+            # The handler applies the user-selected level; the logger below is
+            # left at DEBUG so the handler is the effective filter.
+            "level": VLLM_LOGGING_LEVEL,
+            "stream": "ext://sys.stdout",
+        },
+    },
+    "loggers": {
+        "vllm": {
+            "handlers": ["vllm"],
+            "level": "DEBUG",
+            # Records are not forwarded to ancestor loggers.
+            "propagate": False,
+        },
+    },
+    "version": 1,
+    "disable_existing_loggers": False
+}
+# Memoised with functools.lru_cache: while a (logger, msg, args) entry is still
+# in the cache, a repeated call returns the cached None without logging again.
+# The bare decorator uses lru_cache's default bounded size, so an evicted
+# message can be emitted again.
+@lru_cache
+def _print_debug_once(logger: Logger, msg: str, *args: Hashable) -> None:
+    # Set the stacklevel to 2 to print the original caller's line info
+    logger.debug(msg, *args, stacklevel=2)
+# Memoised with functools.lru_cache: while a (logger, msg, args) entry is still
+# in the cache, a repeated call returns the cached None without logging again.
+# The bare decorator uses lru_cache's default bounded size, so an evicted
+# message can be emitted again.
+@lru_cache
+def _print_info_once(logger: Logger, msg: str, *args: Hashable) -> None:
+    # Set the stacklevel to 2 to print the original caller's line info
+    logger.info(msg, *args, stacklevel=2)
+# Memoised with functools.lru_cache: while a (logger, msg, args) entry is still
+# in the cache, a repeated call returns the cached None without logging again.
+# The bare decorator uses lru_cache's default bounded size, so an evicted
+# message can be emitted again.
+@lru_cache
+def _print_warning_once(logger: Logger, msg: str, *args: Hashable) -> None:
+    # Set the stacklevel to 2 to print the original caller's line info
+    logger.warning(msg, *args, stacklevel=2)
+# Typing shim only: `init_logger` never instantiates this class. It `cast`s a
+# plain `logging.Logger` whose `*_once` attributes were bound to the cached
+# module-level helpers.
+class _VllmLogger(Logger):
+    """
+    Note:
+        This class is just to provide type information.
+        We actually patch the methods directly on the [`logging.Logger`][]
+        instance to avoid conflicting with other libraries such as
+        `intel_extension_for_pytorch.utils._logger`.
+    """
+    # NOTE(review): if these wrapper methods were ever called directly, the
+    # helpers' stacklevel=2 would presumably resolve to the wrapper frame in
+    # this file rather than to the real caller - confirm before relying on it.
+    def debug_once(self, msg: str, *args: Hashable) -> None:
+        """
+        As [`debug`][logging.Logger.debug], but subsequent calls with
+        the same message are silently dropped.
+        """
+        _print_debug_once(self, msg, *args)
+    def info_once(self, msg: str, *args: Hashable) -> None:
+        """
+        As [`info`][logging.Logger.info], but subsequent calls with
+        the same message are silently dropped.
+        """
+        _print_info_once(self, msg, *args)
+    def warning_once(self, msg: str, *args: Hashable) -> None:
+        """
+        As [`warning`][logging.Logger.warning], but subsequent calls with
+        the same message are silently dropped.
+        """
+        _print_warning_once(self, msg, *args)
+# Pre-defined methods mapping to avoid repeated dictionary creation
+_METHODS_TO_PATCH = {
+    "debug_once": _print_debug_once,
+    "info_once": _print_info_once,
+    "warning_once": _print_warning_once,
+}
+def _configure_vllm_root_logger() -> None:
+    logging_config = dict[str, Any]()
+    if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH:
+        raise RuntimeError(
+            "VLLM_CONFIGURE_LOGGING evaluated to false, but "
+            "VLLM_LOGGING_CONFIG_PATH was given. VLLM_LOGGING_CONFIG_PATH "
+            "implies VLLM_CONFIGURE_LOGGING. Please enable "
+            "VLLM_CONFIGURE_LOGGING or unset VLLM_LOGGING_CONFIG_PATH.")
+    if VLLM_CONFIGURE_LOGGING:
+        logging_config = DEFAULT_LOGGING_CONFIG
+    if VLLM_LOGGING_CONFIG_PATH:
+        if not path.exists(VLLM_LOGGING_CONFIG_PATH):
+            raise RuntimeError(
+                "Could not load logging config. File does not exist: %s",
+                VLLM_LOGGING_CONFIG_PATH)
+        with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
+            custom_config = json.loads(file.read())
+        if not isinstance(custom_config, dict):
+            raise ValueError("Invalid logging config. Expected dict, got %s.",
+                             type(custom_config).__name__)
+        logging_config = custom_config
+    for formatter in logging_config.get("formatters", {}).values():
+        # This provides backwards compatibility after #10134.
+        if formatter.get(
+                "class"
+        ) == "tpu_inference.vllm_logging_utils.NewLineFormatter":
+            formatter[
+                "class"] = "tpu_inference.mock.vllm_logging_utils.NewLineFormatter"
+    if logging_config:
+        dictConfig(logging_config)
+# The root logger is initialized when the module is imported.
+# Python's import system executes a module body only once per process, so this
+# call runs a single time no matter how many threads import the module.
+_configure_vllm_root_logger()
+def init_logger(name: str) -> _VllmLogger:
+    """The main purpose of this function is to ensure that loggers are
+    retrieved in such a way that we can be sure the root vllm logger has
+    already been configured."""
+    logger = logging.getLogger(name)
+    for method_name, method in _METHODS_TO_PATCH.items():
+        setattr(logger, method_name, MethodType(method, logger))
+    return cast(_VllmLogger, logger)
+# Module-level logger for this file, created through `init_logger` so that it
+# also carries the `*_once` helpers.
+logger = init_logger(__name__)
+# Trace callback: once `log_path` and `root_dir` are pre-bound with
+# functools.partial, the remaining (frame, event, arg) parameters match what
+# `sys.settrace` passes to a trace function. Every 'call' and 'return' event
+# for code whose filename starts with `root_dir` is appended to `log_path`.
+def _trace_calls(log_path, root_dir, frame, event, arg=None):
+    if event in ['call', 'return']:
+        # Extract the filename, line number, function name, and the code object
+        filename = frame.f_code.co_filename
+        lineno = frame.f_lineno
+        func_name = frame.f_code.co_name
+        if not filename.startswith(root_dir):
+            # only log the functions in the vllm root_dir
+            # This returns None rather than a tracer; per the sys.settrace
+            # contract that presumably switches off tracing for this scope.
+            return
+        # Log every function call or return
+        try:
+            last_frame = frame.f_back
+            if last_frame is not None:
+                last_filename = last_frame.f_code.co_filename
+                last_lineno = last_frame.f_lineno
+                last_func_name = last_frame.f_code.co_name
+            else:
+                # initial frame
+                last_filename = ""
+                last_lineno = 0
+                last_func_name = ""
+            # The log file is reopened in append mode for every logged event.
+            with open(log_path, 'a') as f:
+                ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
+                if event == 'call':
+                    f.write(f"{ts} Call to"
+                            f" {func_name} in {filename}:{lineno}"
+                            f" from {last_func_name} in {last_filename}:"
+                            f"{last_lineno}\n")
+                else:
+                    f.write(f"{ts} Return from"
+                            f" {func_name} in {filename}:{lineno}"
+                            f" to {last_func_name} in {last_filename}:"
+                            f"{last_lineno}\n")
+        except NameError:
+            # modules are deleted during shutdown
+            pass
+    # Hand back a tracer with the same bindings so that tracing continues.
+    return partial(_trace_calls, log_path, root_dir)
+def enable_trace_function_call(log_file_path: str, root_dir: str):
+    sys.settrace(partial(_trace_calls, log_file_path, root_dir))

tpu_inference/mock/vllm_logging_utils.py ADDED Viewed

@@ -0,0 +1,15 @@
+import logging
+class NewLineFormatter(logging.Formatter):
+    """Adds logging prefix to newlines to align multi-line messages."""
+    def __init__(self, fmt, datefmt=None, style="%"):
+        logging.Formatter.__init__(self, fmt, datefmt, style)
+    def format(self, record):
+        msg = logging.Formatter.format(self, record)
+        if record.message != "":
+            parts = msg.split(record.message)
+            msg = msg.replace("\n", "\r\n" + parts[0])
+        return msg

tpu_inference/models/__init__.py ADDED Viewed

File without changes

tpu_inference/models/common/__init__.py ADDED Viewed

File without changes

tpu_inference/models/common/model_loader.py ADDED Viewed

@@ -0,0 +1,433 @@
+import functools
+import os
+from typing import Any, Optional
+import jax
+import torch
+from flax import nnx
+from jax.sharding import Mesh, NamedSharding, PartitionSpec
+from torchax.ops.mappings import j2t_dtype
+from transformers import PretrainedConfig
+from vllm.config import VllmConfig
+from vllm.utils import supports_kw
+from tpu_inference.logger import init_logger
+from tpu_inference.models.jax.utils.quantization.quantization_utils import (
+    apply_qwix_on_abstract_model, apply_qwix_quantization,
+    load_random_weights_into_qwix_abstract_model)
+logger = init_logger(__name__)
+# Architecture name -> model class. Filled by `_get_model_architecture` (the
+# built-in JAX models) and by `register_model` (externally registered models).
+_MODEL_REGISTRY = {}
+class UnsupportedArchitectureError(ValueError):
+    """Raised when a model architecture is not supported in the registry."""
+    pass
+def _get_model_architecture(config: PretrainedConfig) -> nnx.Module:
+    # NOTE: Use inline imports here, otherwise the normal imports
+    # would cause JAX init failure when using multi hosts with Ray.
+    from tpu_inference.models.jax.deepseek_v3 import DeepSeekV3
+    from tpu_inference.models.jax.llama4 import Llama4ForCausalLM
+    from tpu_inference.models.jax.llama_eagle3 import EagleLlama3ForCausalLM
+    from tpu_inference.models.jax.phi3 import Phi3ForCausalLM
+    from tpu_inference.models.jax.qwen2 import Qwen2ForCausalLM
+    from tpu_inference.models.jax.qwen2_5_vl import \
+        Qwen2_5_VLForConditionalGeneration
+    from tpu_inference.models.jax.qwen3 import Qwen3ForCausalLM
+    if os.getenv("NEW_MODEL_DESIGN", False):
+        from tpu_inference.experimental.llama3_jax_stashed import \
+            LlamaForCausalLM
+    else:
+        from tpu_inference.models.jax.llama3 import LlamaForCausalLM
+    _MODEL_REGISTRY["Llama4ForCausalLM"] = Llama4ForCausalLM
+    _MODEL_REGISTRY["DeepseekV3ForCausalLM"] = DeepSeekV3
+    _MODEL_REGISTRY["LlamaForCausalLM"] = LlamaForCausalLM
+    _MODEL_REGISTRY["Qwen2ForCausalLM"] = Qwen2ForCausalLM
+    _MODEL_REGISTRY["Qwen3ForCausalLM"] = Qwen3ForCausalLM
+    _MODEL_REGISTRY[
+        "Qwen2_5_VLForConditionalGeneration"] = Qwen2_5_VLForConditionalGeneration
+    _MODEL_REGISTRY["Phi3ForCausalLM"] = Phi3ForCausalLM
+    _MODEL_REGISTRY["Eagle3LlamaForCausalLM"] = EagleLlama3ForCausalLM
+    architectures = getattr(config, "architectures", [])
+    for arch in architectures:
+        if arch in _MODEL_REGISTRY:
+            return _MODEL_REGISTRY[arch]
+    raise UnsupportedArchitectureError(
+        f"Model architectures {architectures} are not supported for now. "
+        f"Supported architectures: {list(_MODEL_REGISTRY.keys())}")
+# Builds the nnx model for `model_class` and returns the result of a jitted
+# construction step. Two paths, selected by the JAX_RANDOM_WEIGHTS env var:
+#   * set (any non-empty value): the model gets randomly initialised weights;
+#   * unset: an abstract model is created with nnx.eval_shape and its weights
+#     are then filled in by `model.load_weights(rng)`.
+def _get_nnx_model(
+    model_class: Any,
+    vllm_config: VllmConfig,
+    rng: jax.Array,
+    mesh: Mesh,
+) -> nnx.Module:
+    def create_abstract_model() -> nnx.Module:
+        """
+        Helper class to create an abstract model for `nnx.eval_shape`.
+        Returns:
+            An abstract model function.
+        """
+        return model_class(vllm_config, rng, mesh)
+    # Argument 0 (`model`) is donated to the jitted call; the flag is static so
+    # each value of it gets its own trace.
+    @nnx.jit(donate_argnums=(0, ),
+             static_argnames=('use_qwix_on_abstract_model', ))
+    def create_jit_model(
+            model: nnx.Module,
+            use_qwix_on_abstract_model: bool = False) -> nnx.Module:
+        """
+        Create a jit model.
+        Args:
+            model: The model to jit.
+            use_qwix_on_abstract_model: Whether to apply Qwix on the abstract model.
+        Returns:
+            The jitted model.
+        """
+        state = nnx.state(model)
+        nnx.update(model, state)
+        if not use_qwix_on_abstract_model:
+            # NOTE: if Qwix is not configured, this will be a no-op
+            model = apply_qwix_quantization(vllm_config,
+                                            model,
+                                            rng,
+                                            mesh,
+                                            apply_to_abstract_model=False)
+        return model
+    if os.getenv("JAX_RANDOM_WEIGHTS", False):
+        # Create a sharded model with random inited weights.
+        # TODO: currently Qwen2ForCausalLM is using legacy model implementation
+        # will merge the random init logic when all model are migrated to new model implementation
+        # Handle the case where we want to load in random weights to a Qwix-quantized model.  Here, we
+        # need to run an abstract pass for Qwix first and then load in the random weights.
+        if apply_qwix_on_abstract_model(vllm_config):
+            abstract_model_fn = apply_qwix_quantization(
+                vllm_config,
+                create_abstract_model,
+                rng,
+                mesh,
+                apply_to_abstract_model=True)
+            model = nnx.eval_shape(abstract_model_fn)
+            # Fall back to an empty dict when the HF config carries no
+            # quantization_config attribute.
+            quantization_config = vllm_config.model_config.hf_config.quantization_config if hasattr(
+                vllm_config.model_config.hf_config,
+                "quantization_config") else {}
+            load_random_weights_into_qwix_abstract_model(
+                rng, model, mesh, quantization_config)
+            with mesh:
+                jit_model = create_jit_model(model,
+                                             use_qwix_on_abstract_model=True)
+            # This sub-path returns early; the other paths fall through to the
+            # shared `return jit_model` at the end of the function.
+            return jit_model
+        @nnx.jit
+        def create_sharded_model():
+            model = model_class(vllm_config, rng, mesh)
+            state = nnx.state(model)
+            pspecs = nnx.get_partition_spec(state)
+            sharded_state = jax.lax.with_sharding_constraint(state, pspecs)
+            nnx.update(model, sharded_state)
+            # NOTE: we don't support quantization for the old Qwen2ForCausalLM implementation
+            return model
+        with mesh:
+            jit_model = create_sharded_model()
+            # In this case, we are applying Qwix quantization to the true, concrete model
+            jit_model = apply_qwix_quantization(vllm_config,
+                                                jit_model,
+                                                rng,
+                                                mesh,
+                                                apply_to_abstract_model=False)
+            # `initialize_cache` is optional; it is called only on models that
+            # define it.
+            if hasattr(jit_model, 'initialize_cache'):
+                jit_model.initialize_cache()
+    else:
+        # We first create an abstract model without allocating any weights,
+        # then fill in its weights during load_weights from HF.
+        # This has two advantages over the normal way:
+        # 1. The model weights will only be allocated once. Otherwise the normal way
+        #    will random-init the model weights first, then load the real weights.
+        #    The two pass weights allocation causes model loading slow.
+        # 2. The model loading won't be OOM. Otherwise the normal way will hold
+        #    a full model weights after random-init, then duplicate a layer during
+        #    the load_weights. This would be easy to OOM if the layer is super large.
+        abstract_model_fn = create_abstract_model
+        # NOTE: only one of the abstract (this) or concrete Qwix quantization paths should
+        # be taken
+        if should_apply_qwix_on_abstract_model := apply_qwix_on_abstract_model(
+                vllm_config):
+            # NOTE: if Qwix is not configured, this will return `create_abstract_model` and
+            # thus be a no-op
+            abstract_model_fn = apply_qwix_quantization(
+                vllm_config,
+                create_abstract_model,
+                rng,
+                mesh,
+                apply_to_abstract_model=True)
+        model = nnx.eval_shape(abstract_model_fn)
+        # Although the created model can already work, we still need to jit
+        # the model creation again, otherwise the model forward will have
+        # non-trivial overhead in PjitFunction.
+        with mesh:
+            model.load_weights(rng)
+            jit_model = create_jit_model(
+                model,
+                use_qwix_on_abstract_model=should_apply_qwix_on_abstract_model)
+    return jit_model
+# TODO(pooyam): We need to refactor this. This is returning a bunch of functions that do not work with all models and this is not very easy to see from the code.
+# Builds the JAX/nnx model and returns a 9-tuple of callables and state (see
+# the return statement). NOTE(review): the `-> nnx.Module` annotation does not
+# match the tuple that is actually returned.
+def get_flax_model(
+    vllm_config: VllmConfig,
+    rng: jax.Array,
+    mesh: Mesh,
+    is_draft_model: bool = False,
+) -> nnx.Module:
+    # The draft model (speculative decoding) resolves its class from the draft
+    # config; otherwise the main model config is used.
+    if is_draft_model:
+        model_class = _get_model_architecture(
+            vllm_config.speculative_config.draft_model_config.hf_config)
+    else:
+        model_class = _get_model_architecture(
+            vllm_config.model_config.hf_config)
+    jit_model = _get_nnx_model(model_class, vllm_config, rng, mesh)
+    kv_cache_sharding = NamedSharding(mesh, PartitionSpec(None, None, "model"))
+    hidden_states_sharding = NamedSharding(mesh, PartitionSpec(None,
+                                                               None))  # (T, D)
+    # For performance consideration, refer to:
+    # https://flax.readthedocs.io/en/latest/guides/performance.html
+    graphdef, state = nnx.split(jit_model)
+    @functools.partial(
+        jax.jit,
+        out_shardings=(
+            kv_cache_sharding,
+            hidden_states_sharding,
+            hidden_states_sharding,  # aux hidden states
+        ),
+        donate_argnums=2,  # 0 is graphdef, 1 is state, 2 is kv_cache
+        static_argnums=6,  #6 is layer_name_to_kvcache_index
+    )
+    def run_model(graphdef, state, *args):
+        model = nnx.merge(graphdef, state)
+        return model(*args)
+    logits_sharding = NamedSharding(mesh, PartitionSpec(None, "model"))
+    @functools.partial(
+        jax.jit,
+        out_shardings=(logits_sharding),
+    )
+    def run_compute_logits(graphdef, state, *args):
+        model = nnx.merge(graphdef, state)
+        # Only the first positional argument is used; the rest are ignored.
+        hidden_state, *_ = args
+        return model.compute_logits(hidden_state)
+    # Multi-modal support only
+    # This function calculates the image token's embeddings by VIT
+    # Unlike the other helpers here, this one is not wrapped in jax.jit.
+    def run_get_multimodal_embeddings(graphdef, state, image_grid_thw,
+                                      **kwargs):
+        model = nnx.merge(graphdef, state)
+        return model.get_multimodal_embeddings(image_grid_thw, **kwargs)
+    # This function will calculates the embeddings of input texts and then merge with the image embeddings
+    # NOTE(review): the output reuses `logits_sharding` - confirm this is the
+    # intended sharding for embeddings.
+    @functools.partial(
+        jax.jit,
+        out_shardings=(logits_sharding),
+    )
+    def run_get_input_embeddings(graphdef, state, *args, **kwargs):
+        model = nnx.merge(graphdef, state)
+        return model.get_input_embeddings(*args, **kwargs)
+    # For models that want to work with EAGLE-3 speculative decoding
+    # NOTE(review): the output reuses `logits_sharding` here as well - confirm.
+    @functools.partial(
+        jax.jit,
+        out_shardings=(logits_sharding),
+    )
+    def combine_hidden_states(graphdef, state, hidden_states):
+        model = nnx.merge(graphdef, state)
+        return model.combine_hidden_states(hidden_states)
+    # Only `graphdef` is pre-bound; callers pass `state` as the first argument
+    # of every returned function.
+    model_fn = functools.partial(run_model, graphdef)
+    compute_logits_fn = functools.partial(run_compute_logits, graphdef)
+    get_multimodal_embeddings_fn = functools.partial(
+        run_get_multimodal_embeddings, graphdef)
+    get_input_embeddings_fn = functools.partial(run_get_input_embeddings,
+                                                graphdef)
+    # This path always returns None for the LoRA manager and the model object.
+    lora_manager, model = None, None
+    combine_hidden_states_fn = functools.partial(combine_hidden_states,
+                                                 graphdef)
+    # Taken from the model class when it defines one; None otherwise.
+    get_mrope_input_positions_fn = None if not hasattr(
+        model_class,
+        "get_mrope_input_positions") else model_class.get_mrope_input_positions
+    return model_fn, compute_logits_fn, combine_hidden_states_fn, get_multimodal_embeddings_fn, get_input_embeddings_fn, get_mrope_input_positions_fn, state, lora_manager, model
+def get_vllm_model(
+    vllm_config: VllmConfig,
+    rng: jax.Array,
+    mesh: Mesh,
+):
+    from tpu_inference.models.vllm.vllm_model_wrapper import VllmModelWrapper
+    model = VllmModelWrapper(
+        vllm_config=vllm_config,
+        rng=rng,
+        mesh=mesh,
+    )
+    params, lora_manager = model.load_weights()
+    jit_model = model.jit_step_func()
+    compute_logits_fn = model.jit_compute_logits_func()
+    # the model needs to be returned because lora weights are neither torch.nn.parameter nor torch.nn.buffer. After we load the lora weights and set it to the torch.nn.Module, we can shard it and move it to TPU.
+    combine_hidden_states_fn = None
+    return jit_model, compute_logits_fn, combine_hidden_states_fn, None, None, None, params, lora_manager, model
+def get_model(
+    vllm_config: VllmConfig,
+    rng: jax.Array,
+    mesh: Mesh,
+    is_draft_model: bool = False,
+) -> Any:
+    impl = os.getenv("MODEL_IMPL_TYPE", "flax_nnx").lower()
+    logger.info(f"Loading model with MODEL_IMPL_TYPE={impl}")
+    if impl == "flax_nnx":
+        try:
+            # Try to load the flax model first
+            return get_flax_model(vllm_config, rng, mesh, is_draft_model)
+        except UnsupportedArchitectureError as e:
+            # Convert the error message to a string to check its contents
+            error_msg = str(e)
+            logger.warning(f"Flax model failed with: '{error_msg}'. "
+                           "Falling back to vLLM implementation.")
+            # Fall back to the vLLM model and updating the dtype accordingly
+            vllm_config.model_config.dtype = j2t_dtype(
+                vllm_config.model_config.dtype.dtype)
+            return get_vllm_model(vllm_config, rng, mesh)
+    elif impl == "vllm":
+        return get_vllm_model(vllm_config, rng, mesh)
+    else:
+        raise NotImplementedError("Unsupported MODEL_IMPL_TYPE")
+def _validate_model_interface(model: Any) -> None:
+    """Validates that the model class has the required methods and signatures.
+    A valid model must have:
+    - An __init__ method that accepts a 'vllm_config' keyword argument.
+    - A __call__ method that accepts 'kv_caches', 'input_ids', and
+      'attention_metadata' keyword arguments.
+    Args:
+        model: The model class to validate.
+    Raises:
+        TypeError: If the model does not meet the interface requirements.
+    """
+    # Check for __init__ with vllm_config
+    model_init = getattr(model, "__init__", None)
+    if not callable(model_init):
+        raise TypeError(
+            f"Model {model.__name__} must have an __init__ method.")
+    if not supports_kw(model_init, "vllm_config"):
+        raise TypeError(
+            f"Model {model.__name__} __init__ method must accept a "
+            "'vllm_config' keyword argument.")
+    # Check for __call__ with required arguments
+    model_call = getattr(model, "__call__", None)
+    # A class object is always callable (it produces an instance).
+    # We need to check if the class _explicitly_ defines a __call__ method for its
+    # instance, which is different from `type.__call__`.
+    has_defined_call = False
+    if isinstance(model, type):
+        if any("__call__" in C.__dict__ for C in model.__mro__):
+            has_defined_call = True
+    elif callable(model_call):
+        # For an instance, a simple callable check is sufficient.
+        has_defined_call = True
+    if not has_defined_call:
+        raise TypeError(f"Model {model.__name__} must have a __call__ method.")
+    required_call_args = ("kv_caches", "input_ids", "attention_metadata")
+    missing_args = tuple(arg for arg in required_call_args
+                         if not supports_kw(model_call, arg))
+    if missing_args:
+        raise TypeError(
+            f"Model {model.__name__} __call__ method is missing required "
+            f"keyword arguments: {missing_args}")
+def register_model(arch: str, model: Any) -> None:
+    """
+    Registers a model class for a given architecture name.
+    This function registers the model with both the tpu_inference registry
+    and the vLLM registry. For vLLM, it creates a compatible wrapper
+    around the JAX model.
+    Args:
+        arch: The name of the architecture (e.g., "LlamaForCausalLM").
+        model: The JAX model class to register (e.g., a flax.nnx.Module).
+    """
+    # Validation runs first: on failure nothing is written to either registry.
+    _validate_model_interface(model)
+    # Register with tpu_inference registry for the JAX backend
+    _MODEL_REGISTRY[arch] = model
+    # Create a vLLM-compatible wrapper for the JAX model class.
+    # This wrapper inherits from the JAX model and torch.nn.Module
+    # to pass vLLM's type checks. It is not meant to be instantiated
+    # or executed by vLLM's PyTorch backend.
+    def unimplemented_forward(
+        self,
+        input_ids: "torch.Tensor",
+        positions: "torch.Tensor",
+        intermediate_tensors: Optional[Any] = None,
+        inputs_embeds: Optional["torch.Tensor"] = None,
+    ) -> None:
+        raise NotImplementedError(
+            "This is a JAX model and does not implement the PyTorch forward method."
+        )
+    # We need a custom __init__ that only calls torch.nn.Module's init,
+    # to avoid triggering JAX logic when vLLM inspects the class.
+    def wrapper_init(self, *args, **kwargs):
+        torch.nn.Module.__init__(self)
+    # Dynamically create the wrapper class that is a subclass of both the
+    # JAX model and torch.nn.Module.
+    # `model` comes first in the bases, so its attributes take precedence over
+    # torch.nn.Module's for anything not overridden in the dict below.
+    VllmCompatibleModel = type(
+        f"VllmCompatible{model.__name__}",
+        (model, torch.nn.Module),
+        {
+            "__init__": wrapper_init,
+            "forward": unimplemented_forward,
+            # Prevent vLLM from trying to load weights into this dummy class.
+            "load_weights": lambda self, *args, **kwargs: None,
+        })
+    # Register the wrapped model with vLLM's registry.
+    # The import is deferred to call time rather than done at module import.
+    from vllm.model_executor.models.registry import ModelRegistry
+    ModelRegistry.register_model(arch, VllmCompatibleModel)
+    logger.info(
+        f"Registered JAX model {arch} with tpu_inference and vLLM registries.")

tpu_inference/models/jax/__init__.py ADDED Viewed

File without changes