PyPI - tpu-inference - Versions diffs - 0.11.1__py3-none-any.whl - Mend

tpu-inference 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tpu-inference might be problematic. Click here for more details.

Files changed (168)

tests/__init__.py +0 -0
tests/core/__init__.py +0 -0
tests/core/test_adapters.py +83 -0
tests/core/test_core_tpu.py +523 -0
tests/core/test_disagg_executor.py +60 -0
tests/core/test_disagg_utils.py +53 -0
tests/core/test_init.py +49 -0
tests/kernels/__init__.py +0 -0
tests/kernels/quantized_matmul_kernel_test.py +191 -0
tests/kernels/ragged_kv_cache_update_v2_test.py +234 -0
tests/kernels/ragged_paged_attention_kernel_v2_test.py +400 -0
tests/kernels/ragged_paged_attention_kernel_v3_test.py +504 -0
tests/lora/__init__.py +0 -0
tests/lora/test_lora.py +123 -0
tests/test_base.py +201 -0
tests/test_quantization.py +836 -0
tests/test_tpu_info.py +120 -0
tests/test_utils.py +218 -0
tests/tpu_backend_test.py +59 -0
tpu_inference/__init__.py +30 -0
tpu_inference/adapters/__init__.py +0 -0
tpu_inference/adapters/vllm_adapters.py +42 -0
tpu_inference/adapters/vllm_config_adapters.py +134 -0
tpu_inference/backend.py +69 -0
tpu_inference/core/__init__.py +0 -0
tpu_inference/core/adapters.py +153 -0
tpu_inference/core/core_tpu.py +776 -0
tpu_inference/core/disagg_executor.py +117 -0
tpu_inference/core/disagg_utils.py +51 -0
tpu_inference/di/__init__.py +0 -0
tpu_inference/di/abstracts.py +28 -0
tpu_inference/di/host.py +76 -0
tpu_inference/di/interfaces.py +51 -0
tpu_inference/distributed/__init__.py +0 -0
tpu_inference/distributed/tpu_connector.py +699 -0
tpu_inference/distributed/utils.py +59 -0
tpu_inference/executors/__init__.py +0 -0
tpu_inference/executors/ray_distributed_executor.py +346 -0
tpu_inference/experimental/__init__.py +0 -0
tpu_inference/experimental/llama3_jax_stashed.py +258 -0
tpu_inference/interfaces/__init__.py +0 -0
tpu_inference/interfaces/cache.py +31 -0
tpu_inference/interfaces/config.py +47 -0
tpu_inference/interfaces/config_parts.py +117 -0
tpu_inference/interfaces/engine.py +51 -0
tpu_inference/interfaces/outputs.py +22 -0
tpu_inference/interfaces/params.py +21 -0
tpu_inference/interfaces/platform.py +74 -0
tpu_inference/interfaces/request.py +39 -0
tpu_inference/interfaces/scheduler.py +31 -0
tpu_inference/kernels/__init__.py +0 -0
tpu_inference/kernels/collectives/__init__.py +0 -0
tpu_inference/kernels/collectives/all_gather_matmul.py +735 -0
tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +60 -0
tpu_inference/kernels/collectives/util.py +47 -0
tpu_inference/kernels/flash_attention/__init__.py +0 -0
tpu_inference/kernels/flash_attention/kernel.py +772 -0
tpu_inference/kernels/quantized_matmul/__init__.py +0 -0
tpu_inference/kernels/quantized_matmul/kernel.py +395 -0
tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
tpu_inference/kernels/quantized_matmul/util.py +58 -0
tpu_inference/kernels/ragged_paged_attention/__init__.py +0 -0
tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +0 -0
tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +875 -0
tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +287 -0
tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +0 -0
tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1447 -0
tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3834 -0
tpu_inference/kernels/ragged_paged_attention/v3/util.py +47 -0
tpu_inference/layers/__init__.py +0 -0
tpu_inference/layers/common/__init__.py +0 -0
tpu_inference/layers/common/attention_metadata.py +34 -0
tpu_inference/layers/jax/__init__.py +0 -0
tpu_inference/layers/jax/attention/__init__.py +0 -0
tpu_inference/layers/jax/attention/attention.py +254 -0
tpu_inference/layers/jax/attention/deepseek_v3_attention.py +354 -0
tpu_inference/layers/jax/attention/llama4_attention.py +153 -0
tpu_inference/layers/jax/attention_interface.py +356 -0
tpu_inference/layers/jax/base.py +151 -0
tpu_inference/layers/jax/binary_search.py +295 -0
tpu_inference/layers/jax/constants.py +88 -0
tpu_inference/layers/jax/layers.py +301 -0
tpu_inference/layers/jax/misc.py +16 -0
tpu_inference/layers/jax/moe/__init__.py +0 -0
tpu_inference/layers/jax/moe/deepseek_v3_moe.py +608 -0
tpu_inference/layers/jax/moe/moe.py +209 -0
tpu_inference/layers/jax/rope.py +172 -0
tpu_inference/layers/jax/rope_interface.py +214 -0
tpu_inference/layers/jax/sample/__init__.py +0 -0
tpu_inference/layers/jax/sample/rejection_sampler.py +515 -0
tpu_inference/layers/jax/sample/sampling.py +95 -0
tpu_inference/layers/jax/sample/sampling_metadata.py +69 -0
tpu_inference/layers/jax/sharding.py +406 -0
tpu_inference/layers/jax/transformer_block.py +76 -0
tpu_inference/layers/vllm/__init__.py +0 -0
tpu_inference/layers/vllm/attention.py +184 -0
tpu_inference/layers/vllm/fused_moe.py +399 -0
tpu_inference/layers/vllm/linear_common.py +186 -0
tpu_inference/layers/vllm/quantization/__init__.py +34 -0
tpu_inference/layers/vllm/quantization/awq.py +207 -0
tpu_inference/layers/vllm/quantization/common.py +105 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +0 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +121 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +0 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +208 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +136 -0
tpu_inference/layers/vllm/quantization/unquantized.py +263 -0
tpu_inference/layers/vllm/sharding.py +151 -0
tpu_inference/logger.py +10 -0
tpu_inference/lora/__init__.py +0 -0
tpu_inference/lora/torch_lora_ops.py +103 -0
tpu_inference/lora/torch_punica_tpu.py +308 -0
tpu_inference/mock/__init__.py +0 -0
tpu_inference/mock/vllm_config_utils.py +28 -0
tpu_inference/mock/vllm_envs.py +1233 -0
tpu_inference/mock/vllm_logger.py +212 -0
tpu_inference/mock/vllm_logging_utils.py +15 -0
tpu_inference/models/__init__.py +0 -0
tpu_inference/models/common/__init__.py +0 -0
tpu_inference/models/common/model_loader.py +433 -0
tpu_inference/models/jax/__init__.py +0 -0
tpu_inference/models/jax/deepseek_v3.py +868 -0
tpu_inference/models/jax/llama3.py +366 -0
tpu_inference/models/jax/llama4.py +473 -0
tpu_inference/models/jax/llama_eagle3.py +333 -0
tpu_inference/models/jax/phi3.py +376 -0
tpu_inference/models/jax/qwen2.py +375 -0
tpu_inference/models/jax/qwen2_5_vl.py +976 -0
tpu_inference/models/jax/qwen3.py +302 -0
tpu_inference/models/jax/utils/__init__.py +0 -0
tpu_inference/models/jax/utils/file_utils.py +96 -0
tpu_inference/models/jax/utils/multi_modal_utils.py +164 -0
tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
tpu_inference/models/jax/utils/quantization/quantization_utils.py +588 -0
tpu_inference/models/jax/utils/weight_utils.py +510 -0
tpu_inference/models/vllm/__init__.py +0 -0
tpu_inference/models/vllm/vllm_model_wrapper.py +272 -0
tpu_inference/models/vllm/vllm_model_wrapper_context.py +45 -0
tpu_inference/platforms/__init__.py +2 -0
tpu_inference/platforms/tpu_jax.py +257 -0
tpu_inference/runner/__init__.py +0 -0
tpu_inference/runner/block_table_jax.py +122 -0
tpu_inference/runner/compilation_manager.py +672 -0
tpu_inference/runner/input_batch_jax.py +435 -0
tpu_inference/runner/kv_cache.py +119 -0
tpu_inference/runner/kv_cache_manager.py +460 -0
tpu_inference/runner/lora_utils.py +92 -0
tpu_inference/runner/multimodal_manager.py +208 -0
tpu_inference/runner/persistent_batch_manager.py +244 -0
tpu_inference/runner/speculative_decoding_manager.py +250 -0
tpu_inference/runner/structured_decoding_manager.py +89 -0
tpu_inference/runner/tpu_jax_runner.py +771 -0
tpu_inference/runner/utils.py +426 -0
tpu_inference/spec_decode/__init__.py +0 -0
tpu_inference/spec_decode/jax/__init__.py +0 -0
tpu_inference/spec_decode/jax/eagle3.py +334 -0
tpu_inference/tpu_info.py +77 -0
tpu_inference/utils.py +294 -0
tpu_inference/worker/__init__.py +0 -0
tpu_inference/worker/_temporary_vllm_compat.py +129 -0
tpu_inference/worker/base.py +100 -0
tpu_inference/worker/tpu_worker_jax.py +321 -0
tpu_inference-0.11.1.dist-info/METADATA +101 -0
tpu_inference-0.11.1.dist-info/RECORD +168 -0
tpu_inference-0.11.1.dist-info/WHEEL +5 -0
tpu_inference-0.11.1.dist-info/licenses/LICENSE +201 -0
tpu_inference-0.11.1.dist-info/top_level.txt +2 -0

tpu_inference/interfaces/cache.py ADDED Viewed

@@ -0,0 +1,31 @@
+"""
+Defines the abstract contracts for cache managers.
+"""
+from typing import Protocol
+class IKVCacheManager(Protocol):
+    """
+    Abstract contract for a KVCacheManager.
+    """
+    # Add methods and properties from vllm.v1.core.kv_cache_manager.KVCacheManager
+    # that tpu_inference actually uses.
+    ...
+class IEncoderCacheManager(Protocol):
+    """
+    Abstract contract for an EncoderCacheManager.
+    """
+    # Add methods and properties from vllm.v1.core.encoder_cache_manager.EncoderCacheManager
+    # that tpu_inference actually uses.
+    ...
+class IMirroredProcessingCache(Protocol):
+    """
+    Abstract contract for a MirroredProcessingCache.
+    """
+    # Add methods and properties from vllm.v1.engine.mm_input_cache.MirroredProcessingCache
+    # that tpu_inference actually uses.
+    ...

tpu_inference/interfaces/config.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""
+Defines the abstract contract for a configuration object.
+"""
+from typing import Any, Optional, Protocol
+from .config_parts import (ICacheConfig, ICompilationConfig, IModelConfig,
+                           IParallelConfig, ISchedulerConfig,
+                           ISpeculativeConfig)
+class IConfig(Protocol):
+    """
+    A minimal, abstract interface for a configuration object.
+    This protocol defines only the methods and properties that tpu_inference
+    requires to operate. Client libraries (like vLLM) will provide concrete
+    implementations that satisfy this contract.
+    """
+    @property
+    def cache_config(self) -> ICacheConfig:
+        ...
+    @property
+    def compilation_config(self) -> ICompilationConfig:
+        ...
+    @property
+    def model_config(self) -> Optional[IModelConfig]:
+        ...
+    @property
+    def parallel_config(self) -> IParallelConfig:
+        ...
+    @property
+    def scheduler_config(self) -> ISchedulerConfig:
+        ...
+    @property
+    def speculative_config(self) -> Optional[ISpeculativeConfig]:
+        ...
+    # Escape hatch for direct access when needed by the adapter.
+    @property
+    def vllm_config(self) -> Any:
+        ...

tpu_inference/interfaces/config_parts.py ADDED Viewed

@@ -0,0 +1,117 @@
+"""
+Defines the abstract contracts for the component parts of an IConfig.
+"""
+from typing import Any, Optional, Protocol
+import torch
+class IModelConfig(Protocol):
+    @property
+    def dtype(self) -> torch.dtype:
+        ...
+    @dtype.setter
+    def dtype(self, value: torch.dtype) -> None:
+        ...
+    @property
+    def use_mla(self) -> bool:
+        ...
+class ICacheConfig(Protocol):
+    @property
+    def block_size(self) -> Optional[int]:
+        ...
+    @block_size.setter
+    def block_size(self, value: Optional[int]) -> None:
+        ...
+class IParallelConfig(Protocol):
+    @property
+    def worker_cls(self) -> str:
+        ...
+    @worker_cls.setter
+    def worker_cls(self, value: str) -> None:
+        ...
+class ISchedulerConfig(Protocol):
+    @property
+    def max_num_seqs(self) -> int:
+        ...
+    @property
+    def is_multi_step(self) -> bool:
+        ...
+    @property
+    def is_multimodal_model(self) -> bool:
+        ...
+    @property
+    def disable_chunked_mm_input(self) -> bool:
+        ...
+    @disable_chunked_mm_input.setter
+    def disable_chunked_mm_input(self, value: bool) -> None:
+        ...
+    @property
+    def enable_chunked_prefill(self) -> bool:
+        ...
+    @enable_chunked_prefill.setter
+    def enable_chunked_prefill(self, value: bool) -> None:
+        ...
+    @property
+    def chunked_prefill_enabled(self) -> bool:
+        ...
+    @chunked_prefill_enabled.setter
+    def chunked_prefill_enabled(self, value: bool) -> None:
+        ...
+    @property
+    def max_model_len(self) -> int:
+        ...
+    @property
+    def max_num_batched_tokens(self) -> int:
+        ...
+    @max_num_batched_tokens.setter
+    def max_num_batched_tokens(self, value: int) -> None:
+        ...
+class ICompilationConfig(Protocol):
+    @property
+    def level(self) -> Any:
+        ...
+    @level.setter
+    def level(self, value: Any) -> None:
+        ...
+    @property
+    def backend(self) -> str:
+        ...
+    @backend.setter
+    def backend(self, value: str) -> None:
+        ...
+class ISpeculativeConfig(Protocol):
+    ...

tpu_inference/interfaces/engine.py ADDED Viewed

@@ -0,0 +1,51 @@
+"""
+This module defines the engine interface contracts required by tpu_inference.
+"""
+from typing import TYPE_CHECKING, Any, Protocol
+# tpu_inference now depends on its own, locally defined interfaces.
+from .cache import IMirroredProcessingCache
+from .outputs import IStructuredOutputManager
+from .scheduler import IScheduler
+# This block is only processed by type checkers, not at runtime.
+if TYPE_CHECKING:
+    from .outputs import IModelRunnerOutput
+class IEngineProc(Protocol):
+    """
+    A high-level interface for any process that can be launched by a client.
+    It defines the single entry point for starting the process's main loop.
+    """
+    def run_busy_loop(self) -> None:
+        ...
+class IDisaggEngineCoreProc(IEngineProc):
+    """
+    An interface for the disaggregated engine process. It inherits the common
+    IEngineProc contract.
+    """
+    pass
+class IEngineCore(Protocol):
+    """
+    An interface defining the contract for an Engine Core building block.
+    This mirrors the public API of a vLLM Engine Core that is used by the
+    DisaggEngineCoreProc.
+    """
+    scheduler: IScheduler
+    mm_input_cache_server: IMirroredProcessingCache
+    structured_output_manager: IStructuredOutputManager
+    model_executor: Any
+    def execute_model_with_error_logging(self, *args,
+                                         **kwargs) -> "IModelRunnerOutput":
+        ...
+    def shutdown(self) -> None:
+        ...

tpu_inference/interfaces/outputs.py ADDED Viewed

@@ -0,0 +1,22 @@
+"""
+Defines the abstract contracts for model and structured outputs.
+"""
+from typing import Protocol
+class IModelRunnerOutput(Protocol):
+    """
+    Abstract contract for the output of a model runner.
+    """
+    # Add methods and properties from vllm.v1.outputs.ModelRunnerOutput
+    # that tpu_inference actually uses.
+    ...
+class IStructuredOutputManager(Protocol):
+    """
+    Abstract contract for a StructuredOutputManager.
+    """
+    # Add methods and properties from vllm.v1.structured_output.StructuredOutputManager
+    # that tpu_inference actually uses.
+    ...

tpu_inference/interfaces/params.py ADDED Viewed

@@ -0,0 +1,21 @@
+"""
+Defines the abstract contracts for sampling and pooling parameters.
+"""
+from typing import Any, Protocol
+class IPoolingParams(Protocol):
+    """
+    Abstract contract for PoolingParams.
+    """
+    ...
+class ISamplingParams(Protocol):
+    """
+    Abstract contract for SamplingParams.
+    """
+    @property
+    def sampling_type(self) -> Any:
+        ...

tpu_inference/interfaces/platform.py ADDED Viewed

@@ -0,0 +1,74 @@
+"""
+Defines the abstract contract for a hardware platform.
+"""
+from typing import Any, Optional, Protocol, Union
+import torch
+from .config import IConfig
+from .params import IPoolingParams, ISamplingParams
+class IPlatform(Protocol):
+    """
+    A minimal, abstract interface for a hardware platform.
+    """
+    def can_update_inplace(self) -> bool:
+        ...
+    def check_and_update_config(self, vllm_config: IConfig) -> None:
+        ...
+    def get_attn_backend_cls(self, selected_backend: Any, head_size: int,
+                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
+                             block_size: int, use_v1: bool, use_mla: bool,
+                             has_sink: bool, use_spare: bool) -> str:
+        ...
+    def get_device_communicator_cls(self) -> str:
+        ...
+    def get_device_name(self, device_id: int = 0) -> str:
+        ...
+    def get_device_total_memory(self, device_id: int = 0) -> int:
+        ...
+    def get_infinity_values(self, dtype: torch.dtype) -> tuple[float, float]:
+        ...
+    def get_lora_vocab_padding_size(self) -> int:
+        ...
+    def get_punica_wrapper(self) -> str:
+        ...
+    def inference_mode(self) -> Any:
+        ...
+    def is_async_output_supported(self, enforce_eager: Optional[bool]) -> bool:
+        ...
+    def is_kv_cache_dtype_supported(self, kv_cache_dtype: str) -> bool:
+        ...
+    def is_pin_memory_available(self) -> bool:
+        ...
+    def set_device(self, device: torch.device) -> None:
+        ...
+    def supports_v1(self, model_config: Any) -> bool:
+        ...
+    def use_all_gather(self) -> bool:
+        ...
+    def validate_request(
+        self,
+        prompt: Any,
+        params: Union[ISamplingParams, IPoolingParams],
+        processed_inputs: Any,
+    ) -> None:
+        ...

tpu_inference/interfaces/request.py ADDED Viewed

@@ -0,0 +1,39 @@
+"""
+Defines the abstract contract for a Request.
+"""
+from typing import Any, Protocol
+class IRequest(Protocol):
+    """
+    A minimal, abstract interface for a request.
+    This protocol defines only the methods and properties that tpu_inference
+    requires to operate. Client libraries (like vLLM) will provide concrete
+    implementations that satisfy this contract.
+    """
+    @property
+    def vllm_request(self) -> Any:
+        ...
+    def is_finished(self) -> bool:
+        ...
+    def get_request_id(self) -> str:
+        ...
+    # Add mm_hashes. It's used by `if request.mm_hashes is not None:`.
+    # Add other methods and properties from vllm.v1.request.Request that are
+    # actually used by the orchestration logic.
+    # For example:
+    # @property
+    # def prompt(self) -> str: ...
+    #
+    # @property
+    # def prompt_token_ids(self) -> list[int]: ...
+    #
+    # def is_finished(self) -> bool: ...
+    #
+    # ... etc.

tpu_inference/interfaces/scheduler.py ADDED Viewed

@@ -0,0 +1,31 @@
+"""
+This module defines the scheduler interface contract required by tpu_inference.
+"""
+from typing import Dict, Protocol
+# tpu_inference now depends on its own, locally defined interfaces.
+from .cache import IEncoderCacheManager, IKVCacheManager
+from .request import IRequest
+class IScheduler(Protocol):
+    """
+    An extended interface for a scheduler, tailored to the needs
+    of advanced orchestration engines.
+    This contract is defined by tpu_inference and must be implemented by
+    any client library (like vLLM) that wishes to use this orchestrator.
+    """
+    @property
+    def requests(self) -> Dict[str, IRequest]:
+        ...
+    @property
+    def kv_cache_manager(self) -> IKVCacheManager:
+        ...
+    @property
+    def encoder_cache_manager(self) -> IEncoderCacheManager:
+        ...

tpu_inference/kernels/__init__.py ADDED Viewed

File without changes

tpu_inference/kernels/collectives/__init__.py ADDED Viewed

File without changes