tpu_inference-0.11.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tpu-inference might be problematic.

Files changed (168)
  1. tests/__init__.py +0 -0
  2. tests/core/__init__.py +0 -0
  3. tests/core/test_adapters.py +83 -0
  4. tests/core/test_core_tpu.py +523 -0
  5. tests/core/test_disagg_executor.py +60 -0
  6. tests/core/test_disagg_utils.py +53 -0
  7. tests/core/test_init.py +49 -0
  8. tests/kernels/__init__.py +0 -0
  9. tests/kernels/quantized_matmul_kernel_test.py +191 -0
  10. tests/kernels/ragged_kv_cache_update_v2_test.py +234 -0
  11. tests/kernels/ragged_paged_attention_kernel_v2_test.py +400 -0
  12. tests/kernels/ragged_paged_attention_kernel_v3_test.py +504 -0
  13. tests/lora/__init__.py +0 -0
  14. tests/lora/test_lora.py +123 -0
  15. tests/test_base.py +201 -0
  16. tests/test_quantization.py +836 -0
  17. tests/test_tpu_info.py +120 -0
  18. tests/test_utils.py +218 -0
  19. tests/tpu_backend_test.py +59 -0
  20. tpu_inference/__init__.py +30 -0
  21. tpu_inference/adapters/__init__.py +0 -0
  22. tpu_inference/adapters/vllm_adapters.py +42 -0
  23. tpu_inference/adapters/vllm_config_adapters.py +134 -0
  24. tpu_inference/backend.py +69 -0
  25. tpu_inference/core/__init__.py +0 -0
  26. tpu_inference/core/adapters.py +153 -0
  27. tpu_inference/core/core_tpu.py +776 -0
  28. tpu_inference/core/disagg_executor.py +117 -0
  29. tpu_inference/core/disagg_utils.py +51 -0
  30. tpu_inference/di/__init__.py +0 -0
  31. tpu_inference/di/abstracts.py +28 -0
  32. tpu_inference/di/host.py +76 -0
  33. tpu_inference/di/interfaces.py +51 -0
  34. tpu_inference/distributed/__init__.py +0 -0
  35. tpu_inference/distributed/tpu_connector.py +699 -0
  36. tpu_inference/distributed/utils.py +59 -0
  37. tpu_inference/executors/__init__.py +0 -0
  38. tpu_inference/executors/ray_distributed_executor.py +346 -0
  39. tpu_inference/experimental/__init__.py +0 -0
  40. tpu_inference/experimental/llama3_jax_stashed.py +258 -0
  41. tpu_inference/interfaces/__init__.py +0 -0
  42. tpu_inference/interfaces/cache.py +31 -0
  43. tpu_inference/interfaces/config.py +47 -0
  44. tpu_inference/interfaces/config_parts.py +117 -0
  45. tpu_inference/interfaces/engine.py +51 -0
  46. tpu_inference/interfaces/outputs.py +22 -0
  47. tpu_inference/interfaces/params.py +21 -0
  48. tpu_inference/interfaces/platform.py +74 -0
  49. tpu_inference/interfaces/request.py +39 -0
  50. tpu_inference/interfaces/scheduler.py +31 -0
  51. tpu_inference/kernels/__init__.py +0 -0
  52. tpu_inference/kernels/collectives/__init__.py +0 -0
  53. tpu_inference/kernels/collectives/all_gather_matmul.py +735 -0
  54. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +60 -0
  55. tpu_inference/kernels/collectives/util.py +47 -0
  56. tpu_inference/kernels/flash_attention/__init__.py +0 -0
  57. tpu_inference/kernels/flash_attention/kernel.py +772 -0
  58. tpu_inference/kernels/quantized_matmul/__init__.py +0 -0
  59. tpu_inference/kernels/quantized_matmul/kernel.py +395 -0
  60. tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
  61. tpu_inference/kernels/quantized_matmul/util.py +58 -0
  62. tpu_inference/kernels/ragged_paged_attention/__init__.py +0 -0
  63. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +0 -0
  64. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +875 -0
  65. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +287 -0
  66. tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
  67. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +0 -0
  68. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1447 -0
  69. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3834 -0
  70. tpu_inference/kernels/ragged_paged_attention/v3/util.py +47 -0
  71. tpu_inference/layers/__init__.py +0 -0
  72. tpu_inference/layers/common/__init__.py +0 -0
  73. tpu_inference/layers/common/attention_metadata.py +34 -0
  74. tpu_inference/layers/jax/__init__.py +0 -0
  75. tpu_inference/layers/jax/attention/__init__.py +0 -0
  76. tpu_inference/layers/jax/attention/attention.py +254 -0
  77. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +354 -0
  78. tpu_inference/layers/jax/attention/llama4_attention.py +153 -0
  79. tpu_inference/layers/jax/attention_interface.py +356 -0
  80. tpu_inference/layers/jax/base.py +151 -0
  81. tpu_inference/layers/jax/binary_search.py +295 -0
  82. tpu_inference/layers/jax/constants.py +88 -0
  83. tpu_inference/layers/jax/layers.py +301 -0
  84. tpu_inference/layers/jax/misc.py +16 -0
  85. tpu_inference/layers/jax/moe/__init__.py +0 -0
  86. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +608 -0
  87. tpu_inference/layers/jax/moe/moe.py +209 -0
  88. tpu_inference/layers/jax/rope.py +172 -0
  89. tpu_inference/layers/jax/rope_interface.py +214 -0
  90. tpu_inference/layers/jax/sample/__init__.py +0 -0
  91. tpu_inference/layers/jax/sample/rejection_sampler.py +515 -0
  92. tpu_inference/layers/jax/sample/sampling.py +95 -0
  93. tpu_inference/layers/jax/sample/sampling_metadata.py +69 -0
  94. tpu_inference/layers/jax/sharding.py +406 -0
  95. tpu_inference/layers/jax/transformer_block.py +76 -0
  96. tpu_inference/layers/vllm/__init__.py +0 -0
  97. tpu_inference/layers/vllm/attention.py +184 -0
  98. tpu_inference/layers/vllm/fused_moe.py +399 -0
  99. tpu_inference/layers/vllm/linear_common.py +186 -0
  100. tpu_inference/layers/vllm/quantization/__init__.py +34 -0
  101. tpu_inference/layers/vllm/quantization/awq.py +207 -0
  102. tpu_inference/layers/vllm/quantization/common.py +105 -0
  103. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +0 -0
  104. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +121 -0
  105. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +0 -0
  106. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +208 -0
  107. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +136 -0
  108. tpu_inference/layers/vllm/quantization/unquantized.py +263 -0
  109. tpu_inference/layers/vllm/sharding.py +151 -0
  110. tpu_inference/logger.py +10 -0
  111. tpu_inference/lora/__init__.py +0 -0
  112. tpu_inference/lora/torch_lora_ops.py +103 -0
  113. tpu_inference/lora/torch_punica_tpu.py +308 -0
  114. tpu_inference/mock/__init__.py +0 -0
  115. tpu_inference/mock/vllm_config_utils.py +28 -0
  116. tpu_inference/mock/vllm_envs.py +1233 -0
  117. tpu_inference/mock/vllm_logger.py +212 -0
  118. tpu_inference/mock/vllm_logging_utils.py +15 -0
  119. tpu_inference/models/__init__.py +0 -0
  120. tpu_inference/models/common/__init__.py +0 -0
  121. tpu_inference/models/common/model_loader.py +433 -0
  122. tpu_inference/models/jax/__init__.py +0 -0
  123. tpu_inference/models/jax/deepseek_v3.py +868 -0
  124. tpu_inference/models/jax/llama3.py +366 -0
  125. tpu_inference/models/jax/llama4.py +473 -0
  126. tpu_inference/models/jax/llama_eagle3.py +333 -0
  127. tpu_inference/models/jax/phi3.py +376 -0
  128. tpu_inference/models/jax/qwen2.py +375 -0
  129. tpu_inference/models/jax/qwen2_5_vl.py +976 -0
  130. tpu_inference/models/jax/qwen3.py +302 -0
  131. tpu_inference/models/jax/utils/__init__.py +0 -0
  132. tpu_inference/models/jax/utils/file_utils.py +96 -0
  133. tpu_inference/models/jax/utils/multi_modal_utils.py +164 -0
  134. tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
  135. tpu_inference/models/jax/utils/quantization/quantization_utils.py +588 -0
  136. tpu_inference/models/jax/utils/weight_utils.py +510 -0
  137. tpu_inference/models/vllm/__init__.py +0 -0
  138. tpu_inference/models/vllm/vllm_model_wrapper.py +272 -0
  139. tpu_inference/models/vllm/vllm_model_wrapper_context.py +45 -0
  140. tpu_inference/platforms/__init__.py +2 -0
  141. tpu_inference/platforms/tpu_jax.py +257 -0
  142. tpu_inference/runner/__init__.py +0 -0
  143. tpu_inference/runner/block_table_jax.py +122 -0
  144. tpu_inference/runner/compilation_manager.py +672 -0
  145. tpu_inference/runner/input_batch_jax.py +435 -0
  146. tpu_inference/runner/kv_cache.py +119 -0
  147. tpu_inference/runner/kv_cache_manager.py +460 -0
  148. tpu_inference/runner/lora_utils.py +92 -0
  149. tpu_inference/runner/multimodal_manager.py +208 -0
  150. tpu_inference/runner/persistent_batch_manager.py +244 -0
  151. tpu_inference/runner/speculative_decoding_manager.py +250 -0
  152. tpu_inference/runner/structured_decoding_manager.py +89 -0
  153. tpu_inference/runner/tpu_jax_runner.py +771 -0
  154. tpu_inference/runner/utils.py +426 -0
  155. tpu_inference/spec_decode/__init__.py +0 -0
  156. tpu_inference/spec_decode/jax/__init__.py +0 -0
  157. tpu_inference/spec_decode/jax/eagle3.py +334 -0
  158. tpu_inference/tpu_info.py +77 -0
  159. tpu_inference/utils.py +294 -0
  160. tpu_inference/worker/__init__.py +0 -0
  161. tpu_inference/worker/_temporary_vllm_compat.py +129 -0
  162. tpu_inference/worker/base.py +100 -0
  163. tpu_inference/worker/tpu_worker_jax.py +321 -0
  164. tpu_inference-0.11.1.dist-info/METADATA +101 -0
  165. tpu_inference-0.11.1.dist-info/RECORD +168 -0
  166. tpu_inference-0.11.1.dist-info/WHEEL +5 -0
  167. tpu_inference-0.11.1.dist-info/licenses/LICENSE +201 -0
  168. tpu_inference-0.11.1.dist-info/top_level.txt +2 -0
tpu_inference/core/disagg_executor.py
@@ -0,0 +1,117 @@
+# SPDX-License-Identifier: Apache-2.0
+from concurrent.futures import Future
+from multiprocessing import Lock
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+from vllm.logger import init_logger
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.cache import worker_receiver_cache_from_config
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        run_method)
+from vllm.v1.executor.abstract import Executor
+from vllm.v1.outputs import AsyncModelRunnerOutput
+from vllm.v1.worker.worker_base import WorkerWrapperBase
+
+logger = init_logger(__name__)
+
+
+class DisaggExecutor(Executor):
+
+    def _init_executor(self) -> None:
+        """Initialize the worker and load the model.
+        """
+        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config,
+                                               rpc_rank=0)
+        slice_config = getattr(self.vllm_config.device_config, "slice")
+        idx = slice_config[0]
+        jax_devices = slice_config[-1]
+        devices = []
+        if isinstance(idx, int):
+            sizes = slice_config[1]
+            start = sum(sizes[0:idx])
+            end = start + sizes[idx]
+
+            devices = jax_devices[start:end]
+            setattr(self.vllm_config.device_config, "slice",
+                    (idx + 1, sizes, jax_devices))
+            logger.debug(
+                f"Creating DisaggExecutor with {devices}, index: {start} -> {end}"
+            )
+        elif isinstance(idx, tuple):
+            slice_idx = slice_config[1]
+            sizes = slice_config[2][slice_idx]
+            start_row, start_col = idx
+            selected_devices = []
+            max_row, max_col = 0, 0
+            for device in jax_devices:
+                coords = device.coords
+                max_row = max(max_row, coords[0])
+                max_col = max(max_col, coords[1])
+                if coords[0] >= start_row and coords[0] < start_row + sizes[0]:
+                    if coords[1] >= start_col and coords[
+                            1] < start_col + sizes[1]:
+                        selected_devices.append(device)
+            max_row, max_col = max_row + 1, max_col + 1
+
+            devices = selected_devices
+            if start_col + sizes[1] >= max_col:
+                start_row += sizes[0]
+                start_col = 0
+            else:
+                start_col += sizes[1]
+
+            setattr(self.vllm_config.device_config, "slice",
+                    ((start_row, start_col), slice_idx + 1, slice_config[2],
+                     jax_devices))
+            logger.debug(
+                f"Creating DisaggExecutor with {devices}, next start: {((start_row, start_col), slice_idx + 1, slice_config[2])}"
+            )
+
+        distributed_init_method = get_distributed_init_method(
+            get_ip(), get_open_port())
+        local_rank = 0
+        rank = 0
+        is_driver_worker = True
+        kwargs = dict(
+            vllm_config=self.vllm_config,
+            local_rank=local_rank,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            is_driver_worker=is_driver_worker,
+            devices=devices,
+        )
+        self.mm_receiver_cache = worker_receiver_cache_from_config(
+            self.vllm_config, MULTIMODAL_REGISTRY, Lock())
+        self.collective_rpc("init_worker", args=([kwargs], ))
+        self.collective_rpc("init_device")
+        self.collective_rpc("load_model")
+
+    def collective_rpc(self,
+                       method: Union[str, Callable],
+                       timeout: Optional[float] = None,
+                       args: Tuple = (),
+                       kwargs: Optional[Dict] = None,
+                       non_block: bool = False) -> List[Any]:
+        if kwargs is None:
+            kwargs = {}
+
+        if not non_block:
+            return [run_method(self.driver_worker, method, args, kwargs)]
+
+        try:
+            result = run_method(self.driver_worker, method, args, kwargs)
+            if isinstance(result, AsyncModelRunnerOutput):
+                if (async_thread := self.async_output_thread) is not None:
+                    return [async_thread.submit(result.get_output)]
+                result = result.get_output()
+            future = Future[Any]()
+            future.set_result(result)
+        except Exception as e:
+            future = Future[Any]()
+            future.set_exception(e)
+        return [future]
+
+    def check_health(self) -> None:
+        # DisaggExecutor will always be healthy as long as
+        # it's running.
+        return
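For orientation, the integer branch of `_init_executor` above carves a flat device list into consecutive windows and advances the index so the next executor picks up where this one left off. Below is a minimal, self-contained sketch of that bookkeeping; `take_slice` and the `tpu-N` names are illustrative, not part of the package:

# Illustrative sketch of the 1-D slice bookkeeping in _init_executor.
# 'devices' stands in for jax.devices(); 'sizes' mirrors parsed PREFILL_SLICES.
def take_slice(idx, sizes, devices):
    start = sum(sizes[:idx])      # devices consumed by earlier slices
    end = start + sizes[idx]      # this executor's window
    return devices[start:end], (idx + 1, sizes, devices)

devices = [f"tpu-{i}" for i in range(8)]
state = (0, (4, 2, 2), devices)
mine, state = take_slice(*state)  # ['tpu-0', 'tpu-1', 'tpu-2', 'tpu-3']
print(mine)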
tpu_inference/core/disagg_utils.py
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+from typing import Tuple, Union
+
+PREFILL_SLICES = 'PREFILL_SLICES'
+DECODE_SLICES = 'DECODE_SLICES'
+
+
+def is_disagg_enabled() -> bool:
+    # We trigger our code path as long as prefill slices are set. This
+    # allows us to test interleave mode effectively with the same code
+    # path for comparison purposes.
+    return PREFILL_SLICES in os.environ
+
+
+def _parse_slices(
+        slices_str: str) -> Tuple[Union[int, Tuple[int, int]], ...]:
+    """Parse a slices environment variable into a tuple of slice sizes.
+
+    Each comma-separated entry is either 'N' (parsed as the int N) or
+    'NxM' (parsed as the tuple (N, M)). For example, '2x2,2x1,2x4'
+    yields ((2, 2), (2, 1), (2, 4)).
+
+    Raises ValueError if the slice string is malformed.
+    """
+    if not slices_str:
+        return ()
+
+    try:
+        slice_sizes = []
+        for s in slices_str.split(','):
+            dims = s.split('x')
+            if len(dims) == 1:
+                slice_sizes.append(int(dims[0]))
+            elif len(dims) == 2:
+                slice_sizes.append((int(dims[0]), int(dims[1])))
+            else:
+                raise ValueError("Each slice must be in 'N' or 'NxM' format.")
+        return tuple(slice_sizes)
+    except ValueError as e:
+        raise ValueError(f"Malformed slice string: '{slices_str}'") from e
+
+
+def get_prefill_slices() -> Tuple[Union[int, Tuple[int, int]], ...]:
+    if PREFILL_SLICES not in os.environ:
+        return ()
+    return _parse_slices(os.environ[PREFILL_SLICES])
+
+
+def get_decode_slices() -> Tuple[Union[int, Tuple[int, int]], ...]:
+    if DECODE_SLICES not in os.environ:
+        return ()
+    return _parse_slices(os.environ[DECODE_SLICES])
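Taken together, these helpers make disaggregated serving purely environment-driven. A quick usage sketch, assuming the package is importable as `tpu_inference` (the printed values follow from `_parse_slices` as written):

import os

# Hypothetical values: two prefill slices (a 2x2 mesh and a flat 4)
# and one flat decode slice of 8 chips.
os.environ['PREFILL_SLICES'] = '2x2,4'
os.environ['DECODE_SLICES'] = '8'

from tpu_inference.core import disagg_utils

assert disagg_utils.is_disagg_enabled()
print(disagg_utils.get_prefill_slices())  # ((2, 2), 4)
print(disagg_utils.get_decode_slices())   # (8,)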
tpu_inference/di/__init__.py (file without changes)
tpu_inference/di/abstracts.py
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from abc import ABC
+
+
+class AbstractModelRunnerOutput(ABC):
+    """Abstract base class for model runner output."""
+    pass
+
+
+class AbstractSchedulerOutput(ABC):
+    """Abstract base class for scheduler output."""
+    pass
+
+
+class AbstractLoRARequest(ABC):
+    """Abstract base class for LoRA request."""
+    pass
+
+
+class AbstractKVCacheConfig(ABC):
+    """Abstract base class for KV cache config."""
+    pass
+
+
+class AbstractKVCacheSpec(ABC):
+    """Abstract base class for KV cache spec."""
+    pass
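These are pure marker types: they carry no behavior and exist so host-specific objects can flow through the DI layer behind a common nominal type. A hedged sketch of how a host adapter might use one; `VllmSchedulerOutput` is hypothetical, not a package class:

# Hypothetical adapter tagging a host-specific scheduler output
# with the marker base class above.
from tpu_inference.di.abstracts import AbstractSchedulerOutput

class VllmSchedulerOutput(AbstractSchedulerOutput):
    def __init__(self, scheduled_request_ids):
        self.scheduled_request_ids = scheduled_request_ids

out = VllmSchedulerOutput(["req-0", "req-1"])
assert isinstance(out, AbstractSchedulerOutput)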
@@ -0,0 +1,76 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+ from typing import Any, Callable, Dict, Type
17
+
18
+
19
+ class DIHost:
20
+ """
21
+ A simple dependency injection host.
22
+
23
+ This host manages a graph of functions, where each function is a provider
24
+ for a specific data type and declares its own dependencies.
25
+ """
26
+
27
+ def __init__(self):
28
+ self._providers: Dict[Type, Callable[..., Any]] = {}
29
+ self._dependencies: Dict[Callable[..., Any], Dict[str, Type]] = {}
30
+
31
+ def register(self,
32
+ provider: Callable[..., Any],
33
+ output_type: Type,
34
+ dependencies: Dict[str, Type] = None):
35
+ """
36
+ Registers a provider function with the host.
37
+
38
+ Args:
39
+ provider: The function that produces the output.
40
+ output_type: The data type that the function produces.
41
+ dependencies: A dictionary mapping argument names of the provider
42
+ to the data types they require.
43
+ """
44
+ self._providers[output_type] = provider
45
+ if dependencies:
46
+ self._dependencies[provider] = dependencies
47
+
48
+ def resolve(self, target_type: Type) -> Any:
49
+ """
50
+ Resolves a dependency by creating an instance of the target type.
51
+
52
+ This method will recursively resolve all dependencies required to call
53
+ the provider for the target type.
54
+
55
+ Args:
56
+ target_type: The data type to be resolved.
57
+
58
+ Returns:
59
+ An instance of the target type.
60
+ """
61
+ if target_type not in self._providers:
62
+ raise ValueError(
63
+ f"No provider registered for type {target_type.__name__}")
64
+
65
+ provider = self._providers[target_type]
66
+
67
+ if provider not in self._dependencies:
68
+ # Provider has no dependencies, so just call it.
69
+ return provider()
70
+
71
+ # Resolve dependencies for the provider.
72
+ kwargs = {}
73
+ for arg_name, dep_type in self._dependencies[provider].items():
74
+ kwargs[arg_name] = self.resolve(dep_type)
75
+
76
+ return provider(**kwargs)
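A short usage sketch of `DIHost`; the `Config` and `Engine` types here are hypothetical stand-ins, not package classes. Note that a provider with no registered dependencies is called with no arguments, so a class itself can serve as its own provider:

# Hypothetical wiring: Config has no dependencies, Engine depends on Config.
from tpu_inference.di.host import DIHost

class Config:
    def __init__(self):
        self.model = "llama3"

class Engine:
    def __init__(self, config: Config):
        self.config = config

host = DIHost()
host.register(Config, output_type=Config)
host.register(lambda config: Engine(config),
              output_type=Engine,
              dependencies={"config": Config})

engine = host.resolve(Engine)  # resolves Config first, then builds Engine
assert engine.config.model == "llama3"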
@@ -0,0 +1,51 @@
1
+ """
2
+ Copyright 2025 Google LLC
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ """
16
+ import abc
17
+
18
+
19
+ class HostInterface(abc.ABC):
20
+ """
21
+ An interface that the host system (e.g., SGLang, vLLM) must implement.
22
+ This defines the contract for how the backend can call back into the host.
23
+ """
24
+
25
+ @abc.abstractmethod
26
+ def get_next_batch_to_run(self):
27
+ """
28
+ The backend calls this to get the next batch of requests to process.
29
+ """
30
+ pass
31
+
32
+ @abc.abstractmethod
33
+ def process_batch_result(self, batch_result):
34
+ """
35
+ The backend calls this to return the results of a processed batch.
36
+ """
37
+ pass
38
+
39
+
40
+ class BackendInterface(abc.ABC):
41
+ """
42
+ An interface that the backend system (e.g., tpu_inference) must implement.
43
+ This defines the contract for how the host can call into the backend.
44
+ """
45
+
46
+ @abc.abstractmethod
47
+ def launch_tpu_batch(self, batch_to_launch):
48
+ """
49
+ The host calls this to launch a batch of requests on the backend.
50
+ """
51
+ pass
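The two interfaces form a simple inversion-of-control loop: the backend pulls batches from the host and pushes results back. A toy round-trip, with `ToyHost` and `ToyBackend` as hypothetical implementations:

# Hypothetical host/backend pair wired through the interfaces above.
from tpu_inference.di.interfaces import BackendInterface, HostInterface

class ToyHost(HostInterface):
    def __init__(self, batches):
        self.batches = list(batches)
        self.results = []

    def get_next_batch_to_run(self):
        return self.batches.pop(0) if self.batches else None

    def process_batch_result(self, batch_result):
        self.results.append(batch_result)

class ToyBackend(BackendInterface):
    def __init__(self, host: HostInterface):
        self.host = host

    def launch_tpu_batch(self, batch_to_launch):
        # A real backend would run the batch on TPU; here we just echo it.
        self.host.process_batch_result(f"done:{batch_to_launch}")

host = ToyHost(["batch-0", "batch-1"])
backend = ToyBackend(host)
while (batch := host.get_next_batch_to_run()) is not None:
    backend.launch_tpu_batch(batch)
print(host.results)  # ['done:batch-0', 'done:batch-1']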
tpu_inference/distributed/__init__.py (file without changes)