tpu-inference 0.0.1rc1__py3-none-any.whl → 0.11.1.dev202511130813__py3-none-any.whl

This diff compares two publicly available versions of the package as released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.

Potentially problematic release.

Files changed (67)
  1. tests/kernels/fused_moe_v1_test.py +34 -303
  2. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +2 -2
  3. tests/lora/test_layers.py +6 -0
  4. tests/lora/utils.py +8 -0
  5. tests/test_utils.py +16 -24
  6. tpu_inference/__init__.py +3 -22
  7. tpu_inference/core/core_tpu.py +9 -17
  8. tpu_inference/core/disagg_utils.py +8 -6
  9. tpu_inference/distributed/tpu_connector.py +4 -3
  10. tpu_inference/distributed/utils.py +2 -3
  11. tpu_inference/envs.py +8 -61
  12. tpu_inference/executors/ray_distributed_executor.py +11 -31
  13. tpu_inference/kernels/fused_moe/v1/kernel.py +110 -641
  14. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +54 -77
  15. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +143 -287
  16. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +0 -7
  17. tpu_inference/layers/jax/attention/attention.py +1 -1
  18. tpu_inference/layers/{common → jax}/attention_interface.py +2 -8
  19. tpu_inference/layers/jax/sample/rejection_sampler.py +1 -1
  20. tpu_inference/layers/jax/sample/sampling.py +2 -2
  21. tpu_inference/layers/{common → jax}/sharding.py +5 -5
  22. tpu_inference/layers/vllm/attention.py +1 -1
  23. tpu_inference/layers/vllm/fused_moe.py +208 -170
  24. tpu_inference/layers/vllm/quantization/__init__.py +3 -7
  25. tpu_inference/layers/vllm/quantization/awq.py +3 -4
  26. tpu_inference/layers/vllm/quantization/common.py +1 -6
  27. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +2 -4
  28. tpu_inference/layers/vllm/quantization/unquantized.py +67 -62
  29. tpu_inference/layers/vllm/sharding.py +2 -2
  30. tpu_inference/lora/torch_punica_tpu.py +2 -1
  31. tpu_inference/mock/__init__.py +0 -0
  32. tpu_inference/mock/vllm_config_utils.py +28 -0
  33. tpu_inference/mock/vllm_envs.py +1219 -0
  34. tpu_inference/mock/vllm_logger.py +212 -0
  35. tpu_inference/mock/vllm_logging_utils.py +15 -0
  36. tpu_inference/models/common/model_loader.py +12 -46
  37. tpu_inference/models/jax/llama3.py +3 -4
  38. tpu_inference/models/jax/llama_eagle3.py +5 -8
  39. tpu_inference/models/jax/phi3.py +376 -0
  40. tpu_inference/models/jax/qwen2.py +2 -3
  41. tpu_inference/models/jax/qwen2_5_vl.py +50 -165
  42. tpu_inference/models/jax/qwen3.py +2 -3
  43. tpu_inference/models/jax/utils/quantization/quantization_utils.py +6 -3
  44. tpu_inference/models/jax/utils/weight_utils.py +143 -198
  45. tpu_inference/models/vllm/vllm_model_wrapper.py +14 -32
  46. tpu_inference/platforms/tpu_platform.py +34 -47
  47. tpu_inference/runner/compilation_manager.py +60 -145
  48. tpu_inference/runner/kv_cache.py +2 -2
  49. tpu_inference/runner/kv_cache_manager.py +18 -17
  50. tpu_inference/runner/persistent_batch_manager.py +2 -40
  51. tpu_inference/runner/structured_decoding_manager.py +3 -2
  52. tpu_inference/runner/tpu_runner.py +135 -283
  53. tpu_inference/runner/utils.py +2 -2
  54. tpu_inference/spec_decode/jax/eagle3.py +21 -71
  55. tpu_inference/tpu_info.py +3 -4
  56. tpu_inference/utils.py +15 -38
  57. tpu_inference/worker/tpu_worker.py +26 -163
  58. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/METADATA +3 -4
  59. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/RECORD +63 -61
  60. tests/test_envs.py +0 -203
  61. tpu_inference/layers/common/quant_methods.py +0 -8
  62. tpu_inference/layers/vllm/quantization/mxfp4.py +0 -331
  63. tpu_inference/models/jax/llama_guard_4.py +0 -361
  64. /tpu_inference/layers/{common → jax}/binary_search.py +0 -0
  65. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/WHEEL +0 -0
  66. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/licenses/LICENSE +0 -0
  67. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/top_level.txt +0 -0

tpu_inference/layers/vllm/quantization/awq.py
@@ -18,7 +18,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
     is_layer_skipped, unpack_quantized_values_into_int32)
 from vllm.scalar_type import scalar_types
 
-from tpu_inference.layers.common.quant_methods import AWQ, get_tpu_quant_method
 from tpu_inference.layers.vllm.linear_common import (
     slice_sharded_tensor_for_concatenation, torch_to_jax_param)
 from tpu_inference.layers.vllm.quantization.common import (
@@ -30,12 +29,12 @@ P = PartitionSpec
 logger = init_logger(__name__)
 
 
-@register_quantization_config(get_tpu_quant_method(AWQ))
+@register_quantization_config("jax-awq")
 class VllmAWQConfig(AWQConfig, JaxCommonConfig):
 
     @classmethod
-    def get_name(cls):
-        return AWQ
+    def get_name(cls) -> str:
+        return "jax-awq"
 
     def get_supported_act_dtypes(self) -> list[torch.dtype]:
         # NOTE: AWQ checkpoint was quantized with float16. But on TPUs, using
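
The AWQ config above, and the compressed-tensors and unquantized configs further down, all switch from the removed get_tpu_quant_method() indirection to literal registry names. A minimal sketch of the register-by-name pattern (using a made-up registry and DemoAWQConfig class, not vLLM's real implementation) shows why the decorator argument and get_name() must stay in sync:

# Illustrative registry only; names below are hypothetical.
_QUANT_REGISTRY: dict[str, type] = {}

def register_quantization_config(name: str):
    def _decorator(cls: type) -> type:
        _QUANT_REGISTRY[name] = cls  # key the class under its public name
        return cls
    return _decorator

@register_quantization_config("jax-awq")
class DemoAWQConfig:
    @classmethod
    def get_name(cls) -> str:
        return "jax-awq"  # must return the same string as the registered key

assert _QUANT_REGISTRY["jax-awq"] is DemoAWQConfig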

tpu_inference/layers/vllm/quantization/common.py
@@ -61,12 +61,7 @@ class JaxCommonLinearConfig
                 " bad performance.", type(layer))
 
         self.bias_sharding = P(self.weight_sharding[0])
-        if isinstance(self.weight_sharding[0], tuple):
-            self.n_shards = 1
-            for axis in self.weight_sharding[0]:
-                self.n_shards *= self.mesh.shape.get(axis, 1)
-        else:
-            self.n_shards = self.mesh.shape.get(self.weight_sharding[0], 1)
+        self.n_shards = self.mesh.shape.get(self.weight_sharding[0], 1)
 
     def get_input_sharding(self, x: torchax.tensor.Tensor):
         if self.enable_sequence_parallelism:
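
The deleted branch handled a PartitionSpec entry that is a tuple of mesh axis names (one tensor dimension sharded across several axes); the new code keeps only the single-axis case. A small sketch, using a plain dict and a hypothetical 2x2 mesh as a stand-in for mesh.shape, shows what each path computes:

# mesh.shape behaves like a mapping from axis name to axis size.
mesh_shape = {"data": 2, "model": 2}  # hypothetical 2x2 mesh

def n_shards_for(spec_entry):
    # Old behaviour: a tuple entry shards one tensor dim over several axes.
    if isinstance(spec_entry, tuple):
        n = 1
        for axis in spec_entry:
            n *= mesh_shape.get(axis, 1)
        return n
    # New behaviour keeps only the single-axis (or unsharded) case.
    return mesh_shape.get(spec_entry, 1)

assert n_shards_for("model") == 2            # single axis
assert n_shards_for(("data", "model")) == 4  # tuple of axes (old path only)
assert n_shards_for(None) == 1               # unsharded dimension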

tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py
@@ -16,8 +16,6 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     find_matched_target, should_ignore_layer)
 
-from tpu_inference.layers.common.quant_methods import (COMPRESSED_TENSORS,
-                                                       get_tpu_quant_method)
 from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
 from tpu_inference.layers.vllm.quantization.compressed_tensors.compressed_tensors_moe import \
     VllmCompressedTensorsW8A8Fp8MoEMethod
@@ -32,12 +30,12 @@ P = PartitionSpec
 logger = init_logger(__name__)
 
 
-@register_quantization_config(get_tpu_quant_method(COMPRESSED_TENSORS))
+@register_quantization_config("jax-compressed-tensors")
 class VllmCompressedTensorsConfig(CompressedTensorsConfig, JaxCommonConfig):
 
     @classmethod
     def get_name(cls) -> str:
-        return COMPRESSED_TENSORS
+        return "jax-compressed-tensors"
 
     def get_scheme(self,
                    layer: torch.nn.Module,

tpu_inference/layers/vllm/quantization/unquantized.py
@@ -23,9 +23,7 @@ from vllm.model_executor.layers.quantization.base_config import (
 
 from tpu_inference import envs
 from tpu_inference.kernels.fused_moe.v1.kernel import fused_ep_moe
-from tpu_inference.layers.common.quant_methods import (UNQUANTIZED,
-                                                       get_tpu_quant_method)
-from tpu_inference.layers.vllm.fused_moe import fused_moe_func
+from tpu_inference.layers.vllm.fused_moe import fused_moe_func_padded
 from tpu_inference.layers.vllm.linear_common import (
     reorder_concatenated_tensor_for_sharding,
     slice_sharded_tensor_for_concatenation, torch_to_jax_param)
@@ -36,12 +34,12 @@ P = PartitionSpec
 logger = init_logger(__name__)
 
 
-@register_quantization_config(get_tpu_quant_method(UNQUANTIZED))
+@register_quantization_config("jax-unquantized")
 class VllmUnquantizedConfig(QuantizationConfig, JaxCommonConfig):
 
     @classmethod
     def get_name(cls) -> str:
-        return UNQUANTIZED
+        return "jax-unquantized"
 
     @classmethod
     def get_supported_act_dtypes(cls) -> list[torch.dtype]:
@@ -108,8 +106,6 @@ class VllmUnquantizedLinearMethod(UnquantizedLinearMethod):
               layer: torch.nn.Module,
               x: torch.Tensor,
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        assert isinstance(layer, LinearBase)
-
         with jax.named_scope(layer._get_name()):
             if in_sharding := self.jax_config.get_input_sharding(x):
                 x.shard_(NamedSharding(self.jax_config.mesh, in_sharding))
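
For readers unfamiliar with the two JAX APIs in this hunk: jax.named_scope only labels the enclosed operations in traces and profiles, while NamedSharding attaches a mesh and PartitionSpec to an array placement. A self-contained sketch on a single-device mesh (so it runs anywhere; all names and shapes below are illustrative):

import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# Single-device mesh for the sketch; on a TPU pod the "model" axis would span
# many chips.
mesh = Mesh(np.array(jax.devices()[:1]), axis_names=("model",))

x = jnp.ones((8, 16))
w = jnp.ones((16, 32))

# The named scope labels the matmul in traces; device_put applies the
# requested (here trivial) sharding, mirroring x.shard_(...) in the diff.
with jax.named_scope("demo_linear"):
    x = jax.device_put(x, NamedSharding(mesh, P(None, None)))
    y = x @ w
print(y.shape)  # (8, 32)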
@@ -168,18 +164,18 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
                  ep_axis_name: str = 'model'):
         super().__init__(moe)
         self.mesh = mesh
-        self.use_kernel = envs.USE_MOE_EP_KERNEL and moe.use_ep
+        self.use_kernel = envs.USE_MOE_EP_KERNEL
         self.ep_axis_name = ep_axis_name
         # TODO: Use autotune table once we have it.
         self.block_size = {
-            "bt": 64,
-            "bf": 1024,
-            "bd1": 1536,
-            "bd2": 1536,
-            "btc": 64,
-            "bfc": 1024,
-            "bd1c": 1536,
-            "bd2c": 1536,
+            "bt": 16,
+            "bf": 384,
+            "bd1": 512,
+            "bd2": 512,
+            "btc": 16,
+            "bfc": 384,
+            "bd1c": 256,
+            "bd2c": 256,
         }
 
     def select_gemm_impl(
@@ -193,11 +189,10 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         assert isinstance(layer, FusedMoE)
+
         w13_weight = t2j(layer.w13_weight, use_dlpack=False)
         w2_weight = t2j(layer.w2_weight, use_dlpack=False)
 
-        num_experts, hidden_size, intermediate_size = w2_weight.shape
-
         if self.moe.has_bias:
             w13_bias = t2j(layer.w13_bias, use_dlpack=False)
             w2_bias = t2j(layer.w2_bias, use_dlpack=False)
@@ -216,56 +211,76 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
             w3_bias = w13_bias[:, 1::2]
             w13_bias = jnp.concat([w1_bias, w3_bias], axis=1)
 
-        if self.use_kernel:
+        if self.use_kernel and layer.use_ep:
             # Kernel expects:
             # w13: (num_experts, 2, hidden_size, intermediate_size)
             # w2: (num_experts, intermediate_size, hidden_size)
             # Current format:
             # w13_weight: (num_experts, 2*intermediate_size, hidden_size)
             # w2_weight: (num_experts, hidden_size, intermediate_size)
+            num_experts = w13_weight.shape[0]
+            intermediate_size = w13_weight.shape[1] // 2
+            hidden_size = w13_weight.shape[2]
 
+            # Reshape and transpose w13_weight to (num_experts, 2, hidden_size, intermediate_size)
             w13_reshaped = w13_weight.reshape(num_experts, 2,
                                               intermediate_size, hidden_size)
+            w13_weight_transposed = jnp.transpose(w13_reshaped, (0, 1, 3, 2))
 
-            # Transpose non-contracting dim to right-most dim
-            w13_weight_transposed = jnp.swapaxes(w13_reshaped, 2, 3)
-            w2_weight_transposed = jnp.swapaxes(w2_weight, 1, 2)
+            # Transpose w2_weight to (num_experts, intermediate_size, hidden_size)
+            w2_weight_transposed = jnp.transpose(w2_weight, (0, 2, 1))
 
             # Apply EP sharding
-            ep_sharding = NamedSharding(self.mesh, P("model"))
-
             w13_weight = jax.device_put(
-                w13_weight_transposed, Format(Layout((0, 1, 2, 3)),
-                                              ep_sharding))
-            w2_weight = jax.device_put(w2_weight_transposed,
-                                       Format(Layout((0, 1, 2)), ep_sharding))
+                w13_weight_transposed,
+                Format(Layout((0, 1, 2, 3)),
+                       NamedSharding(self.mesh, P("model", None, None, None))))
+            w2_weight = jax.device_put(
+                w2_weight_transposed,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P("model", None, None))))
 
             if self.moe.has_bias:
                 w13_bias = w13_bias.reshape(num_experts, 2, intermediate_size)
+
+                # Apply EP sharding
                 w13_bias = jax.device_put(
-                    w13_bias, Format(Layout((0, 1, 2)), ep_sharding))
-                w2_bias = jax.device_put(w2_bias,
-                                         Format(Layout((0, 1)), ep_sharding))
-        else:
+                    w13_bias,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P("model", None, None))))
+                w2_bias = jax.device_put(
+                    w2_bias,
+                    Format(Layout((0, 1)),
+                           NamedSharding(self.mesh, P("model", None))))
 
+        else:
+            # Original logic for non-kernel path
             if layer.use_ep:
-                ep_sharding = NamedSharding(self.mesh, P("model"))
                 w13_weight = jax.device_put(
-                    w13_weight, Format(Layout((0, 1, 2)), ep_sharding))
+                    w13_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P("model", None, None))))
                 w2_weight = jax.device_put(
-                    w2_weight, Format(Layout((0, 1, 2)), ep_sharding))
+                    w2_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P("model", None, None))))
 
                 if self.moe.has_bias:
                     w13_bias = jax.device_put(
-                        w13_bias, Format(Layout((0, 1)), ep_sharding))
+                        w13_bias,
+                        Format(Layout((0, 1)),
+                               NamedSharding(self.mesh, P("model", None))))
                     w2_bias = jax.device_put(
-                        w2_bias, Format(Layout((0, 1)), ep_sharding))
+                        w2_bias,
+                        Format(Layout((0, 1)),
+                               NamedSharding(self.mesh, P("model", None))))
 
             else:
+                intermediate_size = w13_weight.shape[1] // 2
+                assert intermediate_size == w2_weight.shape[-1]
                 output_sizes = [intermediate_size, intermediate_size]
                 n_shards = self.mesh.shape["model"]
                 assert intermediate_size % n_shards == 0
-
                 w13_weight = reorder_concatenated_tensor_for_sharding(
                     w13_weight, output_sizes, n_shards, dim=1)
                 w13_weight = jax.device_put(
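
The kernel path now derives num_experts, intermediate_size and hidden_size from w13_weight and reorders both weight tensors into the layout the EP kernel expects. A short sketch with dummy sizes, checking only the shape bookkeeping described in the comments above:

import jax.numpy as jnp

# Dummy sizes for illustration only.
E, I, H = 4, 6, 8   # num_experts, intermediate_size, hidden_size

w13 = jnp.zeros((E, 2 * I, H))   # loaded format: (E, 2*I, H)
w2 = jnp.zeros((E, H, I))        # loaded format: (E, H, I)

# Shapes are derived from w13 exactly as in the new code path.
num_experts = w13.shape[0]
intermediate_size = w13.shape[1] // 2
hidden_size = w13.shape[2]

# (E, 2*I, H) -> (E, 2, I, H) -> (E, 2, H, I): kernel layout for w13.
w13_kernel = jnp.transpose(
    w13.reshape(num_experts, 2, intermediate_size, hidden_size), (0, 1, 3, 2))
# (E, H, I) -> (E, I, H): kernel layout for w2.
w2_kernel = jnp.transpose(w2, (0, 2, 1))

assert w13_kernel.shape == (E, 2, H, I)
assert w2_kernel.shape == (E, I, H)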
@@ -326,40 +341,30 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
             raise NotImplementedError(
                 "Only softmax is supported for scoring_func")
 
-        x = jax_view(x)
-        w13_weight = jax_view(layer.w13_weight)
-        w2_weight = jax_view(layer.w2_weight)
-        w13_bias = w2_bias = None
-        if self.moe.has_bias:
-            w13_bias = jax_view(layer.w13_bias)
-            w2_bias = jax_view(layer.w2_bias)
-        gating_output = jax_view(router_logits)
-
         if self.use_kernel and layer.use_ep:
             output = fused_ep_moe(
                 mesh=self.mesh,
-                tokens=x,
-                w1=w13_weight,
-                w2=w2_weight,
-                b1=w13_bias,
-                b2=w2_bias,
-                gating_output=gating_output,
+                tokens=jax_view(x),
+                w1=jax_view(layer.w13_weight),
+                w2=jax_view(layer.w2_weight),
+                gating_output=jax_view(router_logits),
                 top_k=top_k,
                 ep_axis_name=self.ep_axis_name,
-                renormalize_topk_logits=renormalize,
-                act_fn=activation,
                 **self.block_size,
             )
         else:
-            output = fused_moe_func(
-                hidden_states=x,
-                w1=w13_weight,
-                w2=w2_weight,
-                w1_bias=w13_bias,
-                w2_bias=w2_bias,
-                gating_output=gating_output,
+            # Use the original implementation
+            output = fused_moe_func_padded(
+                jax_view(x),
+                jax_view(layer.w13_weight),
+                jax_view(layer.w2_weight),
+                jax_view(layer.w13_bias) if self.moe.has_bias else None,
+                jax_view(layer.w2_bias) if self.moe.has_bias else None,
+                jax_view(router_logits),
                 topk=top_k,
+                global_num_experts=global_num_experts,
                 renormalize=renormalize,
+                reduce_results=layer.reduce_results,
                 mesh=self.mesh,
                 use_ep=layer.use_ep,
                 activation=activation,
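
Both branches compute the same top-k MoE routing, either through the fused EP Pallas kernel or through fused_moe_func_padded. For intuition, here is a dense, unfused reference of that computation; reference_moe is a made-up name, the [gate; up] split of w13 and the SiLU gating are assumptions, and this illustrates the semantics rather than either fused implementation:

import jax
import jax.numpy as jnp

def reference_moe(x, w13, w2, gating_logits, top_k, renormalize=True):
    """Dense reference of top-k MoE routing (illustration only). Assumes w13
    stacks gate and up projections as (E, 2*I, H) with SiLU gating, and w2 is
    (E, H, I), matching the shapes named in the hunks above."""
    T, H = x.shape
    E = w13.shape[0]
    I = w13.shape[1] // 2

    probs = jax.nn.softmax(gating_logits, axis=-1)        # (T, E)
    topk_probs, topk_idx = jax.lax.top_k(probs, top_k)    # (T, k)
    if renormalize:
        topk_probs = topk_probs / topk_probs.sum(-1, keepdims=True)

    # Compute every expert for every token (dense; the fused kernels avoid this).
    h13 = jnp.einsum("th,eih->tei", x, w13)               # (T, E, 2I)
    gate, up = h13[..., :I], h13[..., I:]
    h = jax.nn.silu(gate) * up                            # (T, E, I)
    expert_out = jnp.einsum("tei,ehi->teh", h, w2)        # (T, E, H)

    # Keep only each token's top-k experts, weighted by routing probabilities.
    weights = jnp.zeros((T, E)).at[jnp.arange(T)[:, None], topk_idx].set(topk_probs)
    return jnp.einsum("te,teh->th", weights, expert_out)  # (T, H)

# Tiny smoke test with random weights.
key = jax.random.PRNGKey(0)
x = jax.random.normal(key, (3, 8))
w13 = jax.random.normal(key, (4, 2 * 16, 8))
w2 = jax.random.normal(key, (4, 8, 16))
logits = jax.random.normal(key, (3, 4))
print(reference_moe(x, w13, w2, logits, top_k=2).shape)  # (3, 8)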

tpu_inference/layers/vllm/sharding.py
@@ -19,7 +19,6 @@ from vllm.lora.layers.base_linear import BaseLinearLayerWithLoRA
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 
-from tpu_inference import envs
 from tpu_inference.logger import init_logger
 
 P = PartitionSpec
@@ -212,7 +211,8 @@ def _shard_module_to_tpu(model: torch.nn.Module, mesh: Mesh) -> None:
 def _sharded_device_put(tensor: jax.Array, sharding) -> jax.Array:
     if isinstance(tensor, tuple):
         return tuple(_sharded_device_put(t, sharding) for t in tensor)
-    multihost_backend = envs.TPU_MULTIHOST_BACKEND
+    import os
+    multihost_backend = os.environ.get("TPU_MULTIHOST_BACKEND", "").lower()
     if multihost_backend != "ray":
         return jax.device_put(tensor, sharding)
 

tpu_inference/lora/torch_punica_tpu.py
@@ -239,6 +239,7 @@ class PunicaWrapperTPU(PunicaWrapperBase):
         lora_index_to_id: list[Optional[int]],
         max_loras: int,
         vocab_size: int,
+        extra_vocab_size: int,
     ):
         # Pad the prompt mapping to avoid running into recompiles on the TPU
         # TODO: Should this happen inside mapping internally? If so how can we
@@ -257,7 +258,7 @@ class PunicaWrapperTPU(PunicaWrapperBase):
             lora_index_to_id,
             max_loras,
             vocab_size,
-            0,  # extra_vocab_size
+            extra_vocab_size,
             "cpu",
         )
         with torchax.default_env():

tpu_inference/mock/vllm_config_utils.py (new file)
@@ -0,0 +1,28 @@
+from dataclasses import dataclass, field
+from typing import Any, List, Mapping
+
+
+@dataclass
+class ModelConfig():
+    max_model_len: int = 2048
+    max_prefill_len: int = 1024
+    prefill_batch_size: int = 1
+    decode_batch_size: int = 1
+    block_size: int = 16
+    num_layers: int = 32
+    num_kv_heads: int = 32
+    head_dim: int = 128
+    vocab_size: int = 32000
+    model: str = "llama3"
+    hf_config: str = ""
+    architectures: List[str] = field(default_factory=list)
+    override_generation_config: dict[str, Any] = field(default_factory=dict)
+    hf_overrides: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class VllmConfig():
+    additional_config: Mapping[str, Any] = field(default_factory=dict)
+    # Set default max_model_len to turn off warnings.
+    model_config: ModelConfig = field(
+        default_factory=lambda: ModelConfig(max_model_len=1024))
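
These mock classes are lightweight stand-ins for vLLM's config objects. A hypothetical usage example, relying only on the defaults defined above (the "enable_foo" key is made up):

from tpu_inference.mock.vllm_config_utils import ModelConfig, VllmConfig

cfg = VllmConfig()
assert cfg.model_config.max_model_len == 1024   # set by the default_factory
assert cfg.additional_config == {}

custom = VllmConfig(
    additional_config={"enable_foo": True},     # hypothetical key
    model_config=ModelConfig(model="llama3", max_model_len=4096),
)
print(custom.model_config.block_size)  # 16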