tpu-inference 0.11.1.dev202511150811__py3-none-any.whl → 0.11.1.dev202512030818__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of tpu-inference might be problematic.
- tests/kernels/fused_moe_v1_test.py +303 -34
- tests/lora/test_layers.py +0 -6
- tests/lora/utils.py +0 -8
- tests/test_envs.py +32 -11
- tests/test_utils.py +1 -2
- tpu_inference/__init__.py +22 -3
- tpu_inference/core/disagg_utils.py +6 -8
- tpu_inference/distributed/tpu_connector.py +3 -4
- tpu_inference/distributed/utils.py +3 -2
- tpu_inference/envs.py +61 -8
- tpu_inference/executors/ray_distributed_executor.py +31 -11
- tpu_inference/kernels/fused_moe/v1/kernel.py +641 -110
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +77 -54
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +213 -126
- tpu_inference/layers/common/attention_interface.py +7 -1
- tpu_inference/layers/common/sharding.py +5 -5
- tpu_inference/layers/vllm/fused_moe.py +74 -25
- tpu_inference/layers/vllm/quantization/common.py +6 -1
- tpu_inference/layers/vllm/quantization/mxfp4.py +137 -62
- tpu_inference/layers/vllm/quantization/unquantized.py +107 -113
- tpu_inference/layers/vllm/sharding.py +2 -2
- tpu_inference/lora/torch_punica_tpu.py +1 -2
- tpu_inference/models/common/model_loader.py +45 -11
- tpu_inference/models/jax/llama3.py +2 -1
- tpu_inference/models/jax/llama_eagle3.py +8 -5
- tpu_inference/models/jax/llama_guard_4.py +361 -0
- tpu_inference/models/jax/qwen2.py +2 -1
- tpu_inference/models/jax/qwen2_5_vl.py +163 -48
- tpu_inference/models/jax/qwen3.py +2 -1
- tpu_inference/models/jax/utils/quantization/quantization_utils.py +3 -6
- tpu_inference/models/jax/utils/weight_utils.py +198 -143
- tpu_inference/models/vllm/vllm_model_wrapper.py +14 -7
- tpu_inference/platforms/tpu_platform.py +28 -22
- tpu_inference/runner/compilation_manager.py +144 -59
- tpu_inference/runner/kv_cache_manager.py +17 -18
- tpu_inference/runner/persistent_batch_manager.py +40 -2
- tpu_inference/runner/structured_decoding_manager.py +2 -3
- tpu_inference/runner/tpu_runner.py +271 -147
- tpu_inference/runner/utils.py +2 -2
- tpu_inference/spec_decode/jax/eagle3.py +71 -21
- tpu_inference/tpu_info.py +4 -3
- tpu_inference/utils.py +36 -13
- tpu_inference/worker/tpu_worker.py +162 -25
- {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/METADATA +3 -2
- {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/RECORD +48 -53
- tpu_inference/mock/__init__.py +0 -0
- tpu_inference/mock/vllm_config_utils.py +0 -28
- tpu_inference/mock/vllm_envs.py +0 -1219
- tpu_inference/mock/vllm_logger.py +0 -212
- tpu_inference/mock/vllm_logging_utils.py +0 -15
- tpu_inference/models/jax/phi3.py +0 -376
- {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/WHEEL +0 -0
- {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.11.1.dev202511150811.dist-info → tpu_inference-0.11.1.dev202512030818.dist-info}/top_level.txt +0 -0
tpu_inference/layers/common/sharding.py
@@ -1,6 +1,5 @@
 import json
 import math
-import os
 from dataclasses import asdict, dataclass
 from typing import TYPE_CHECKING, List, Optional
 
@@ -8,7 +7,7 @@ import jax.numpy as jnp
 import numpy as np
 from jax.sharding import Mesh
 
-from tpu_inference import utils
+from tpu_inference import envs, utils
 
 if TYPE_CHECKING:
     from vllm.v1.configs.vllm_config import VllmConfig
@@ -48,7 +47,7 @@ class ShardingAxisName2D:
 
 
 try:
-    _use_base_sharding =
+    _use_base_sharding = envs.NEW_MODEL_DESIGN
     if _use_base_sharding:
         ShardingAxisName = ShardingAxisNameBase
     else:
@@ -166,9 +165,10 @@ class ShardingConfigManager:
                 f"LoRA is not supported with data parallelism "
                 f"(DP size: {total_dp_size}). Please disable LoRA or "
                 f"set data parallelism to 1.")
-
+        if sharding_strategy.attention_data_parallelism > 1:
+            if not envs.NEW_MODEL_DESIGN:
                 raise ValueError(
-                    "Must run DP with NEW_MODEL_DESIGN enabled. Please set the "
+                    "Must run Attention DP with NEW_MODEL_DESIGN enabled. Please set the "
                     "NEW_MODEL_DESIGN=True.")
 
     @property
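Note: these sharding.py hunks swap direct environment lookups for typed flags on tpu_inference/envs.py (also touched in this release, +61 -8). As a rough, hypothetical sketch of how a boolean flag like NEW_MODEL_DESIGN is typically surfaced by such a module (the real envs.py may differ):

    import os

    def _bool_env(name: str, default: bool = False) -> bool:
        # Treat the usual truthy spellings as True; anything else is False.
        return os.environ.get(name, str(default)).lower() in ("1", "true")

    NEW_MODEL_DESIGN = _bool_env("NEW_MODEL_DESIGN")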
tpu_inference/layers/vllm/fused_moe.py
@@ -110,7 +110,8 @@ def tensor_sharded_gmm_merged_column_parallel(
     # adapted from https://github.com/pytorch/xla/blob/1d409399474197c484894be90b75d9855393dda5/torch_xla/experimental/custom_kernel.py#L1401
     m, k, g = lhs.shape[0], lhs.shape[1], rhs.shape[0]
     n = rhs.shape[1] if transpose_rhs else rhs.shape[2]
-    tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n,
+    tm, tk, tn = _get_tiling_size_for_gmm_kernel(m // mesh.shape["data"], k, n,
+                                                 g)
 
     _gmm = functools.partial(
         gmm,
@@ -123,14 +124,26 @@ def tensor_sharded_gmm_merged_column_parallel(
     gmm_result = shard_map(
         _gmm,
         mesh=mesh,
-        in_specs=(P(), P(None, "model", None), P()),
-        out_specs=(P(
+        in_specs=(P("data", None), P(None, "model", None), P("data")),
+        out_specs=(P("data", "model")),
         check_rep=False,
     )(lhs, rhs, group_sizes)
 
     if rhs_bias is not None:
-
-
+
+        def _add_bias(gmm_result_local, rhs_bias_local, group_sizes_global):
+            rhs_bis = jnp.repeat(rhs_bias_local,
+                                 group_sizes_global,
+                                 0,
+                                 total_repeat_length=m // mesh.shape["data"])
+            return (gmm_result_local + rhs_bis).astype(gmm_result_local.dtype)
+
+        gmm_result = shard_map(
+            _add_bias,
+            mesh=mesh,
+            in_specs=(P("data", "model"), P(None, "model"), P("data")),
+            out_specs=(P("data", "model")),
+        )(gmm_result, rhs_bias, group_sizes)
 
     n_shards = mesh.shape["model"]
     output_sizes = [intermediate_size, intermediate_size]
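Note: the new _add_bias helper relies on jnp.repeat to expand the per-expert bias so it lines up row-for-row with the expert-sorted gmm output. A toy, single-host illustration of just that alignment (shapes invented for the example, not the package's code):

    import jax.numpy as jnp

    g, n, m = 2, 4, 4                              # experts, features, rows
    bias = jnp.arange(g * n, dtype=jnp.float32).reshape(g, n)
    group_sizes = jnp.array([3, 1])                # rows owned by each expert
    gmm_out = jnp.zeros((m, n))                    # stand-in for the gmm result

    # Row i of `bias` is repeated group_sizes[i] times; total_repeat_length
    # pins the output shape so the op stays shape-static under jit.
    repeated = jnp.repeat(bias, group_sizes, 0, total_repeat_length=m)
    print((gmm_out + repeated)[2])                 # still expert 0's bias
    print((gmm_out + repeated)[3])                 # expert 1's bias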
@@ -150,7 +163,8 @@ def tensor_sharded_gmm_row_parallel(
     # adapted from https://github.com/pytorch/xla/blob/1d409399474197c484894be90b75d9855393dda5/torch_xla/experimental/custom_kernel.py#L1401
     m, k, g = lhs.shape[0], lhs.shape[1], rhs.shape[0]
     n = rhs.shape[1] if transpose_rhs else rhs.shape[2]
-    tm, tk, tn = _get_tiling_size_for_gmm_kernel(m, k, n,
+    tm, tk, tn = _get_tiling_size_for_gmm_kernel(m // mesh.shape["data"], k, n,
+                                                 g)
 
     _gmm = functools.partial(
         gmm,
@@ -167,14 +181,25 @@ def tensor_sharded_gmm_row_parallel(
     gmm_result = shard_map(
         _gmm_all_reduce,
         mesh=mesh,
-        in_specs=(P(
-        out_specs=(P()),
+        in_specs=(P("data", "model"), P(None, None, "model"), P("data")),
+        out_specs=(P("data")),
         check_rep=False,
     )(lhs, rhs, group_sizes)
-
     if rhs_bias is not None:
-
-
+
+        def _add_bias(gmm_result_local, rhs_bias_local, group_sizes_global):
+            rhs_bis = jnp.repeat(rhs_bias_local,
+                                 group_sizes_global,
+                                 0,
+                                 total_repeat_length=m // mesh.shape["data"])
+            return (gmm_result_local + rhs_bis).astype(gmm_result_local.dtype)
+
+        gmm_result = shard_map(
+            _add_bias,
+            mesh=mesh,
+            in_specs=(P("data"), P(), P("data")),
+            out_specs=(P("data")),
+        )(gmm_result, rhs_bias, group_sizes)
 
     return gmm_result
 
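Note: in the row-parallel path each "model" shard now also owns a "data" slice of the rows, while partial products over the sharded contraction dimension are still combined with an all-reduce inside shard_map. A minimal runnable sketch of that psum pattern on a one-axis mesh (toy shapes, not the package's _gmm_all_reduce):

    import numpy as np
    import jax
    import jax.numpy as jnp
    from jax.experimental.shard_map import shard_map
    from jax.sharding import Mesh, PartitionSpec as P

    mesh = Mesh(np.array(jax.devices()[:1]), ("model",))
    lhs = jnp.ones((4, 8))             # contraction dim k=8, sharded on "model"
    rhs = jnp.ones((8, 3))

    def _mm_all_reduce(a, b):
        # Each shard multiplies its k-slice; psum sums partials across the axis.
        return jax.lax.psum(a @ b, "model")

    out = shard_map(_mm_all_reduce, mesh=mesh,
                    in_specs=(P(None, "model"), P("model", None)),
                    out_specs=P(None, None),
                    check_rep=False)(lhs, rhs)
    print(out[0, 0])                   # 8.0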
@@ -366,15 +391,27 @@ def fused_moe_func(
     topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdims=True)
     topk_weights = topk_weights.astype(dtype)
 
-
-
-
-
-
-
-
-
-
+    def _process_tokens_locally(hidden_states_local, topk_indices_local):
+        num_tokens_local = hidden_states_local.shape[0]
+        topk_indices_flat = topk_indices_local.flatten()
+        topk_argsort_indices = jnp.argsort(topk_indices_flat)
+        topk_argsort_revert_indices = jnp.argsort(topk_argsort_indices)
+        token_indices = jnp.arange(num_tokens_local,
+                                   dtype=jnp.int32).repeat(topk)
+        token_indices_sorted = token_indices[topk_argsort_indices]
+        group_sizes_local = jnp.bincount(topk_indices_flat,
+                                         length=global_num_experts)
+
+        x = hidden_states_local[token_indices_sorted]
+        return x, group_sizes_local, topk_argsort_revert_indices
+
+    x, group_sizes, topk_argsort_revert_indices = shard_map(
+        _process_tokens_locally,
+        mesh=mesh,
+        in_specs=(P("data", None), P("data", None)),
+        out_specs=(P("data", None), P("data"), P("data")),
+        check_rep=False,
+    )(hidden_states, topk_indices)
     if use_ep:
         x = expert_sharded_gmm(
             x,
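Note: _process_tokens_locally is the classic sort-by-expert routing trick, now applied per "data" shard. Stripped of the shard_map wrapper, the core index bookkeeping looks like this (toy values, chosen for the example):

    import jax.numpy as jnp

    num_tokens, topk, num_experts, hidden = 4, 2, 3, 8
    hidden_states = jnp.arange(num_tokens * hidden,
                               dtype=jnp.float32).reshape(num_tokens, hidden)
    topk_indices = jnp.array([[0, 2], [1, 1], [2, 0], [0, 1]])

    flat = topk_indices.flatten()
    order = jnp.argsort(flat)                  # token slots grouped by expert id
    revert = jnp.argsort(order)                # inverse permutation for later
    token_ids = jnp.arange(num_tokens, dtype=jnp.int32).repeat(topk)
    x = hidden_states[token_ids[order]]        # expert-contiguous rows for the gmm
    group_sizes = jnp.bincount(flat, length=num_experts)
    print(group_sizes)                         # [3 3 2]
    assert (x[revert] == hidden_states[token_ids]).all()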
@@ -411,7 +448,7 @@
         )
     else:
         x = jax.lax.with_sharding_constraint(
-            x, NamedSharding(mesh, P(
+            x, NamedSharding(mesh, P("data", "model")))
         x = tensor_sharded_gmm_row_parallel(
             x,
             w2,
@@ -421,13 +458,25 @@
         mesh=mesh,
     )
 
-
-
-
+    def _finalize_output(x_local, topk_argsort_revert_indices_local,
+                         topk_weights_local):
+        x_local = x_local[topk_argsort_revert_indices_local].reshape(
+            -1, topk, hidden_size)
+        x_local = x_local * jnp.expand_dims(topk_weights_local, axis=-1)
+        x_local = x_local.sum(axis=-2)
+        return x_local
+
+    x = shard_map(
+        _finalize_output,
+        mesh=mesh,
+        in_specs=(P("data", None), P("data"), P("data", None)),
+        out_specs=(P("data", None)),
+        check_rep=False,
+    )(x, topk_argsort_revert_indices, topk_weights)
     x = x.reshape(orig_shape)
 
     if reduce_results:
-        x = jax.lax.with_sharding_constraint(x, NamedSharding(mesh, P()))
+        x = jax.lax.with_sharding_constraint(x, NamedSharding(mesh, P("data")))
     return x
 
 
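Note: _finalize_output then undoes that sort and folds the top-k expert outputs back into one row per token. The same three steps in isolation (identity permutation and uniform weights, just to show the shapes):

    import jax.numpy as jnp

    num_tokens, topk, hidden = 4, 2, 8
    x = jnp.ones((num_tokens * topk, hidden))      # expert outputs, expert-sorted
    revert = jnp.arange(num_tokens * topk)         # inverse permutation (identity here)
    topk_weights = jnp.full((num_tokens, topk), 0.5)

    out = x[revert].reshape(-1, topk, hidden)      # regroup the top-k rows per token
    out = out * jnp.expand_dims(topk_weights, -1)  # weight each expert's contribution
    out = out.sum(axis=-2)                         # combine: (num_tokens, hidden)
    print(out.shape, float(out[0, 0]))             # (4, 8) 1.0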
tpu_inference/layers/vllm/quantization/common.py
@@ -61,7 +61,12 @@ class JaxCommonLinearConfig:
                 " bad performance.", type(layer))
 
         self.bias_sharding = P(self.weight_sharding[0])
-
+        if isinstance(self.weight_sharding[0], tuple):
+            self.n_shards = 1
+            for axis in self.weight_sharding[0]:
+                self.n_shards *= self.mesh.shape.get(axis, 1)
+        else:
+            self.n_shards = self.mesh.shape.get(self.weight_sharding[0], 1)
 
     def get_input_sharding(self, x: torchax.tensor.Tensor):
         if self.enable_sequence_parallelism:
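Note: the new n_shards logic accounts for PartitionSpec entries that are tuples of mesh axes; the shard count is then the product of the axis sizes. A standalone sketch of the same computation (mesh shape chosen for the example):

    import numpy as np
    import jax
    from jax.sharding import Mesh

    mesh = Mesh(np.array(jax.devices()[:1]).reshape(1, 1), ("data", "model"))

    def shard_count(spec_entry):
        # A spec entry is a mesh axis name, a tuple of axis names, or None.
        if isinstance(spec_entry, tuple):
            n = 1
            for axis in spec_entry:
                n *= mesh.shape.get(axis, 1)
            return n
        return mesh.shape.get(spec_entry, 1)

    print(shard_count("model"))                    # size of the "model" axis
    print(shard_count(("data", "model")))          # product of both axis sizes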
tpu_inference/layers/vllm/quantization/mxfp4.py
@@ -24,6 +24,8 @@ from vllm.model_executor.layers.quantization.mxfp4 import (Mxfp4Backend,
 from vllm.model_executor.layers.quantization.utils.quant_utils import \
     is_layer_skipped
 
+from tpu_inference import envs
+from tpu_inference.kernels.fused_moe.v1.kernel import fused_ep_moe
 from tpu_inference.layers.common.quant_methods import (MXFP4,
                                                        get_tpu_quant_method)
 from tpu_inference.layers.vllm.fused_moe import fused_moe_func_padded
@@ -93,7 +95,8 @@ class VllmMxfp4Config(Mxfp4Config, JaxCommonConfig):
                 "UnquantizedLinearMethod.")
             return VllmUnquantizedLinearMethod(linear_config)
         elif isinstance(layer, FusedMoE):
-
+            moe_config = self.get_moe_config(layer)
+            return VllmMxfp4MoEMethod(moe_config, self.mesh)
         elif isinstance(layer, Attention):
             # TODO: Add support for MXFP4 Attention.
             logger.warning_once("MXFP4 attention layer is not implemented. "
@@ -103,13 +106,30 @@ class VllmMxfp4Config(Mxfp4Config, JaxCommonConfig):
 
 class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
 
-    def __init__(self,
+    def __init__(self,
+                 moe: FusedMoEConfig,
+                 mesh: Mesh,
+                 ep_axis_name: str = 'model'):
         FusedMoEMethodBase.__init__(self, moe)
 
         # We piggyback on triton implementation as it applies minimal hardware
         # specific post processing to the weights.
         self.mxfp4_backend = Mxfp4Backend.TRITON
+
         self.mesh = mesh
+        self.use_kernel = envs.USE_MOE_EP_KERNEL
+        self.ep_axis_name = ep_axis_name
+        # TODO: Use autotune table once we have it.
+        self.block_size = {
+            "bt": 64,
+            "bf": 1024,
+            "bd1": 1536,
+            "bd2": 1536,
+            "btc": 64,
+            "bfc": 1024,
+            "bd1c": 1536,
+            "bd2c": 1536,
+        }
 
     def get_fused_moe_quant_config(
             self, layer: torch.nn.Module) -> FusedMoEQuantConfig | None:
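Note: the block_size dict is later splatted into the kernel call as keyword tuning parameters (see the fused_ep_moe call in the last hunk below). The pattern in miniature, with a hypothetical kernel signature standing in for fused_ep_moe:

    def fake_kernel(tokens, *, bt, bf, bd1, bd2, btc, bfc, bd1c, bd2c):
        # Stand-in for the kernel's tuning knobs; names are illustrative only.
        return bt + bf

    block_size = {"bt": 64, "bf": 1024, "bd1": 1536, "bd2": 1536,
                  "btc": 64, "bfc": 1024, "bd1c": 1536, "bd2c": 1536}
    print(fake_kernel("x", **block_size))          # 1088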
@@ -122,6 +142,7 @@ class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
 
     def process_weights_after_loading(self, layer: torch.nn.Module):
         assert isinstance(layer, FusedMoE)
+        assert layer.moe_config.has_bias, "mxfp4 quantization always uses bias."
 
         w13_weight = u8_unpack_e2m1(t2j(layer.w13_weight, use_dlpack=False))
         w13_weight_scale = e8m0_to_fp32(
@@ -157,57 +178,95 @@ class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
         w3_bias = w13_bias[:, 1::2]
         w13_bias = jnp.concat([w1_bias, w3_bias], axis=1)
 
-
-
+        if self.use_kernel and layer.use_ep:
+            # Kernel expects:
+            #   w13: (num_experts, 2, hidden_size, intermediate_size)
+            #   w2: (num_experts, intermediate_size, hidden_size)
+            # Current format:
+            #   w13_weight: (num_experts, 2*intermediate_size, hidden_size)
+            #   w2_weight: (num_experts, hidden_size, intermediate_size)
+            num_experts = w13_weight.shape[0]
+            intermediate_size = w13_weight.shape[1] // 2
+            hidden_size = w13_weight.shape[2]
+
+            # Reshape and transpose w13_weight to (num_experts, 2, hidden_size, intermediate_size)
+            w13_reshaped = w13_weight.reshape(num_experts, 2,
+                                              intermediate_size, hidden_size)
+            w13_weight_transposed = jnp.transpose(w13_reshaped, (0, 1, 3, 2))
+
+            # Transpose w2_weight to (num_experts, intermediate_size, hidden_size)
+            w2_weight_transposed = jnp.transpose(w2_weight, (0, 2, 1))
+
+            # Apply EP sharding
             w13_weight = jax.device_put(
-
-                Format(Layout((0, 1, 2)),
-                       NamedSharding(self.mesh, P("model", None, None))))
+                w13_weight_transposed,
+                Format(Layout((0, 1, 2, 3)),
+                       NamedSharding(self.mesh, P("model", None, None, None))))
             w2_weight = jax.device_put(
-
+                w2_weight_transposed,
                 Format(Layout((0, 1, 2)),
                        NamedSharding(self.mesh, P("model", None, None))))
 
-
-                w13_bias,
-
-
-
-
-
-
+            if self.moe.has_bias:
+                w13_bias = w13_bias.reshape(num_experts, 2, intermediate_size)
+
+                # Apply EP sharding
+                w13_bias = jax.device_put(
+                    w13_bias,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P("model", None, None))))
+                w2_bias = jax.device_put(
+                    w2_bias,
+                    Format(Layout((0, 1)),
+                           NamedSharding(self.mesh, P("model", None))))
 
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if layer.use_ep:
+                w13_weight = jax.device_put(
+                    w13_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P("model", None, None))))
+                w2_weight = jax.device_put(
+                    w2_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P("model", None, None))))
+
+                w13_bias = jax.device_put(
+                    w13_bias,
+                    Format(Layout((0, 1)),
+                           NamedSharding(self.mesh, P("model", None))))
+                w2_bias = jax.device_put(
+                    w2_bias,
+                    Format(Layout((0, 1)),
+                           NamedSharding(self.mesh, P("model", None))))
+
+            else:
+                intermediate_size = w13_weight.shape[1] // 2
+                assert intermediate_size == w2_weight.shape[-1]
+                output_sizes = [intermediate_size, intermediate_size]
+                n_shards = self.mesh.shape["model"]
+                assert intermediate_size % n_shards == 0
+                w13_weight = reorder_concatenated_tensor_for_sharding(
+                    w13_weight, output_sizes, n_shards, dim=1)
+                w13_weight = jax.device_put(
+                    w13_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P(None, "model", None))))
+                w2_weight = jax.device_put(
+                    w2_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P(None, None, "model"))))
+
+                w13_bias = reorder_concatenated_tensor_for_sharding(
+                    w13_bias, output_sizes, n_shards, dim=1)
+                w13_bias = jax.device_put(
+                    w13_bias,
+                    Format(Layout((0, 1)),
+                           NamedSharding(self.mesh, P(None, "model"))))
+                w2_bias = jax.device_put(
+                    w2_bias,
+                    Format(Layout((0, 1)),
+                           NamedSharding(self.mesh, P(None, None))))
 
         layer.w13_weight = Parameter(torch_view(w13_weight),
                                      requires_grad=False)
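Note: every weight-placement branch above follows one pattern: jax.device_put with a NamedSharding that shards the leading num_experts dimension across "model" for EP, plus a Format(Layout(...), ...) wrapper to pin the on-device layout. A minimal sketch with the layout wrapper omitted (toy shapes, not the package's call):

    import numpy as np
    import jax
    import jax.numpy as jnp
    from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

    mesh = Mesh(np.array(jax.devices()[:1]), ("model",))
    w13 = jnp.zeros((8, 2, 128, 64))   # (num_experts, 2, hidden, intermediate)

    # Shard experts across "model"; all other dims stay replicated per device.
    w13 = jax.device_put(w13, NamedSharding(mesh, P("model", None, None, None)))
    print(w13.sharding.spec)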
@@ -246,21 +305,37 @@ class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
             raise NotImplementedError(
                 "Only softmax is supported for scoring_func")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if self.use_kernel and layer.use_ep:
+            output = fused_ep_moe(
+                mesh=self.mesh,
+                tokens=jax_view(x),
+                w1=jax_view(layer.w13_weight),
+                w2=jax_view(layer.w2_weight),
+                b1=jax_view(layer.w13_bias),
+                b2=jax_view(layer.w2_bias),
+                gating_output=jax_view(router_logits),
+                top_k=top_k,
+                ep_axis_name=self.ep_axis_name,
+                renormalize_topk_logits=renormalize,
+                act_fn=activation,
+                **self.block_size,
+            )
+        else:
+            # Use the original implementation
+            output = fused_moe_func_padded(
+                jax_view(x),
+                jax_view(layer.w13_weight),
+                jax_view(layer.w2_weight),
+                jax_view(layer.w13_bias),
+                jax_view(layer.w2_bias),
+                jax_view(router_logits),
+                topk=top_k,
+                global_num_experts=global_num_experts,
+                renormalize=renormalize,
+                reduce_results=layer.reduce_results,
+                mesh=self.mesh,
+                use_ep=layer.use_ep,
+                activation=activation,
+            )
 
         return torch_view(output)