tpu-inference 0.0.1rc1__py3-none-any.whl → 0.11.1.dev202511180814__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tpu-inference might be problematic.

Files changed (56)
  1. tests/kernels/fused_moe_v1_test.py +34 -303
  2. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +2 -2
  3. tests/lora/test_layers.py +6 -0
  4. tests/lora/utils.py +8 -0
  5. tests/test_envs.py +11 -32
  6. tests/test_utils.py +2 -1
  7. tpu_inference/__init__.py +3 -22
  8. tpu_inference/core/disagg_utils.py +8 -6
  9. tpu_inference/distributed/tpu_connector.py +4 -3
  10. tpu_inference/distributed/utils.py +2 -3
  11. tpu_inference/envs.py +8 -61
  12. tpu_inference/executors/ray_distributed_executor.py +2 -9
  13. tpu_inference/kernels/fused_moe/v1/kernel.py +110 -641
  14. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +54 -77
  15. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +145 -266
  16. tpu_inference/layers/common/attention_interface.py +1 -7
  17. tpu_inference/layers/common/sharding.py +5 -5
  18. tpu_inference/layers/vllm/fused_moe.py +208 -170
  19. tpu_inference/layers/vllm/quantization/common.py +1 -6
  20. tpu_inference/layers/vllm/quantization/mxfp4.py +73 -138
  21. tpu_inference/layers/vllm/quantization/unquantized.py +64 -58
  22. tpu_inference/layers/vllm/sharding.py +2 -2
  23. tpu_inference/lora/torch_punica_tpu.py +2 -1
  24. tpu_inference/mock/__init__.py +0 -0
  25. tpu_inference/mock/vllm_config_utils.py +28 -0
  26. tpu_inference/mock/vllm_envs.py +1219 -0
  27. tpu_inference/mock/vllm_logger.py +212 -0
  28. tpu_inference/mock/vllm_logging_utils.py +15 -0
  29. tpu_inference/models/common/model_loader.py +10 -43
  30. tpu_inference/models/jax/llama3.py +1 -2
  31. tpu_inference/models/jax/llama_eagle3.py +5 -8
  32. tpu_inference/models/jax/phi3.py +376 -0
  33. tpu_inference/models/jax/qwen2.py +1 -2
  34. tpu_inference/models/jax/qwen2_5_vl.py +48 -163
  35. tpu_inference/models/jax/qwen3.py +1 -2
  36. tpu_inference/models/jax/utils/quantization/quantization_utils.py +6 -3
  37. tpu_inference/models/jax/utils/weight_utils.py +143 -198
  38. tpu_inference/models/vllm/vllm_model_wrapper.py +8 -14
  39. tpu_inference/platforms/tpu_platform.py +31 -37
  40. tpu_inference/runner/compilation_manager.py +58 -141
  41. tpu_inference/runner/kv_cache.py +1 -1
  42. tpu_inference/runner/kv_cache_manager.py +18 -17
  43. tpu_inference/runner/persistent_batch_manager.py +2 -40
  44. tpu_inference/runner/structured_decoding_manager.py +3 -2
  45. tpu_inference/runner/tpu_runner.py +147 -271
  46. tpu_inference/runner/utils.py +2 -2
  47. tpu_inference/spec_decode/jax/eagle3.py +21 -71
  48. tpu_inference/tpu_info.py +3 -4
  49. tpu_inference/utils.py +13 -36
  50. tpu_inference/worker/tpu_worker.py +25 -162
  51. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/METADATA +3 -4
  52. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/RECORD +55 -50
  53. tpu_inference/models/jax/llama_guard_4.py +0 -361
  54. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/WHEEL +0 -0
  55. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/licenses/LICENSE +0 -0
  56. {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511180814.dist-info}/top_level.txt +0 -0
tpu_inference/layers/vllm/quantization/mxfp4.py
@@ -24,11 +24,9 @@ from vllm.model_executor.layers.quantization.mxfp4 import (Mxfp4Backend,
 from vllm.model_executor.layers.quantization.utils.quant_utils import \
     is_layer_skipped
 
-from tpu_inference import envs
-from tpu_inference.kernels.fused_moe.v1.kernel import fused_ep_moe
 from tpu_inference.layers.common.quant_methods import (MXFP4,
                                                         get_tpu_quant_method)
-from tpu_inference.layers.vllm.fused_moe import fused_moe_func
+from tpu_inference.layers.vllm.fused_moe import fused_moe_func_padded
 from tpu_inference.layers.vllm.linear_common import \
     reorder_concatenated_tensor_for_sharding
 from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
@@ -87,14 +85,17 @@ class VllmMxfp4Config(Mxfp4Config, JaxCommonConfig):
                 fused_mapping=self.packed_modules_mapping,
             ):
                 return VllmUnquantizedLinearMethod(linear_config)
+            # TODO: Add support for MXFP4 Linear Method.
+            # MXFP4 LinearMethod is available in AMD-Quark, refer to that
+            # implementation if you are interested in enabling MXFP4 here.
             logger.warning_once(
                 "MXFP4 linear layer is not implemented - falling back to "
                 "UnquantizedLinearMethod.")
             return VllmUnquantizedLinearMethod(linear_config)
         elif isinstance(layer, FusedMoE):
-            moe_config = self.get_moe_config(layer)
-            return VllmMxfp4MoEMethod(moe_config, self.mesh)
+            return VllmMxfp4MoEMethod(layer.moe_config, self.mesh)
         elif isinstance(layer, Attention):
+            # TODO: Add support for MXFP4 Attention.
             logger.warning_once("MXFP4 attention layer is not implemented. "
                                 "Skipping quantization for this layer.")
             return None
@@ -102,30 +103,13 @@ class VllmMxfp4Config(Mxfp4Config, JaxCommonConfig):
 
 class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
 
-    def __init__(self,
-                 moe: FusedMoEConfig,
-                 mesh: Mesh,
-                 ep_axis_name: str = 'model'):
+    def __init__(self, moe: FusedMoEConfig, mesh: Mesh):
         FusedMoEMethodBase.__init__(self, moe)
 
         # We piggyback on triton implementation as it applies minimal hardware
         # specific post processing to the weights.
         self.mxfp4_backend = Mxfp4Backend.TRITON
-
         self.mesh = mesh
-        self.use_kernel = envs.USE_MOE_EP_KERNEL and moe.use_ep
-        self.ep_axis_name = ep_axis_name
-        # TODO: Use autotune table once we have it.
-        self.block_size = {
-            "bt": 64,
-            "bf": 1024,
-            "bd1": 1536,
-            "bd2": 1536,
-            "btc": 64,
-            "bfc": 1024,
-            "bd1c": 1536,
-            "bd2c": 1536,
-        }
 
     def get_fused_moe_quant_config(
             self, layer: torch.nn.Module) -> FusedMoEQuantConfig | None:
@@ -138,7 +122,6 @@ class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
 
     def process_weights_after_loading(self, layer: torch.nn.Module):
         assert isinstance(layer, FusedMoE)
-        assert layer.moe_config.has_bias, "mxfp4 quantization alwyas use bias."
 
         w13_weight = u8_unpack_e2m1(t2j(layer.w13_weight, use_dlpack=False))
         w13_weight_scale = e8m0_to_fp32(
@@ -157,8 +140,6 @@ class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
         w2_weight = dequantize_block_weight(w2_weight, w2_weight_scale,
                                             MXFP4_BLOCK_SIZE, jnp.bfloat16)
 
-        num_experts, hidden_size, intermediate_size = w2_weight.shape
-
         # Because we have dequantized weights, scales are not used anymore.
         delattr(layer, "w13_weight_scale")
         delattr(layer, "w2_weight_scale")
@@ -176,89 +157,63 @@ class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
         w3_bias = w13_bias[:, 1::2]
         w13_bias = jnp.concat([w1_bias, w3_bias], axis=1)
 
-        if self.use_kernel:
-            # Kernel expects:
-            # w13: (num_experts, 2, hidden_size, intermediate_size)
-            # w2: (num_experts, intermediate_size, hidden_size)
-            # Current format:
-            # w13_weight: (num_experts, 2*intermediate_size, hidden_size)
-            # w2_weight: (num_experts, hidden_size, intermediate_size)
-
-            w13_reshaped = w13_weight.reshape(num_experts, 2,
-                                              intermediate_size, hidden_size)
-
-            # Transpose non-constracting dim to right most dim
-            w13_weight_transposed = jnp.swapaxes(w13_reshaped, 2, 3)
-            w2_weight_transposed = jnp.swapaxes(w2_weight, 1, 2)
-
-            # Apply EP sharding
-            ep_sharding = NamedSharding(self.mesh, P("model"))
-
+        # TODO(kyuyeunk): Add weight processing logic for the new kernel.
+        if layer.use_ep:
             w13_weight = jax.device_put(
-                w13_weight_transposed, Format(Layout((0, 1, 2, 3)),
-                                              ep_sharding))
-            w2_weight = jax.device_put(w2_weight_transposed,
-                                       Format(Layout((0, 1, 2)), ep_sharding))
-
-            w13_bias = w13_bias.reshape(num_experts, 2, intermediate_size)
-            w13_bias = jax.device_put(w13_bias,
-                                      Format(Layout((0, 1, 2)), ep_sharding))
-            w2_bias = jax.device_put(w2_bias,
-                                     Format(Layout((0, 1)), ep_sharding))
+                w13_weight,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P("model", None, None))))
+            w2_weight = jax.device_put(
+                w2_weight,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P("model", None, None))))
+
+            w13_bias = jax.device_put(
+                w13_bias,
+                Format(Layout((0, 1)),
+                       NamedSharding(self.mesh, P("model", None))))
+            w2_bias = jax.device_put(
+                w2_bias,
+                Format(Layout((0, 1)),
+                       NamedSharding(self.mesh, P("model", None))))
 
         else:
-            if layer.use_ep:
-                ep_sharding = NamedSharding(self.mesh, P("model"))
-                w13_weight = jax.device_put(
-                    w13_weight, Format(Layout((0, 1, 2)), ep_sharding))
-                w2_weight = jax.device_put(
-                    w2_weight, Format(Layout((0, 1, 2)), ep_sharding))
-
-                w13_bias = jax.device_put(w13_bias,
-                                          Format(Layout((0, 1)), ep_sharding))
-                w2_bias = jax.device_put(w2_bias,
-                                         Format(Layout((0, 1)), ep_sharding))
-
-            else:
-                output_sizes = [intermediate_size, intermediate_size]
-                n_shards = self.mesh.shape["model"]
-                assert intermediate_size % n_shards == 0
-
-                w13_weight = reorder_concatenated_tensor_for_sharding(
-                    w13_weight,
-                    output_sizes,
-                    n_shards,
-                    dim=1,
-                )
-                w13_weight = jax.device_put(
-                    w13_weight,
-                    Format(Layout((0, 1, 2)),
-                           NamedSharding(self.mesh, P(None, "model", None))))
-                w2_weight = jax.device_put(
-                    w2_weight,
-                    Format(Layout((0, 1, 2)),
-                           NamedSharding(self.mesh, P(None, None, "model"))))
-
-                w13_bias = reorder_concatenated_tensor_for_sharding(
-                    w13_bias,
-                    output_sizes,
-                    n_shards,
-                    dim=1,
-                )
-                w13_bias = jax.device_put(
-                    w13_bias,
-                    Format(Layout((0, 1)),
-                           NamedSharding(self.mesh, P(None, "model"))))
-                w2_bias = jax.device_put(
-                    w2_bias,
-                    Format(Layout((0, 1)),
-                           NamedSharding(self.mesh, P(None, None))))
+            intermediate_size = w13_weight.shape[1] // 2
+            assert intermediate_size == w2_weight.shape[-1]
+            output_sizes = [intermediate_size, intermediate_size]
+            n_shards = self.mesh.shape["model"]
+            assert intermediate_size % n_shards == 0
+            w13_weight = reorder_concatenated_tensor_for_sharding(w13_weight,
+                                                                  output_sizes,
+                                                                  n_shards,
+                                                                  dim=1)
+            w13_weight = jax.device_put(
+                w13_weight,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P(None, "model", None))))
+            w2_weight = jax.device_put(
+                w2_weight,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P(None, None, "model"))))
+
+            w13_bias = reorder_concatenated_tensor_for_sharding(w13_bias,
+                                                                output_sizes,
+                                                                n_shards,
+                                                                dim=1)
+            w13_bias = jax.device_put(
+                w13_bias,
+                Format(Layout((0, 1)),
+                       NamedSharding(self.mesh, P(None, "model"))))
+            w2_bias = jax.device_put(
+                w2_bias,
+                Format(Layout((0, 1)), NamedSharding(self.mesh, P(None,
+                                                                  None))))
 
         layer.w13_weight = Parameter(torch_view(w13_weight),
                                      requires_grad=False)
-        layer.w2_weight = Parameter(torch_view(w2_weight), requires_grad=False)
-
         layer.w13_bias = Parameter(torch_view(w13_bias), requires_grad=False)
+
+        layer.w2_weight = Parameter(torch_view(w2_weight), requires_grad=False)
         layer.w2_bias = Parameter(torch_view(w2_bias), requires_grad=False)
 
         pass
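
The tensor-parallel branch above splits the concatenated gate/up projection (dim 1 of w13) and the last dim of w2 across the "model" mesh axis. The following is a minimal standalone JAX sketch of that placement, not the package's code: it uses toy shapes, assumes the sharded dims divide the device count, and shows only the NamedSharding part (the real code additionally pins a device-local layout via Format/Layout).

import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# 1D mesh over all local devices, with the axis named "model" as in the diff.
mesh = Mesh(np.array(jax.devices()), axis_names=("model",))

num_experts, intermediate_size, hidden_size = 8, 1024, 512
w13 = jnp.zeros((num_experts, 2 * intermediate_size, hidden_size), jnp.bfloat16)
w2 = jnp.zeros((num_experts, hidden_size, intermediate_size), jnp.bfloat16)

# Shard the concatenated intermediate dim of w13 and the last dim of w2.
w13 = jax.device_put(w13, NamedSharding(mesh, P(None, "model", None)))
w2 = jax.device_put(w2, NamedSharding(mesh, P(None, None, "model")))
print(w13.sharding, w2.sharding)
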
@@ -291,41 +246,21 @@ class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
             raise NotImplementedError(
                 "Only softmax is supported for scoring_func")
 
-        x = jax_view(x)
-        w13_weight = jax_view(layer.w13_weight)
-        w2_weight = jax_view(layer.w2_weight)
-        w13_bias = jax_view(layer.w13_bias)
-        w2_bias = jax_view(layer.w2_bias)
-        gating_output = jax_view(router_logits)
-
-        if self.use_kernel:
-            output = fused_ep_moe(
-                mesh=self.mesh,
-                tokens=x,
-                w1=w13_weight,
-                w2=w2_weight,
-                b1=w13_bias,
-                b2=w2_bias,
-                gating_output=gating_output,
-                top_k=top_k,
-                ep_axis_name=self.ep_axis_name,
-                renormalize_topk_logits=renormalize,
-                act_fn=activation,
-                **self.block_size,
-            )
-        else:
-            output = fused_moe_func(
-                hidden_states=x,
-                w1=w13_weight,
-                w2=w2_weight,
-                w1_bias=w13_bias,
-                w2_bias=w2_bias,
-                gating_output=gating_output,
-                topk=top_k,
-                renormalize=renormalize,
-                mesh=self.mesh,
-                use_ep=layer.use_ep,
-                activation=activation,
-            )
+        # Use the original implementation
+        output = fused_moe_func_padded(
+            jax_view(x),
+            jax_view(layer.w13_weight),
+            jax_view(layer.w2_weight),
+            jax_view(layer.w13_bias) if self.moe.has_bias else None,
+            jax_view(layer.w2_bias) if self.moe.has_bias else None,
+            jax_view(router_logits),
+            topk=top_k,
+            global_num_experts=global_num_experts,
+            renormalize=renormalize,
+            reduce_results=layer.reduce_results,
+            mesh=self.mesh,
+            use_ep=layer.use_ep,
+            activation=activation,
+        )
 
         return torch_view(output)
tpu_inference/layers/vllm/quantization/unquantized.py
@@ -25,7 +25,7 @@ from tpu_inference import envs
 from tpu_inference.kernels.fused_moe.v1.kernel import fused_ep_moe
 from tpu_inference.layers.common.quant_methods import (UNQUANTIZED,
                                                         get_tpu_quant_method)
-from tpu_inference.layers.vllm.fused_moe import fused_moe_func
+from tpu_inference.layers.vllm.fused_moe import fused_moe_func_padded
 from tpu_inference.layers.vllm.linear_common import (
     reorder_concatenated_tensor_for_sharding,
     slice_sharded_tensor_for_concatenation, torch_to_jax_param)
@@ -108,8 +108,6 @@ class VllmUnquantizedLinearMethod(UnquantizedLinearMethod):
               layer: torch.nn.Module,
               x: torch.Tensor,
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        assert isinstance(layer, LinearBase)
-
         with jax.named_scope(layer._get_name()):
             if in_sharding := self.jax_config.get_input_sharding(x):
                 x.shard_(NamedSharding(self.jax_config.mesh, in_sharding))
@@ -168,18 +166,18 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
                  ep_axis_name: str = 'model'):
         super().__init__(moe)
         self.mesh = mesh
-        self.use_kernel = envs.USE_MOE_EP_KERNEL and moe.use_ep
+        self.use_kernel = envs.USE_MOE_EP_KERNEL
         self.ep_axis_name = ep_axis_name
         # TODO: Use autotune table once we have it.
         self.block_size = {
-            "bt": 64,
-            "bf": 1024,
-            "bd1": 1536,
-            "bd2": 1536,
-            "btc": 64,
-            "bfc": 1024,
-            "bd1c": 1536,
-            "bd2c": 1536,
+            "bt": 16,
+            "bf": 384,
+            "bd1": 512,
+            "bd2": 512,
+            "btc": 16,
+            "bfc": 384,
+            "bd1c": 256,
+            "bd2c": 256,
         }
 
     def select_gemm_impl(
@@ -196,8 +194,6 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
         w13_weight = t2j(layer.w13_weight, use_dlpack=False)
         w2_weight = t2j(layer.w2_weight, use_dlpack=False)
 
-        num_experts, hidden_size, intermediate_size = w2_weight.shape
-
         if self.moe.has_bias:
             w13_bias = t2j(layer.w13_bias, use_dlpack=False)
             w2_bias = t2j(layer.w2_bias, use_dlpack=False)
@@ -216,56 +212,76 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
             w3_bias = w13_bias[:, 1::2]
             w13_bias = jnp.concat([w1_bias, w3_bias], axis=1)
 
-        if self.use_kernel:
+        if self.use_kernel and layer.use_ep:
             # Kernel expects:
             # w13: (num_experts, 2, hidden_size, intermediate_size)
             # w2: (num_experts, intermediate_size, hidden_size)
             # Current format:
             # w13_weight: (num_experts, 2*intermediate_size, hidden_size)
             # w2_weight: (num_experts, hidden_size, intermediate_size)
+            num_experts = w13_weight.shape[0]
+            intermediate_size = w13_weight.shape[1] // 2
+            hidden_size = w13_weight.shape[2]
 
+            # Reshape and transpose w13_weight to (num_experts, 2, hidden_size, intermediate_size)
             w13_reshaped = w13_weight.reshape(num_experts, 2,
                                               intermediate_size, hidden_size)
+            w13_weight_transposed = jnp.transpose(w13_reshaped, (0, 1, 3, 2))
 
-            # Transpose non-constracting dim to right most dim
-            w13_weight_transposed = jnp.swapaxes(w13_reshaped, 2, 3)
-            w2_weight_transposed = jnp.swapaxes(w2_weight, 1, 2)
+            # Transpose w2_weight to (num_experts, intermediate_size, hidden_size)
+            w2_weight_transposed = jnp.transpose(w2_weight, (0, 2, 1))
 
             # Apply EP sharding
-            ep_sharding = NamedSharding(self.mesh, P("model"))
-
             w13_weight = jax.device_put(
-                w13_weight_transposed, Format(Layout((0, 1, 2, 3)),
-                                              ep_sharding))
-            w2_weight = jax.device_put(w2_weight_transposed,
-                                       Format(Layout((0, 1, 2)), ep_sharding))
+                w13_weight_transposed,
+                Format(Layout((0, 1, 2, 3)),
+                       NamedSharding(self.mesh, P("model", None, None, None))))
+            w2_weight = jax.device_put(
+                w2_weight_transposed,
+                Format(Layout((0, 1, 2)),
+                       NamedSharding(self.mesh, P("model", None, None))))
 
             if self.moe.has_bias:
                 w13_bias = w13_bias.reshape(num_experts, 2, intermediate_size)
+
+                # Apply EP sharding
                 w13_bias = jax.device_put(
-                    w13_bias, Format(Layout((0, 1, 2)), ep_sharding))
-                w2_bias = jax.device_put(w2_bias,
-                                         Format(Layout((0, 1)), ep_sharding))
-        else:
+                    w13_bias,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P("model", None, None))))
+                w2_bias = jax.device_put(
+                    w2_bias,
+                    Format(Layout((0, 1)),
+                           NamedSharding(self.mesh, P("model", None))))
 
+        else:
+            # Original logic for non-kernel path
             if layer.use_ep:
-                ep_sharding = NamedSharding(self.mesh, P("model"))
                 w13_weight = jax.device_put(
-                    w13_weight, Format(Layout((0, 1, 2)), ep_sharding))
+                    w13_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P("model", None, None))))
                 w2_weight = jax.device_put(
-                    w2_weight, Format(Layout((0, 1, 2)), ep_sharding))
+                    w2_weight,
+                    Format(Layout((0, 1, 2)),
+                           NamedSharding(self.mesh, P("model", None, None))))
 
                 if self.moe.has_bias:
                     w13_bias = jax.device_put(
-                        w13_bias, Format(Layout((0, 1)), ep_sharding))
+                        w13_bias,
+                        Format(Layout((0, 1)),
+                               NamedSharding(self.mesh, P("model", None))))
                     w2_bias = jax.device_put(
-                        w2_bias, Format(Layout((0, 1)), ep_sharding))
+                        w2_bias,
+                        Format(Layout((0, 1)),
+                               NamedSharding(self.mesh, P("model", None))))
 
             else:
+                intermediate_size = w13_weight.shape[1] // 2
+                assert intermediate_size == w2_weight.shape[-1]
                 output_sizes = [intermediate_size, intermediate_size]
                 n_shards = self.mesh.shape["model"]
                 assert intermediate_size % n_shards == 0
-
                 w13_weight = reorder_concatenated_tensor_for_sharding(
                     w13_weight, output_sizes, n_shards, dim=1)
                 w13_weight = jax.device_put(
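
For reference, a small self-contained sketch (toy sizes, not the package's code) of the layout change the kernel path above performs: w13 goes from (num_experts, 2*intermediate, hidden) to (num_experts, 2, hidden, intermediate), and w2 from (num_experts, hidden, intermediate) to (num_experts, intermediate, hidden). It also checks that the jnp.transpose calls in the new code match the jnp.swapaxes calls they replace.

import jax.numpy as jnp

E, I, H = 4, 6, 8  # num_experts, intermediate_size, hidden_size (toy values)
w13 = jnp.arange(E * 2 * I * H, dtype=jnp.float32).reshape(E, 2 * I, H)
w2 = jnp.arange(E * H * I, dtype=jnp.float32).reshape(E, H, I)

# New-code layout change.
w13_kernel = jnp.transpose(w13.reshape(E, 2, I, H), (0, 1, 3, 2))
w2_kernel = jnp.transpose(w2, (0, 2, 1))

assert w13_kernel.shape == (E, 2, H, I)
assert w2_kernel.shape == (E, I, H)
# Equivalent to the removed swapaxes-based version.
assert jnp.array_equal(w13_kernel, jnp.swapaxes(w13.reshape(E, 2, I, H), 2, 3))
assert jnp.array_equal(w2_kernel, jnp.swapaxes(w2, 1, 2))
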
@@ -326,40 +342,30 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
             raise NotImplementedError(
                 "Only softmax is supported for scoring_func")
 
-        x = jax_view(x)
-        w13_weight = jax_view(layer.w13_weight)
-        w2_weight = jax_view(layer.w2_weight)
-        w13_bias = w2_bias = None
-        if self.moe.has_bias:
-            w13_bias = jax_view(layer.w13_bias)
-            w2_bias = jax_view(layer.w2_bias)
-        gating_output = jax_view(router_logits)
-
         if self.use_kernel and layer.use_ep:
             output = fused_ep_moe(
                 mesh=self.mesh,
-                tokens=x,
-                w1=w13_weight,
-                w2=w2_weight,
-                b1=w13_bias,
-                b2=w2_bias,
-                gating_output=gating_output,
+                tokens=jax_view(x),
+                w1=jax_view(layer.w13_weight),
+                w2=jax_view(layer.w2_weight),
+                gating_output=jax_view(router_logits),
                 top_k=top_k,
                 ep_axis_name=self.ep_axis_name,
-                renormalize_topk_logits=renormalize,
-                act_fn=activation,
                 **self.block_size,
             )
         else:
-            output = fused_moe_func(
-                hidden_states=x,
-                w1=w13_weight,
-                w2=w2_weight,
-                w1_bias=w13_bias,
-                w2_bias=w2_bias,
-                gating_output=gating_output,
+            # Use the original implementation
+            output = fused_moe_func_padded(
+                jax_view(x),
+                jax_view(layer.w13_weight),
+                jax_view(layer.w2_weight),
+                jax_view(layer.w13_bias) if self.moe.has_bias else None,
+                jax_view(layer.w2_bias) if self.moe.has_bias else None,
+                jax_view(router_logits),
                 topk=top_k,
+                global_num_experts=global_num_experts,
                 renormalize=renormalize,
+                reduce_results=layer.reduce_results,
                 mesh=self.mesh,
                 use_ep=layer.use_ep,
                 activation=activation,
tpu_inference/layers/vllm/sharding.py
@@ -19,7 +19,6 @@ from vllm.lora.layers.base_linear import BaseLinearLayerWithLoRA
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 
-from tpu_inference import envs
 from tpu_inference.logger import init_logger
 
 P = PartitionSpec
@@ -212,7 +211,8 @@ def _shard_module_to_tpu(model: torch.nn.Module, mesh: Mesh) -> None:
 def _sharded_device_put(tensor: jax.Array, sharding) -> jax.Array:
     if isinstance(tensor, tuple):
         return tuple(_sharded_device_put(t, sharding) for t in tensor)
-    multihost_backend = envs.TPU_MULTIHOST_BACKEND
+    import os
+    multihost_backend = os.environ.get("TPU_MULTIHOST_BACKEND", "").lower()
     if multihost_backend != "ray":
         return jax.device_put(tensor, sharding)
 
tpu_inference/lora/torch_punica_tpu.py
@@ -239,6 +239,7 @@ class PunicaWrapperTPU(PunicaWrapperBase):
         lora_index_to_id: list[Optional[int]],
         max_loras: int,
         vocab_size: int,
+        extra_vocab_size: int,
     ):
         # Pad the prompt mapping to avoid running into recompiles on the TPU
         # TODO: Should this happen inside mapping internally? If so how can we
@@ -257,7 +258,7 @@ class PunicaWrapperTPU(PunicaWrapperBase):
             lora_index_to_id,
             max_loras,
             vocab_size,
-            0,  # extra_vocab_size
+            extra_vocab_size,
             "cpu",
         )
         with torchax.default_env():
tpu_inference/mock/vllm_config_utils.py (new file)
@@ -0,0 +1,28 @@
+from dataclasses import dataclass, field
+from typing import Any, List, Mapping
+
+
+@dataclass
+class ModelConfig():
+    max_model_len: int = 2048
+    max_prefill_len: int = 1024
+    prefill_batch_size: int = 1
+    decode_batch_size: int = 1
+    block_size: int = 16
+    num_layers: int = 32
+    num_kv_heads: int = 32
+    head_dim: int = 128
+    vocab_size: int = 32000
+    model: str = "llama3"
+    hf_config: str = ""
+    architectures: List[str] = field(default_factory=list)
+    override_generation_config: dict[str, Any] = field(default_factory=dict)
+    hf_overrides: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class VllmConfig():
+    additional_config: Mapping[str, Any] = field(default_factory=dict)
+    # Set default max_model_len to turn off warnings.
+    model_config: ModelConfig = field(
+        default_factory=lambda: ModelConfig(max_model_len=1024))
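
A hypothetical usage sketch of the new mock config dataclasses (assuming they are importable from tpu_inference.mock.vllm_config_utils, as the file path suggests), shown only to illustrate the defaults added above.

from tpu_inference.mock.vllm_config_utils import ModelConfig, VllmConfig

# VllmConfig defaults to a ModelConfig with max_model_len overridden to 1024.
cfg = VllmConfig()
assert cfg.model_config.max_model_len == 1024
assert cfg.model_config.block_size == 16

# Fields can be overridden per instance as with any dataclass.
custom = VllmConfig(model_config=ModelConfig(max_model_len=4096, num_layers=48))
assert custom.model_config.max_model_len == 4096
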