tpu-inference 0.12.0.dev20251222__py3-none-any.whl → 0.12.0.dev20251224__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in the public registry. It is provided for informational purposes only.
Files changed (47)
  1. tests/core/test_dp_scheduler.py +128 -71
  2. tests/e2e/test_data_parallel.py +176 -280
  3. tests/e2e/test_hybrid_kvcache.py +219 -0
  4. tests/e2e/test_speculative_decoding.py +26 -6
  5. tests/layers/jax/test_qwix.py +1 -1
  6. tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +36 -21
  7. tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +36 -21
  8. tests/layers/vllm/test_mxfp4.py +25 -10
  9. tests/layers/vllm/test_unquantized.py +61 -31
  10. tests/layers/vllm/utils.py +19 -4
  11. tests/models/common/test_model_loader.py +2 -2
  12. tests/models/jax/test_qwen2_5_vl.py +10 -11
  13. tests/runner/test_multimodal_manager.py +3 -3
  14. tests/runner/test_tpu_runner.py +67 -8
  15. tests/runner/test_tpu_runner_dp.py +66 -0
  16. tpu_inference/core/sched/dp_scheduler.py +65 -40
  17. tpu_inference/kernels/mla/v1/kernel.py +7 -26
  18. tpu_inference/layers/common/sharding.py +8 -3
  19. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +3 -3
  20. tpu_inference/layers/jax/attention/gpt_oss_attention.py +3 -3
  21. tpu_inference/layers/jax/attention/llama4_attention.py +3 -4
  22. tpu_inference/layers/jax/sample/sampling.py +1 -1
  23. tpu_inference/layers/vllm/fused_moe.py +51 -47
  24. tpu_inference/layers/vllm/quantization/common.py +14 -13
  25. tpu_inference/layers/vllm/quantization/mxfp4.py +21 -7
  26. tpu_inference/layers/vllm/quantization/unquantized.py +19 -7
  27. tpu_inference/layers/vllm/sharding.py +7 -4
  28. tpu_inference/models/common/model_loader.py +11 -14
  29. tpu_inference/models/jax/llama3.py +13 -10
  30. tpu_inference/models/jax/llama_guard_4.py +1 -1
  31. tpu_inference/models/jax/qwen2.py +3 -2
  32. tpu_inference/models/jax/qwen2_5_vl.py +4 -4
  33. tpu_inference/models/jax/utils/multi_modal_utils.py +4 -4
  34. tpu_inference/models/jax/utils/qwix/qwix_utils.py +3 -3
  35. tpu_inference/models/vllm/vllm_model_wrapper.py +5 -2
  36. tpu_inference/platforms/tpu_platform.py +7 -7
  37. tpu_inference/runner/compilation_manager.py +43 -33
  38. tpu_inference/runner/kv_cache_manager.py +1 -2
  39. tpu_inference/runner/multimodal_manager.py +1 -1
  40. tpu_inference/runner/tpu_runner.py +12 -9
  41. tpu_inference/utils.py +31 -30
  42. tpu_inference/worker/tpu_worker.py +5 -2
  43. {tpu_inference-0.12.0.dev20251222.dist-info → tpu_inference-0.12.0.dev20251224.dist-info}/METADATA +1 -1
  44. {tpu_inference-0.12.0.dev20251222.dist-info → tpu_inference-0.12.0.dev20251224.dist-info}/RECORD +47 -46
  45. {tpu_inference-0.12.0.dev20251222.dist-info → tpu_inference-0.12.0.dev20251224.dist-info}/WHEEL +0 -0
  46. {tpu_inference-0.12.0.dev20251222.dist-info → tpu_inference-0.12.0.dev20251224.dist-info}/licenses/LICENSE +0 -0
  47. {tpu_inference-0.12.0.dev20251222.dist-info → tpu_inference-0.12.0.dev20251224.dist-info}/top_level.txt +0 -0
@@ -25,9 +25,10 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                ReplicatedLinear,
                                                RowParallelLinear)
 
+from tpu_inference.layers.common.sharding import ShardingAxisName
 from tpu_inference.layers.vllm.linear_common import \
     get_model_matmul_fusion_assignment
-from tpu_inference.utils import TPU_SECOND_LAST_MINOR
+from tpu_inference.utils import TPU_SECOND_LAST_MINOR, get_mesh_shape_product
 
 # yapf: enable
 
@@ -49,14 +50,18 @@ class JaxCommonLinearConfig:
         self.input_sharding = None
         self.output_sharding = None
 
+        self.tp_size = get_mesh_shape_product(self.mesh,
+                                              ShardingAxisName.MLP_TENSOR)
+
         if isinstance(layer, RowParallelLinear):
-            self.weight_sharding = P(None, "model")
+            self.weight_sharding = P(None, ShardingAxisName.ATTN_HEAD)
             if self.enable_sp:
-                self.output_sharding = P("model", None)
+                self.output_sharding = P(ShardingAxisName.MLP_TENSOR, None)
         elif isinstance(layer, ColumnParallelLinear):
-            self.weight_sharding = P("model", None)
+            self.weight_sharding = P(ShardingAxisName.ATTN_HEAD, None)
+
             if self.enable_sp:
-                self.input_sharding = P("model", None)
+                self.input_sharding = P(ShardingAxisName.MLP_TENSOR, None)
 
         if isinstance(layer, MergedColumnParallelLinear) or isinstance(
                 layer, QKVParallelLinear):
@@ -75,18 +80,14 @@ class JaxCommonLinearConfig:
                            " bad performance.", type(layer))
 
         self.bias_sharding = P(self.weight_sharding[0])
-        if isinstance(self.weight_sharding[0], tuple):
-            self.n_shards = 1
-            for axis in self.weight_sharding[0]:
-                self.n_shards *= self.mesh.shape.get(axis, 1)
-        else:
-            self.n_shards = self.mesh.shape.get(self.weight_sharding[0], 1)
+        self.n_shards = get_mesh_shape_product(self.mesh,
+                                               self.weight_sharding[0])
 
     def get_input_sharding(self, x: torchax.tensor.Tensor):
         if self.enable_sp:
             token_num = x.shape[0]
             # NOTE(chengjiyao): make sure the sharded token_num is larger than TPU_SECOND_LAST_MINOR
-            if token_num // self.mesh.shape["model"] >= TPU_SECOND_LAST_MINOR:
+            if token_num // self.tp_size >= TPU_SECOND_LAST_MINOR:
                 return self.input_sharding
             else:
                 return None
@@ -96,7 +97,7 @@ class JaxCommonLinearConfig:
         if self.enable_sp:
             token_num = x.shape[0]
             # NOTE(chengjiyao): make sure the sharded token_num is larger than TPU_SECOND_LAST_MINOR
-            if token_num // self.mesh.shape["model"] >= TPU_SECOND_LAST_MINOR:
+            if token_num // self.tp_size >= TPU_SECOND_LAST_MINOR:
                 return self.output_sharding
             else:
                 return None
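Many hunks in this release replace ad-hoc lookups of the "model" mesh axis with a shared get_mesh_shape_product helper. A minimal sketch of what such a helper plausibly does, reconstructed from the inline logic removed above (the actual implementation in tpu_inference/utils.py may differ):

from jax.sharding import Mesh


def get_mesh_shape_product(mesh: Mesh, axis_names) -> int:
    """Product of mesh axis sizes for a single axis name or a tuple of names.

    Axes missing from the mesh count as size 1, mirroring the inline
    mesh.shape.get(axis, 1) logic this helper replaces.
    """
    if not isinstance(axis_names, tuple):
        axis_names = (axis_names, )
    size = 1
    for axis in axis_names:
        size *= mesh.shape.get(axis, 1)
    return size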
@@ -44,12 +44,14 @@ from tpu_inference.layers.common.quant_methods import (MXFP4,
                                                         get_tpu_quant_method)
 from tpu_inference.layers.common.quantization import (
     dequantize_tensor_from_mxfp4_packed, quantize_tensor)
+from tpu_inference.layers.common.sharding import ShardingAxisName
 from tpu_inference.layers.vllm.fused_moe import fused_moe_func
 from tpu_inference.layers.vllm.linear_common import \
     reorder_concatenated_tensor_for_sharding
 from tpu_inference.layers.vllm.quantization.common import JaxCommonConfig
 from tpu_inference.layers.vllm.quantization.unquantized import \
     VllmUnquantizedLinearMethod
+from tpu_inference.utils import get_mesh_shape_product
 
 REQUANTIZED_BLOCK_SIZE = 512
 
@@ -256,7 +258,8 @@ class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
             w2_bias = jnp.expand_dims(w2_bias, 1)
 
         if layer.use_ep:
-            ep_sharding = NamedSharding(self.mesh, P("model"))
+            ep_sharding = NamedSharding(self.mesh,
+                                        P(ShardingAxisName.EXPERT))
 
             w13_weight = jax.lax.with_sharding_constraint(
                 w13_weight, ep_sharding)
@@ -275,7 +278,8 @@ class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
 
         else:
             output_sizes = [intermediate_size, intermediate_size]
-            n_shards = self.mesh.shape["model"]
+            n_shards = get_mesh_shape_product(
+                self.mesh, ShardingAxisName.MLP_TENSOR)
             assert intermediate_size % n_shards == 0
 
             # Reorder w13 weights so that splitting w1 and w3 output
@@ -301,19 +305,29 @@ class VllmMxfp4MoEMethod(Mxfp4MoEMethod):
 
             w13_weight = jax.lax.with_sharding_constraint(
                 w13_weight,
-                NamedSharding(self.mesh, P(None, "model", None)))
+                NamedSharding(
+                    self.mesh,
+                    P(None, ShardingAxisName.MLP_TENSOR, None)))
             w2_weight = jax.lax.with_sharding_constraint(
                 w2_weight,
-                NamedSharding(self.mesh, P(None, None, "model")))
+                NamedSharding(
+                    self.mesh,
+                    P(None, None, ShardingAxisName.MLP_TENSOR)))
             w13_weight_scale = jax.lax.with_sharding_constraint(
                 w13_weight_scale,
-                NamedSharding(self.mesh, P(None, None, None, "model")))
+                NamedSharding(
+                    self.mesh,
+                    P(None, None, None, ShardingAxisName.MLP_TENSOR)))
             w2_weight_scale = jax.lax.with_sharding_constraint(
                 w2_weight_scale,
-                NamedSharding(self.mesh, P(None, "model", None, None)))
+                NamedSharding(
+                    self.mesh,
+                    P(None, ShardingAxisName.MLP_TENSOR, None, None)))
             w13_bias = jax.lax.with_sharding_constraint(
                 w13_bias,
-                NamedSharding(self.mesh, P(None, None, "model")))
+                NamedSharding(
+                    self.mesh,
+                    P(None, None, ShardingAxisName.MLP_TENSOR)))
             w2_bias = jax.lax.with_sharding_constraint(
                 w2_bias, NamedSharding(self.mesh, P(None, None, None)))
 
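The MXFP4 hunks above swap the hard-coded "model" axis for symbolic ShardingAxisName constants in jax.lax.with_sharding_constraint calls. A self-contained sketch of that pattern on a toy one-dimensional mesh; the axis name below is a placeholder, not the real ShardingAxisName value:

import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

MLP_TENSOR = "model"  # stand-in for ShardingAxisName.MLP_TENSOR

mesh = Mesh(np.array(jax.devices()), axis_names=(MLP_TENSOR, ))


@jax.jit
def constrain_w13(w13_weight):
    # Shard dim 1 (the fused w1/w3 output dim) across the tensor-parallel axis.
    return jax.lax.with_sharding_constraint(
        w13_weight, NamedSharding(mesh, P(None, MLP_TENSOR, None)))


w13 = constrain_w13(jnp.zeros((8, 128, 64), jnp.bfloat16))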
@@ -39,12 +39,14 @@ from tpu_inference import envs
 from tpu_inference.kernels.fused_moe.v1.kernel import fused_ep_moe
 from tpu_inference.layers.common.quant_methods import (UNQUANTIZED,
                                                         get_tpu_quant_method)
+from tpu_inference.layers.common.sharding import ShardingAxisName
 from tpu_inference.layers.vllm.fused_moe import fused_moe_func
 from tpu_inference.layers.vllm.linear_common import (
     reorder_concatenated_tensor_for_sharding,
     slice_sharded_tensor_for_concatenation, torch_to_jax_param)
 from tpu_inference.layers.vllm.quantization.common import (
     JaxCommonConfig, JaxCommonLinearConfig)
+from tpu_inference.utils import get_mesh_shape_product
 
 P = PartitionSpec
 logger = init_logger(__name__)
@@ -307,7 +309,8 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
             w2_bias = jnp.expand_dims(w2_bias, 1)
 
         if layer.use_ep:
-            ep_sharding = NamedSharding(self.mesh, P("model"))
+            ep_sharding = NamedSharding(self.mesh,
+                                        P(ShardingAxisName.EXPERT))
             w13_weight = jax.device_put(
                 w13_weight, Format(Layout((0, 1, 2)), ep_sharding))
             w2_weight = jax.device_put(
@@ -321,19 +324,26 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
 
         else:
             output_sizes = [intermediate_size, intermediate_size]
-            n_shards = self.mesh.shape["model"]
+            n_shards = get_mesh_shape_product(self.mesh,
+                                              ShardingAxisName.MLP_TENSOR)
             assert intermediate_size % n_shards == 0
 
             w13_weight = reorder_concatenated_tensor_for_sharding(
                 w13_weight, output_sizes, n_shards, dim=1)
             w13_weight = jax.device_put(
                 w13_weight,
-                Format(Layout((0, 1, 2)),
-                       NamedSharding(self.mesh, P(None, "model", None))))
+                Format(
+                    Layout((0, 1, 2)),
+                    NamedSharding(
+                        self.mesh,
+                        P(None, ShardingAxisName.MLP_TENSOR, None))))
             w2_weight = jax.device_put(
                 w2_weight,
-                Format(Layout((0, 1, 2)),
-                       NamedSharding(self.mesh, P(None, None, "model"))))
+                Format(
+                    Layout((0, 1, 2)),
+                    NamedSharding(
+                        self.mesh,
+                        P(None, None, ShardingAxisName.MLP_TENSOR))))
 
             if self.moe.has_bias:
                 w13_bias = reorder_concatenated_tensor_for_sharding(
@@ -343,7 +353,9 @@ class VllmUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
                     w13_bias,
                     Format(
                         Layout((0, 1, 2)),
-                        NamedSharding(self.mesh, P(None, None, "model"))))
+                        NamedSharding(
+                            self.mesh,
+                            P(None, None, ShardingAxisName.MLP_TENSOR))))
                 w2_bias = jax.device_put(
                     w2_bias,
                     Format(Layout((0, 1, 2)),
@@ -34,6 +34,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 
 from tpu_inference import envs
+from tpu_inference.layers.common.sharding import ShardingAxisName
 from tpu_inference.logger import init_logger
 
 P = PartitionSpec
@@ -123,7 +124,8 @@ def _shard_tensor_to_tpu_replicated(tensor: torch.Tensor,
 def _shard_vocab_parallel_embedding(layer: VocabParallelEmbedding,
                                     mesh: Mesh) -> None:
     weight = _convert_to_torchax_and_shard(
-        layer.weight, NamedSharding(mesh, P('model', None)))
+        layer.weight, NamedSharding(mesh, P(ShardingAxisName.MLP_TENSOR,
+                                            None)))
     layer.weight = Parameter(weight, requires_grad=False)
 
 
@@ -132,11 +134,12 @@ def _shard_lm_head(layer: ParallelLMHead, mesh: Mesh):
     # if that config is set, then we should not create new weights but reuse the
     # weight from VocabParallelEmbedding
     weight = _convert_to_torchax_and_shard(
-        layer.weight, NamedSharding(mesh, P('model', None)))
+        layer.weight, NamedSharding(mesh, P(ShardingAxisName.MLP_TENSOR,
+                                            None)))
     layer.weight = Parameter(weight, requires_grad=False)
     if layer.bias is not None:
-        bias = _convert_to_torchax_and_shard(layer.bias,
-                                             NamedSharding(mesh, P('model')))
+        bias = _convert_to_torchax_and_shard(
+            layer.bias, NamedSharding(mesh, P(ShardingAxisName.MLP_TENSOR)))
         layer.bias = Parameter(bias, requires_grad=False)
 
 
@@ -283,10 +283,9 @@ def get_flax_model(
 
     # Multi-modal support only
    # This function calculates the image token's embeddings by VIT
-    def run_get_multimodal_embeddings(graphdef, state, image_grid_thw,
-                                      **kwargs):
+    def run_embed_multimodal(graphdef, state, image_grid_thw, **kwargs):
         model = nnx.merge(graphdef, state)
-        return model.get_multimodal_embeddings(image_grid_thw, **kwargs)
+        return model.embed_multimodal(image_grid_thw, **kwargs)
 
     embed_sharding = NamedSharding(mesh, PartitionSpec(None))
     # This function will calculates the embeddings of input texts and then merge with the image embeddings
@@ -294,9 +293,9 @@ def get_flax_model(
         jax.jit,
         out_shardings=(embed_sharding),
     )
-    def run_get_input_embeddings(graphdef, state, *args, **kwargs):
+    def run_embed_input_ids(graphdef, state, *args, **kwargs):
         model = nnx.merge(graphdef, state)
-        return model.get_input_embeddings(*args, **kwargs)
+        return model.embed_input_ids(*args, **kwargs)
 
     # For models that want to work with EAGLE-3 speculative decoding
     @functools.partial(
@@ -312,10 +311,8 @@ def get_flax_model(
                                             None)
     model_fn = functools.partial(run_model, graphdef)
     compute_logits_fn = functools.partial(run_compute_logits, graphdef)
-    get_multimodal_embeddings_fn = functools.partial(
-        run_get_multimodal_embeddings, graphdef)
-    get_input_embeddings_fn = functools.partial(run_get_input_embeddings,
-                                                graphdef)
+    embed_multimodal_fn = functools.partial(run_embed_multimodal, graphdef)
+    embed_input_ids_fn = functools.partial(run_embed_input_ids, graphdef)
     lora_manager, model = None, None
     combine_hidden_states_fn = functools.partial(combine_hidden_states,
                                                  graphdef)
@@ -326,8 +323,8 @@ def get_flax_model(
 
     multimodal_fns = {
         "precompile_vision_encoder_fn": precompile_vision_encoder_fn,
-        "get_multimodal_embeddings_fn": get_multimodal_embeddings_fn,
-        "get_input_embeddings_fn": get_input_embeddings_fn,
+        "embed_multimodal_fn": embed_multimodal_fn,
+        "embed_input_ids_fn": embed_input_ids_fn,
         "get_mrope_input_positions_fn": get_mrope_input_positions_fn,
     }
 
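The run_embed_multimodal / run_embed_input_ids wrappers above follow the standard flax.nnx split/merge pattern: the module is split into a static graphdef plus an array-only state, the jitted wrapper re-merges them, and graphdef is pre-bound with functools.partial so callers pass only state and inputs. A minimal standalone sketch of that pattern with a toy module (the module and shapes are illustrative, not part of tpu-inference):

import functools

import jax
import jax.numpy as jnp
from flax import nnx


class ToyModel(nnx.Module):

    def __init__(self, rngs: nnx.Rngs):
        self.embed = nnx.Embed(num_embeddings=128, features=16, rngs=rngs)

    def embed_input_ids(self, input_ids):
        return self.embed(input_ids)


model = ToyModel(nnx.Rngs(0))
graphdef, state = nnx.split(model)


@jax.jit
def run_embed_input_ids(graphdef, state, input_ids):
    model = nnx.merge(graphdef, state)
    return model.embed_input_ids(input_ids)


# Bind the static graphdef once; callers only supply state plus inputs.
embed_input_ids_fn = functools.partial(run_embed_input_ids, graphdef)
embeddings = embed_input_ids_fn(state, jnp.array([1, 2, 3]))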
@@ -485,14 +482,14 @@ def register_model(arch: str, model: Any) -> None:
         )
 
     # Same as `forward`, this is a dummy method to satisfy vLLM's type checks.
-    def unimplemented_get_input_embeddings(
+    def unimplemented_embed_input_ids(
         self,
         input_ids: "torch.Tensor",
         positions: "torch.Tensor",
         inputs_embeds: Optional["torch.Tensor"] = None,
     ) -> "torch.Tensor":
         raise NotImplementedError(
-            "This is a JAX model and does not implement the PyTorch get_input_embeddings method."
+            "This is a JAX model and does not implement the PyTorch embed_input_ids method."
         )
 
     # We need a custom __init__ that only calls torch.nn.Module's init,
@@ -508,7 +505,7 @@ def register_model(arch: str, model: Any) -> None:
         {
             "__init__": wrapper_init,
             "forward": unimplemented_forward,
-            "get_input_embeddings": unimplemented_get_input_embeddings,
+            "embed_input_ids": unimplemented_embed_input_ids,
             # Prevent vLLM from trying to load weights into this dummy class.
             "load_weights": lambda self, *args, **kwargs: None,
         })
@@ -26,6 +26,7 @@ from tpu_inference import utils
 from tpu_inference.distributed.jax_parallel_state import get_pp_group
 from tpu_inference.layers.common.attention_interface import attention
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.layers.common.quantization import quantize_kv
 from tpu_inference.layers.common.sharding import ShardingAxisName
 from tpu_inference.layers.jax.pp_utils import PPMissingLayer, make_layers
 from tpu_inference.layers.jax.rope_interface import apply_rope
@@ -34,6 +35,7 @@ from tpu_inference.models.jax.jax_intermediate_tensor import \
     JaxIntermediateTensors
 from tpu_inference.models.jax.utils.weight_utils import (get_default_maps,
                                                          load_hf_weights)
+from tpu_inference.utils import get_mesh_shape_product
 
 logger = init_logger(__name__)
 
@@ -98,7 +100,8 @@ class LlamaAttention(nnx.Module):
                                             self.hidden_size // self.num_heads)
         self.head_dim = utils.get_padded_head_dim(self.head_dim_original)
 
-        sharding_size = mesh.shape["model"] * mesh.shape.get("attn_dp", 1)
+        sharding_size = get_mesh_shape_product(mesh,
+                                               ShardingAxisName.MLP_TENSOR)
         self.num_heads = utils.get_padded_num_heads(self.num_heads,
                                                     sharding_size)
         self.num_kv_heads = utils.get_padded_num_heads(self.num_kv_heads,
@@ -171,8 +174,8 @@ class LlamaAttention(nnx.Module):
             # q_scale = self._q_scale
             k_scale = self._k_scale
             v_scale = self._v_scale
-            k, v = utils.quantize_kv(k, v, self.kv_cache_quantized_dtype,
-                                     k_scale, v_scale)
+            k, v = quantize_kv(self.kv_cache_quantized_dtype, k, v, k_scale,
+                               v_scale)
         new_kv_cache, outputs = attention(
             kv_cache,
             q,
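llama3.py and qwen2.py now call quantize_kv from layers.common.quantization with the dtype first, instead of utils.quantize_kv with the dtype third. A rough sketch of what a helper with that signature could look like; the real implementation (clipping, per-head scales, etc.) may differ:

def quantize_kv(kv_quant_dtype, k, v, k_scale, v_scale):
    """Scale K/V by their per-tensor scales and cast to the KV-cache dtype."""
    k_q = (k / k_scale).astype(kv_quant_dtype)
    v_q = (v / v_scale).astype(kv_quant_dtype)
    return k_q, v_q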
@@ -369,13 +372,13 @@ class LlamaForCausalLM(nnx.Module):
         kv_caches: List[jax.Array],
         input_ids: jax.Array,
         attention_metadata: AttentionMetadata,
-        _input_embeds,
-        _input_positions,
-        _layer_name_to_kv_cache,
-        _lora_metadata,
-        intermediate_tensors: JaxIntermediateTensors,
-        _is_first_rank: bool,
-        _is_last_rank: bool,
+        _input_embeds=None,
+        _input_positions=None,
+        _layer_name_to_kv_cache=None,
+        _lora_metadata=None,
+        intermediate_tensors: JaxIntermediateTensors | None = None,
+        _is_first_rank: bool | None = None,
+        _is_last_rank: bool | None = None,
         *args,
     ) -> Tuple[List[jax.Array], jax.Array, List[jax.Array]] | Tuple[
             List[jax.Array], JaxIntermediateTensors]:
@@ -256,7 +256,7 @@ class LlamaGuard4ForCausalLM(nnx.Module):
                               self.lm_head.input_embedding_table_DV.value)
         return logits_TV
 
-    def get_input_embeddings(
+    def embed_input_ids(
         self,
         input_ids: jax.Array,
         multimodal_embeddings: Optional[List[jax.Array]] = None
@@ -24,6 +24,7 @@ from vllm.config import VllmConfig
 from tpu_inference import utils
 from tpu_inference.layers.common.attention_interface import attention
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.layers.common.quantization import quantize_kv
 from tpu_inference.layers.jax.rope_interface import apply_rope
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.utils.weight_utils import (get_default_maps,
@@ -166,8 +167,8 @@ class Qwen2Attention(nnx.Module):
             # q_scale = self._q_scale
             k_scale = self._k_scale
             v_scale = self._v_scale
-            k, v = utils.quantize_kv(k, v, self.kv_cache_quantized_dtype,
-                                     k_scale, v_scale)
+            k, v = quantize_kv(self.kv_cache_quantized_dtype, k, v, k_scale,
+                               v_scale)
         new_kv_cache, outputs = attention(
             kv_cache,
             q,
@@ -1010,9 +1010,9 @@ class Qwen2_5_VLForConditionalGeneration(nnx.Module):
         split_indices = np.cumsum(sizes)[:-1]
         return tuple(jnp.split(image_embeds, split_indices))
 
-    def get_multimodal_embeddings(self, image_grid_thw: tuple[tuple[int, int,
-                                                              int], ...],
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def embed_multimodal(self, image_grid_thw: tuple[tuple[int, int, int],
+                                                     ...],
+                         **kwargs: object) -> MultiModalEmbeddings:
 
         mm_input_by_modality = self._parse_and_validate_multimodal_inputs(
             image_grid_thw, **kwargs)
@@ -1036,7 +1036,7 @@ class Qwen2_5_VLForConditionalGeneration(nnx.Module):
 
         return multimodal_embeddings
 
-    def get_input_embeddings(
+    def embed_input_ids(
             self, input_ids: jax.Array,
             multimodal_embeddings: Optional[jax.Array]) -> jax.Array:
 
@@ -43,25 +43,25 @@ def sanity_check_mm_encoder_outputs(
 ) -> None:
     """
     Perform sanity checks for the result of
-    [`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`][].
+    [`vllm.model_executor.models.SupportsMultiModal.embed_multimodal`][].
     """
     assert isinstance(mm_embeddings, (list, tuple, jax.Array)), (
         "Expected multimodal embeddings to be a list/tuple of 2D tensors, "
         f"or a single 3D tensor, but got {type(mm_embeddings)} "
         "instead. This is most likely due to incorrect implementation "
-        "of the model's `get_multimodal_embeddings` method.")
+        "of the model's `embed_multimodal` method.")
 
     assert len(mm_embeddings) == expected_num_items, (
         "Expected number of multimodal embeddings to match number of "
         f"input items: {expected_num_items}, but got {len(mm_embeddings)=} "
         "instead. This is most likely due to incorrect implementation "
-        "of the model's `get_multimodal_embeddings` method.")
+        "of the model's `embed_multimodal` method.")
 
     assert all(e.ndim == 2 for e in mm_embeddings), (
         "Expected multimodal embeddings to be a sequence of 2D tensors, "
         f"but got tensors with shapes {[e.shape for e in mm_embeddings]} "
         "instead. This is most likely due to incorrect implementation "
-        "of the model's `get_multimodal_embeddings` method.")
+        "of the model's `embed_multimodal` method.")
 
 
 def flatten_embeddings(embeddings: NestedTensors) -> jax.Array:
@@ -35,7 +35,7 @@ DEFAULT_NUM_TOKENS_FOR_MODEL_INPUTS = 512
 DEFAULT_MAX_NUM_SEQS_FOR_MODEL_INPUTS = 256
 DEFAULT_MAX_NUM_BLOCKS_PER_REQ = 16
 
-DEFAULT_DEEPSEEK_FP8_CONFIG = {
+DEFAULT_DEEPSEEK_FP4_MLP_MOE_FP8_ATTN_CONFIG = {
     "qwix": {
         "use_abstract_model":
         True,
@@ -452,7 +452,7 @@ def get_default_qwix_quantization_config(
     # NOTE (jacobplatin): we'll default to mixed FP8 (attention) + FP4 (MoE experts)
     # for DeepSeek
     if model_type == "deepseek_v3" and quant_method == "fp8":
-        config = copy.deepcopy(DEFAULT_DEEPSEEK_FP8_CONFIG)
+        config = copy.deepcopy(DEFAULT_DEEPSEEK_FP4_MLP_MOE_FP8_ATTN_CONFIG)
 
         # Dynamically fetch block size from HF config if available
         # Config fmt: 'weight_block_size': [1, 512] -> we want the 2nd dim for tile_size
@@ -462,7 +462,7 @@ def get_default_qwix_quantization_config(
             block_size = hf_quant_config["weight_block_size"]
             if isinstance(block_size, (list, tuple)) and len(block_size) == 2:
                 assert block_size[
-                    0] == 1, f"Expected first dimension to be 1 (unchanneled), but got {block_size[0]}!"
+                    0] == 1, f"Expected first dimension to be 1 (unchanneled), but got {block_size[0]}! If you are trying to run quantized DeepSeek, we currently only support 1D-subchannel quantization and those models can be found here: https://huggingface.co/collections/jrplatin/deepseek-r1-1d-subchannel"
                 tile_size = block_size[1]
                 assert tile_size > 1, f"Expected tile_size > 1 for DeepSeek, but got {tile_size}"
                 logger.info(
@@ -37,6 +37,7 @@ from vllm.model_executor.models import supports_lora, supports_multimodal
 from vllm.sequence import IntermediateTensors
 
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.layers.common.sharding import ShardingAxisName
 from tpu_inference.layers.vllm.quantization import get_tpu_quantization_config
 from tpu_inference.layers.vllm.sharding import shard_model_to_tpu
 from tpu_inference.logger import init_logger
@@ -234,8 +235,10 @@ class VllmModelWrapper:
 
         @functools.partial(
             jax.jit,
-            out_shardings=(NamedSharding(self.mesh,
-                                         PartitionSpec("data", "model"))),
+            out_shardings=(NamedSharding(
+                self.mesh,
+                PartitionSpec(ShardingAxisName.MLP_DATA,
+                              ShardingAxisName.MLP_TENSOR))),
         )
         def compute_logits_func(
             params_and_buffers: Any,
@@ -168,12 +168,12 @@ class TpuPlatform(Platform):
         multihost_backend = envs.TPU_MULTIHOST_BACKEND
         if not multihost_backend:  # Single host
             if parallel_config.pipeline_parallel_size == 1:
-                logger.info("Force using UniProcExecutor for JAX on \
-                    single host without pipeline parallelism.")
+                logger.info("Force using UniProcExecutor for JAX on "
+                            "single host without pipeline parallelism.")
                 parallel_config.distributed_executor_backend = "uni"
             else:
-                logger.info("Force using MultiprocExecutor for JAX on \
-                    single host with pipeline parallelism.")
+                logger.info("Force using MultiprocExecutor for JAX on "
+                            "single host with pipeline parallelism.")
                 parallel_config.distributed_executor_backend = "mp"
         elif multihost_backend == "ray":
             from tpu_inference.executors.ray_distributed_executor import \
@@ -189,9 +189,9 @@ class TpuPlatform(Platform):
 
         if scheduler_config.is_multimodal_model and not \
             scheduler_config.disable_chunked_mm_input:
-            logger.warning("TPU does not support running Multimodal models"\
-                " without setting `--disable_chunked_mm_input`. " \
-                "Forcing --disable_chunked_mm_input.")
+            logger.warning("TPU does not support running Multimodal models"
+                           " without setting `--disable_chunked_mm_input`. "
+                           "Forcing --disable_chunked_mm_input.")
             scheduler_config.disable_chunked_mm_input = True
 
         kv_transfer_config = vllm_config.kv_transfer_config
@@ -127,7 +127,7 @@ class CompilationManager:
 
         self._run_compilation(
             "input_embeddings_merger",
-            self.runner.get_input_embeddings_fn,
+            self.runner.embed_input_ids_fn,
             self.runner.state,
             dummy_input_ids,
             dummy_multimodal_embeddings,
@@ -136,7 +136,7 @@ class CompilationManager:
 
         self._run_compilation(
             "input_embeddings_merger_text_only",
-            self.runner.get_input_embeddings_fn,
+            self.runner.embed_input_ids_fn,
             self.runner.state,
             dummy_input_ids,
             None,
@@ -495,35 +495,37 @@
             logits = self._create_dummy_tensor((num_reqs, hsize), jnp.bfloat16,
                                                logits_sharding)
             for do_sampling in (True, False):
-                if do_sampling:
-                    temperature = np.full((num_reqs, ), 0.7, dtype=np.float32)
-                    top_k = np.full((num_reqs, ), 20, dtype=np.int32)
-                    top_p = np.full((num_reqs, ), 0.8, dtype=np.float32)
-                    (temperature, top_k,
-                     top_p) = device_array(self.runner.mesh,
-                                           (temperature, top_k, top_p),
-                                           sharding=sampling_metadata_sharding)
-                else:
-                    temperature = None
-                    top_k = None
-                    top_p = None
-
-                sampling_metadata = TPUSupportedSamplingMetadata(
-                    temperature=temperature,
-                    top_k=top_k,
-                    top_p=top_p,
-                    do_sampling=do_sampling,
-                )
-                self._run_compilation(
-                    f"worker{self.runner.rank} sample",
-                    sample,
-                    self.runner.rng_params_for_sampling,
-                    self.runner.mesh,
-                    logits,
-                    sampling_metadata,
-                    num_reqs=num_reqs,
-                    do_sampling=do_sampling,
-                )
+                for logprobs in (True, False):
+                    if do_sampling:
+                        temperature = np.full((num_reqs, ),
+                                              0.7,
+                                              dtype=np.float32)
+                        top_k = np.full((num_reqs, ), 20, dtype=np.int32)
+                        top_p = np.full((num_reqs, ), 0.8, dtype=np.float32)
+                        (temperature, top_k, top_p) = device_array(
+                            self.runner.mesh, (temperature, top_k, top_p),
+                            sharding=sampling_metadata_sharding)
+                    else:
+                        temperature = None
+                        top_k = None
+                        top_p = None
+
+                    sampling_metadata = TPUSupportedSamplingMetadata(
+                        temperature=temperature,
+                        top_k=top_k,
+                        top_p=top_p,
+                        do_sampling=do_sampling,
+                        logprobs=logprobs)
+                    self._run_compilation(
+                        f"worker{self.runner.rank} sample",
+                        sample,
+                        self.runner.rng_params_for_sampling,
+                        self.runner.mesh,
+                        logits,
+                        sampling_metadata,
+                        num_reqs=num_reqs,
+                        do_sampling=do_sampling,
+                    )
 
         self._sampling_precompiled = True
 
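The reworked loop above precompiles the sample function for every (do_sampling, logprobs) combination so no JIT compilation happens on the serving path. A standalone sketch of the same idea using plain jax.jit AOT lowering; the sample body and shapes here are toy stand-ins, and the runner's _run_compilation helper may work differently:

import itertools

import jax
import jax.numpy as jnp


def sample(logits, do_sampling: bool, logprobs: bool):
    # Toy stand-in for the real sample(): greedy tokens, optionally with
    # log-probabilities. do_sampling only selects which variant is compiled.
    out = {"token_ids": jnp.argmax(logits, axis=-1)}
    if logprobs:
        out["logprobs"] = jax.nn.log_softmax(logits.astype(jnp.float32),
                                             axis=-1)
    return out


logits = jnp.zeros((8, 32000), jnp.bfloat16)
compiled = {}
for do_sampling, logprobs in itertools.product((True, False), repeat=2):
    # The booleans are static, so each combination becomes its own executable.
    compiled[(do_sampling, logprobs)] = jax.jit(
        sample, static_argnums=(1, 2)).lower(logits, do_sampling,
                                             logprobs).compile()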
@@ -555,8 +557,16 @@
         logger.info("Compiling gather_logprobs with different input shapes.")
         hsize = self.runner.model_config.get_vocab_size()
         for num_reqs in self.runner.num_reqs_paddings:
-            logits = self._create_dummy_tensor((num_reqs, hsize), jnp.bfloat16)
-            token_ids = self._create_dummy_tensor((num_reqs, ), jnp.int32)
+            logits_sharding = NamedSharding(
+                self.runner.mesh,
+                PartitionSpec(ShardingAxisName.MLP_DATA,
+                              ShardingAxisName.MLP_TENSOR))
+            token_ids_sharding = NamedSharding(
+                self.runner.mesh, PartitionSpec(ShardingAxisName.MLP_DATA, ))
+            logits = self._create_dummy_tensor((num_reqs, hsize), jnp.bfloat16,
+                                               logits_sharding)
+            token_ids = self._create_dummy_tensor((num_reqs, ), jnp.int32,
+                                                  token_ids_sharding)
             self._run_compilation(
                 f"worker{self.runner.rank} gather_logprobs",
                 self.runner._compute_and_gather_logprobs,