tpu-inference 0.0.1rc1__py3-none-any.whl → 0.11.1.dev202511130813__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of tpu-inference might be problematic.
- tests/kernels/fused_moe_v1_test.py +34 -303
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +2 -2
- tests/lora/test_layers.py +6 -0
- tests/lora/utils.py +8 -0
- tests/test_utils.py +16 -24
- tpu_inference/__init__.py +3 -22
- tpu_inference/core/core_tpu.py +9 -17
- tpu_inference/core/disagg_utils.py +8 -6
- tpu_inference/distributed/tpu_connector.py +4 -3
- tpu_inference/distributed/utils.py +2 -3
- tpu_inference/envs.py +8 -61
- tpu_inference/executors/ray_distributed_executor.py +11 -31
- tpu_inference/kernels/fused_moe/v1/kernel.py +110 -641
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +54 -77
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +143 -287
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +0 -7
- tpu_inference/layers/jax/attention/attention.py +1 -1
- tpu_inference/layers/{common → jax}/attention_interface.py +2 -8
- tpu_inference/layers/jax/sample/rejection_sampler.py +1 -1
- tpu_inference/layers/jax/sample/sampling.py +2 -2
- tpu_inference/layers/{common → jax}/sharding.py +5 -5
- tpu_inference/layers/vllm/attention.py +1 -1
- tpu_inference/layers/vllm/fused_moe.py +208 -170
- tpu_inference/layers/vllm/quantization/__init__.py +3 -7
- tpu_inference/layers/vllm/quantization/awq.py +3 -4
- tpu_inference/layers/vllm/quantization/common.py +1 -6
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +2 -4
- tpu_inference/layers/vllm/quantization/unquantized.py +67 -62
- tpu_inference/layers/vllm/sharding.py +2 -2
- tpu_inference/lora/torch_punica_tpu.py +2 -1
- tpu_inference/mock/__init__.py +0 -0
- tpu_inference/mock/vllm_config_utils.py +28 -0
- tpu_inference/mock/vllm_envs.py +1219 -0
- tpu_inference/mock/vllm_logger.py +212 -0
- tpu_inference/mock/vllm_logging_utils.py +15 -0
- tpu_inference/models/common/model_loader.py +12 -46
- tpu_inference/models/jax/llama3.py +3 -4
- tpu_inference/models/jax/llama_eagle3.py +5 -8
- tpu_inference/models/jax/phi3.py +376 -0
- tpu_inference/models/jax/qwen2.py +2 -3
- tpu_inference/models/jax/qwen2_5_vl.py +50 -165
- tpu_inference/models/jax/qwen3.py +2 -3
- tpu_inference/models/jax/utils/quantization/quantization_utils.py +6 -3
- tpu_inference/models/jax/utils/weight_utils.py +143 -198
- tpu_inference/models/vllm/vllm_model_wrapper.py +14 -32
- tpu_inference/platforms/tpu_platform.py +34 -47
- tpu_inference/runner/compilation_manager.py +60 -145
- tpu_inference/runner/kv_cache.py +2 -2
- tpu_inference/runner/kv_cache_manager.py +18 -17
- tpu_inference/runner/persistent_batch_manager.py +2 -40
- tpu_inference/runner/structured_decoding_manager.py +3 -2
- tpu_inference/runner/tpu_runner.py +135 -283
- tpu_inference/runner/utils.py +2 -2
- tpu_inference/spec_decode/jax/eagle3.py +21 -71
- tpu_inference/tpu_info.py +3 -4
- tpu_inference/utils.py +15 -38
- tpu_inference/worker/tpu_worker.py +26 -163
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/METADATA +3 -4
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/RECORD +63 -61
- tests/test_envs.py +0 -203
- tpu_inference/layers/common/quant_methods.py +0 -8
- tpu_inference/layers/vllm/quantization/mxfp4.py +0 -331
- tpu_inference/models/jax/llama_guard_4.py +0 -361
- /tpu_inference/layers/{common → jax}/binary_search.py +0 -0
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/WHEEL +0 -0
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/top_level.txt +0 -0
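The `{common → jax}` entries above are module moves: `attention_interface.py`, `sharding.py`, and `binary_search.py` now live under `tpu_inference/layers/jax/`, while `attention_metadata` remains under `layers/common`. A minimal, illustrative migration sketch for downstream imports (the paths are taken from the hunks below; this snippet is not part of the diff itself):

# Old (0.0.1rc1):
#   from tpu_inference.layers.common.attention_interface import attention
# New (0.11.1.dev202511130813):
from tpu_inference.layers.jax.attention_interface import attention
from tpu_inference.layers.common.attention_metadata import AttentionMetadata  # unchanged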
tpu_inference/models/jax/phi3.py (new)
@@ -0,0 +1,376 @@
+from typing import List, Optional, Tuple
+
+import jax
+import jax.numpy as jnp
+from flax import nnx
+from jax.sharding import Mesh
+from transformers import Phi3Config, modeling_flax_utils
+from vllm.config import VllmConfig
+
+from tpu_inference import utils
+from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.layers.jax.attention_interface import attention
+from tpu_inference.layers.jax.rope_interface import apply_longrope, apply_rope
+from tpu_inference.logger import init_logger
+from tpu_inference.models.jax.utils.weight_utils import (MetadataMap,
+                                                         load_hf_weights)
+
+logger = init_logger(__name__)
+
+init_fn = nnx.initializers.uniform()
+
+
+class Phi3MLP(nnx.Module):
+
+    def __init__(self, config: Phi3Config, dtype: jnp.dtype, rng: nnx.Rngs):
+        hidden_size = config.hidden_size
+        intermediate_size = config.intermediate_size
+        act = config.hidden_act
+
+        self.gate_up_proj = nnx.Linear(
+            hidden_size,
+            2 * intermediate_size,
+            use_bias=False,
+            param_dtype=dtype,
+            kernel_init=nnx.with_partitioning(init_fn, (None, "model")),
+            rngs=rng,
+        )
+        self.down_proj = nnx.Linear(
+            intermediate_size,
+            hidden_size,
+            use_bias=False,
+            param_dtype=dtype,
+            kernel_init=nnx.with_partitioning(init_fn, ("model", None)),
+            rngs=rng,
+        )
+        self.act_fn = modeling_flax_utils.ACT2FN[act]
+
+    def __call__(self, x: jax.Array) -> jax.Array:
+        gate_up = self.gate_up_proj(x)
+        gate, up = jnp.split(gate_up, 2, axis=-1)
+        fuse = up * self.act_fn(gate)
+        result = self.down_proj(fuse)
+        return result
+
+
+class Phi3Attention(nnx.Module):
+
+    def __init__(self, config: Phi3Config, dtype: jnp.dtype, rng: nnx.Rngs,
+                 mesh: Mesh, kv_cache_dtype: str):
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.num_kv_heads = config.num_key_value_heads
+        self.rope_theta = config.rope_theta
+        self.rope_scaling = config.rope_scaling
+        self.original_max_position_embeddings = config.original_max_position_embeddings
+        self.max_position_embeddings = config.max_position_embeddings
+
+        self.head_dim_original = getattr(config, "head_dim",
+                                         self.hidden_size // self.num_heads)
+        self.head_dim = utils.get_padded_head_dim(self.head_dim_original)
+
+        sharding_size = mesh.shape["model"]
+        self.num_heads = utils.get_padded_num_heads(self.num_heads,
+                                                    sharding_size)
+        self.num_kv_heads = utils.get_padded_num_heads(self.num_kv_heads,
+                                                       sharding_size)
+
+        self.mesh = mesh
+
+        self.qkv_proj = nnx.Einsum(
+            "TD,DNH->TNH",
+            (self.hidden_size, self.num_heads + self.num_kv_heads * 2,
+             self.head_dim),
+            param_dtype=dtype,
+            kernel_init=nnx.with_partitioning(init_fn, (None, "model", None)),
+            rngs=rng,
+        )
+        self.o_proj = nnx.Einsum(
+            "TNH,NHD->TD",
+            (self.num_heads, self.head_dim, self.hidden_size),
+            param_dtype=dtype,
+            kernel_init=nnx.with_partitioning(init_fn, ("model", None, None)),
+            rngs=rng,
+        )
+
+        self._q_scale = 1.0
+        self._k_scale = 1.0
+        self._v_scale = 1.0
+        self.kv_cache_quantized_dtype = None
+        if kv_cache_dtype != "auto":
+            self.kv_cache_quantized_dtype = utils.get_jax_dtype_from_str_dtype(
+                kv_cache_dtype)
+
+    def __call__(
+        self,
+        kv_cache: Optional[jax.Array],
+        x: jax.Array,
+        attention_metadata: AttentionMetadata,
+    ) -> Tuple[jax.Array, jax.Array]:
+        md = attention_metadata
+        # qkv: (T, N + K * 2, H)
+        qkv = self.qkv_proj(x)
+        q, k, v = jnp.split(
+            qkv, [self.num_heads, self.num_heads + self.num_kv_heads], axis=1)
+        if self.rope_scaling:
+            q = apply_longrope(q, md.input_positions, self.head_dim_original,
+                               self.rope_scaling,
+                               self.original_max_position_embeddings,
+                               self.max_position_embeddings, self.rope_theta)
+            k = apply_longrope(k, md.input_positions, self.head_dim_original,
+                               self.rope_scaling,
+                               self.original_max_position_embeddings,
+                               self.max_position_embeddings, self.rope_theta)
+        else:
+            q = apply_rope(q, md.input_positions, self.head_dim_original,
+                           self.rope_theta, self.rope_scaling)
+            k = apply_rope(k, md.input_positions, self.head_dim_original,
+                           self.rope_theta, self.rope_scaling)
+        # o: (T, N, H)
+        q_scale = k_scale = v_scale = None
+        if self.kv_cache_quantized_dtype:
+            # TODO(kyuyeunk/jacobplatin): Enable w8a8 when VREG spill issue is resolved.
+            # q_scale = self._q_scale
+            k_scale = self._k_scale
+            v_scale = self._v_scale
+            k, v = utils.quantize_kv(k, v, self.kv_cache_quantized_dtype,
+                                     k_scale, v_scale)
+        new_kv_cache, outputs = attention(
+            kv_cache,
+            q,
+            k,
+            v,
+            attention_metadata,
+            self.mesh,
+            self.head_dim_original,
+            q_scale=q_scale,
+            k_scale=k_scale,
+            v_scale=v_scale,
+        )
+        # (T, D)
+        o = self.o_proj(outputs)
+        return new_kv_cache, o
+
+
+class Phi3DecoderLayer(nnx.Module):
+
+    def __init__(self, config: Phi3Config, dtype: jnp.dtype, rng: nnx.Rngs,
+                 mesh: Mesh, kv_cache_dtype: str):
+        rms_norm_eps = config.rms_norm_eps
+        hidden_size = config.hidden_size
+
+        self.input_layernorm = nnx.RMSNorm(
+            hidden_size,
+            epsilon=rms_norm_eps,
+            param_dtype=dtype,
+            scale_init=nnx.with_partitioning(init_fn, (None, )),
+            rngs=rng,
+        )
+        self.self_attn = Phi3Attention(config=config,
+                                       dtype=dtype,
+                                       rng=rng,
+                                       mesh=mesh,
+                                       kv_cache_dtype=kv_cache_dtype)
+        self.post_attention_layernorm = nnx.RMSNorm(
+            hidden_size,
+            epsilon=rms_norm_eps,
+            param_dtype=dtype,
+            scale_init=nnx.with_partitioning(init_fn, (None, )),
+            rngs=rng,
+        )
+        self.mlp = Phi3MLP(
+            config=config,
+            dtype=dtype,
+            rng=rng,
+        )
+
+    def __call__(
+        self,
+        kv_cache: jax.Array,
+        x: jax.Array,
+        attention_metadata: AttentionMetadata,
+    ) -> Tuple[jax.Array, jax.Array]:
+        hidden_states = self.input_layernorm(x)
+        kv_cache, attn_output = self.self_attn(
+            kv_cache,
+            hidden_states,
+            attention_metadata,
+        )
+        attn_output += x
+
+        residual = attn_output
+        attn_output = self.post_attention_layernorm(attn_output)
+        outputs = self.mlp(attn_output)
+        outputs = residual + outputs
+        return kv_cache, outputs
+
+
+class Phi3Model(nnx.Module):
+
+    def __init__(self, vllm_config: VllmConfig, rng: nnx.Rngs,
+                 mesh: Mesh) -> None:
+        model_config = vllm_config.model_config
+        hf_config = model_config.hf_config
+        vocab_size = model_config.get_vocab_size()
+        dtype = model_config.dtype
+        rms_norm_eps = hf_config.rms_norm_eps
+        hidden_size = hf_config.hidden_size
+
+        self.embed = nnx.Embed(
+            num_embeddings=vocab_size,
+            features=hidden_size,
+            param_dtype=dtype,
+            embedding_init=nnx.with_partitioning(init_fn, ("model", None)),
+            rngs=rng,
+        )
+        self.layers = [
+            Phi3DecoderLayer(
+                config=hf_config,
+                dtype=dtype,
+                rng=rng,
+                mesh=mesh,
+                # TODO (jacobplatin): we should refactor this to pass a dtype (or config) directly
+                kv_cache_dtype=vllm_config.cache_config.cache_dtype)
+            for _ in range(hf_config.num_hidden_layers)
+        ]
+        self.norm = nnx.RMSNorm(
+            hidden_size,
+            epsilon=rms_norm_eps,
+            param_dtype=dtype,
+            scale_init=nnx.with_partitioning(init_fn, (None, )),
+            rngs=rng,
+        )
+        if model_config.hf_config.tie_word_embeddings:
+            self.lm_head = self.embed.embedding
+        else:
+            self.lm_head = nnx.Param(
+                init_fn(rng.params(), (hidden_size, vocab_size), dtype),
+                sharding=(None, "model"),
+            )
+
+    def __call__(
+        self,
+        kv_caches: List[jax.Array],
+        input_ids: jax.Array,
+        attention_metadata: AttentionMetadata,
+    ) -> Tuple[List[jax.Array], jax.Array]:
+        x = self.embed(input_ids)
+        for i, layer in enumerate(self.layers):
+            kv_cache = kv_caches[i]
+            kv_cache, x = layer(
+                kv_cache,
+                x,
+                attention_metadata,
+            )
+            kv_caches[i] = kv_cache
+        x = self.norm(x)
+        return kv_caches, x
+
+
+class Phi3ForCausalLM(nnx.Module):
+
+    def __init__(self, vllm_config: VllmConfig, rng_key: jax.Array,
+                 mesh: Mesh) -> None:
+        self.vllm_config = vllm_config
+        self.rng = nnx.Rngs(rng_key)
+        self.mesh = mesh
+
+        self.model = Phi3Model(
+            vllm_config=vllm_config,
+            rng=self.rng,
+            mesh=mesh,
+        )
+
+    def __call__(
+        self,
+        kv_caches: List[jax.Array],
+        input_ids: jax.Array,
+        attention_metadata: AttentionMetadata,
+        *args,
+    ) -> Tuple[List[jax.Array], jax.Array, List[jax.Array]]:
+        kv_caches, x = self.model(
+            kv_caches,
+            input_ids,
+            attention_metadata,
+        )
+        return kv_caches, x, []
+
+    def compute_logits(self, hidden_states: jax.Array) -> jax.Array:
+        if self.vllm_config.model_config.hf_config.tie_word_embeddings:
+            logits = jnp.dot(hidden_states, self.model.lm_head.value.T)
+        else:
+            logits = jnp.dot(hidden_states, self.model.lm_head.value)
+        return logits
+
+    def get_metadata_map(self) -> MetadataMap:
+        sharding_size = self.mesh.shape["model"]
+
+        model_config = self.vllm_config.model_config
+        hf_config = model_config.hf_config
+
+        num_heads = hf_config.num_attention_heads
+        num_kv_heads = hf_config.num_key_value_heads
+        qkv_heads = num_heads + num_kv_heads * 2
+        hidden_size = model_config.get_hidden_size()
+
+        # Pad head_dim for kernel performance.
+        head_dim_original = model_config.get_head_size()
+
+        # Key: path to a HF layer weight
+        # Value: path to a nnx layer weight
+        name_map = {
+            "model.embed_tokens": "model.embed.embedding",
+            "model.layers.*.input_layernorm":
+            "model.layers.*.input_layernorm.scale",
+            "model.layers.*.mlp.down_proj":
+            "model.layers.*.mlp.down_proj.kernel",
+            "model.layers.*.mlp.gate_up_proj":
+            "model.layers.*.mlp.gate_up_proj.kernel",
+            "model.layers.*.post_attention_layernorm":
+            "model.layers.*.post_attention_layernorm.scale",
+            "model.layers.*.self_attn.qkv_proj":
+            "model.layers.*.self_attn.qkv_proj.kernel",
+            "model.layers.*.self_attn.o_proj":
+            "model.layers.*.self_attn.o_proj.kernel",
+            "model.norm": "model.norm.scale",
+        }
+        if not self.vllm_config.model_config.hf_config.tie_word_embeddings:
+            name_map.update({
+                "lm_head": "model.lm_head",
+            })
+
+        reshape_keys: dict[str, tuple[int, ...]] = {
+            "qkv_proj": (qkv_heads, head_dim_original, hidden_size),
+            "o_proj": (hidden_size, num_heads, head_dim_original),
+        }
+        transpose_keys: dict[str, tuple[int, ...]] = {
+            "lm_head": (1, 0),
+            "gate_up_proj": (1, 0),
+            "down_proj": (1, 0),
+            "qkv_proj": (2, 0, 1),
+            "o_proj": (1, 2, 0),
+        }
+
+        # key: (padding_dim, padding_size)
+        pad_keys: dict[str, tuple[int, ...]] = {
+            "qkv_proj": (1, sharding_size // num_heads),
+            "o_proj": (0, sharding_size // num_heads),
+        }
+
+        return MetadataMap(name_map=name_map,
+                           reshape_map=reshape_keys,
+                           bias_reshape_map={},
+                           transpose_map=transpose_keys,
+                           pad_map=pad_keys,
+                           bias_pad_map={})
+
+    def load_weights(self, rng_key: jax.Array):
+        # NOTE: Since we are using nnx.eval_shape to init the model,
+        # we have to pass dynamic arrays here for __call__'s usage.
+        self.rng = nnx.Rngs(rng_key)
+
+        metadata_map = self.get_metadata_map()
+        load_hf_weights(vllm_config=self.vllm_config,
+                        model=self,
+                        metadata_map=metadata_map,
+                        mesh=self.mesh)
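`phi3.py` is the largest addition in this release. A minimal usage sketch, assuming a populated `VllmConfig`, a `Mesh` with a "model" axis, per-layer KV caches, and an `AttentionMetadata` instance are already provided by the runner (none of this scaffolding is part of the diff):

import jax
from tpu_inference.models.jax.phi3 import Phi3ForCausalLM

rng_key = jax.random.key(0)
model = Phi3ForCausalLM(vllm_config, rng_key, mesh)
model.load_weights(rng_key)  # maps HF checkpoint names via get_metadata_map()

# Forward pass returns updated caches, hidden states, and (empty) aux outputs.
kv_caches, hidden_states, _ = model(kv_caches, input_ids, attention_metadata)
logits = model.compute_logits(hidden_states)  # (num_tokens, vocab_size)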
tpu_inference/models/jax/qwen2.py
@@ -8,8 +8,8 @@ from transformers import Qwen2Config, modeling_flax_utils
 from vllm.config import VllmConfig
 
 from tpu_inference import utils
-from tpu_inference.layers.common.attention_interface import attention
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.layers.jax.attention_interface import attention
 from tpu_inference.layers.jax.rope_interface import apply_rope
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.utils.weight_utils import (get_default_maps,
@@ -368,8 +368,7 @@ class Qwen2ForCausalLM(nnx.Module):
             "lm_head": "model.lm_head",
         })
 
-        metadata_map = get_default_maps(self.vllm_config.
-                                        self.mesh, mappings)
+        metadata_map = get_default_maps(self.vllm_config, self.mesh, mappings)
         load_hf_weights(vllm_config=self.vllm_config,
                         model=self,
                         metadata_map=metadata_map,
tpu_inference/models/jax/qwen2_5_vl.py
@@ -14,9 +14,9 @@ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
 from vllm.config import VllmConfig
 
 from tpu_inference import utils as utils
-from tpu_inference.layers.common.attention_interface import \
-    sharded_flash_attention
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.layers.jax.attention_interface import \
+    sharded_flash_attention
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.qwen2 import Qwen2ForCausalLM
 # from vllm.model_executor.models.interfaces import MultiModalEmbeddings
@@ -486,11 +486,6 @@ class Qwen2_5_VisionTransformer(nnx.Module):
                                            dtype=dtype,
                                            rngs=rngs)
 
-        additional_config = getattr(vllm_config, "additional_config",
-                                    None) or {}
-        self.enable_dynamic_image_sizes = additional_config.get(
-            "enable_dynamic_image_sizes", False)
-
     def rotary_pos_emb_thw(self, t, h, w):
         hpos_ids, wpos_ids = jnp.indices((h, w))
         hpos_ids = hpos_ids.reshape(
@@ -584,7 +579,21 @@ class Qwen2_5_VisionTransformer(nnx.Module):
         seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
         return max_seqlen, seqlens
 
-    def
+    def __call__(self, x: jax.Array, grid_thw: tuple[tuple[int, int,
+                                                           int]]) -> jax.Array:
+        # x: pixel_values: jax.Array
+        # """Shape:
+        # `(num_patches, num_channels * patch_size * patch_size)`
+        # """
+
+        # grid_thw: image_grid_thw: jax.Array
+        # """Shape: `(num_images, 3)`
+        # This should be in `(grid_t, grid_h, grid_w)` format.
+        # """
+        hidden_states = self.patch_embed(x)
+
+        # num of patches
+        seq_len = x.shape[0]
         # num of images/videoes
         num_grids = len(grid_thw)
 
@@ -629,42 +638,6 @@ class Qwen2_5_VisionTransformer(nnx.Module):
         cu_seqlens = jnp.pad(cu_seqlens, ((1, 0), ),
                              mode='constant',
                              constant_values=0)
-        return window_index, rotary_pos_emb, cu_seqlens, cu_window_seqlens
-
-    def pad_inputs(self, x, window_index, rotary_pos_emb, cu_seqlens,
-                   cu_window_seqlens):
-        # padding
-        num_patches = int(rotary_pos_emb.shape[0])
-        bucket_num_patches = 1 << (num_patches - 1).bit_length()
-        num_tokens = window_index.shape[0]
-        bucket_num_tokens = bucket_num_patches // self.spatial_merge_unit
-        vit_merger_window_size = (self.window_size //
-                                  self.spatial_merge_size // self.patch_size)
-        max_windows = (bucket_num_tokens // vit_merger_window_size) + 2
-
-        rotary_pos_emb = jnp.pad(rotary_pos_emb,
-                                 ((0, bucket_num_patches - num_patches),
-                                  (0, 0)))
-        window_index = jnp.concatenate([
-            window_index,
-            jnp.arange(num_tokens, bucket_num_tokens, dtype=jnp.int32)
-        ])
-        cu_window_seqlens = jnp.append(cu_window_seqlens, bucket_num_patches)
-        pad_w = max(0, max_windows + 1 - cu_window_seqlens.shape[0])
-        cu_window_seqlens = jnp.pad(cu_window_seqlens, (0, pad_w), mode='edge')
-        cu_seqlens = jnp.append(cu_seqlens, bucket_num_patches)
-
-        x_padded = jnp.pad(x, ((0, bucket_num_patches - x.shape[0]), (0, 0)))
-
-        return x_padded, window_index, rotary_pos_emb, cu_seqlens, cu_window_seqlens, num_tokens
-
-    def compute_hidden_states(self, x: jax.Array, window_index: jax.Array,
-                              rotary_pos_emb: jax.Array, cu_seqlens: jax.Array,
-                              cu_window_seqlens: jax.Array) -> jax.Array:
-        hidden_states = self.patch_embed(x)
-
-        # num of patches
-        seq_len = x.shape[0]
 
         hidden_states = hidden_states.reshape(
             seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
@@ -691,48 +664,6 @@ class Qwen2_5_VisionTransformer(nnx.Module):
         hidden_states = hidden_states[reverse_indices, :]
         return hidden_states
 
-    @jax.jit
-    def encode_padded_jit(self, x_padded, window_index, rotary_pos_emb,
-                          cu_seqlens, cu_window_seqlens):
-        return self.compute_hidden_states(x_padded, window_index,
-                                          rotary_pos_emb, cu_seqlens,
-                                          cu_window_seqlens)
-
-    @partial(
-        jax.jit,
-        static_argnames=("grid_thw", ),
-    )
-    def encode_jit(self, x, grid_thw):
-        window_index, rotary_pos_emb, cu_seqlens, cu_window_seqlens = self.compute_aux_arrays(
-            grid_thw)
-        return self.compute_hidden_states(x, window_index, rotary_pos_emb,
-                                          cu_seqlens, cu_window_seqlens)
-
-    def __call__(self, x: jax.Array, grid_thw: tuple[tuple[int, int,
-                                                           int]]) -> jax.Array:
-        # x: pixel_values: jax.Array
-        # """Shape:
-        # `(num_patches, num_channels * patch_size * patch_size)`
-        # """
-
-        # grid_thw: image_grid_thw: jax.Array
-        # """Shape: `(num_images, 3)`
-        # This should be in `(grid_t, grid_h, grid_w)` format.
-        # """
-        if self.enable_dynamic_image_sizes:
-            window_index, rotary_pos_emb, cu_seqlens, cu_window_seqlens = self.compute_aux_arrays(
-                grid_thw)
-            x_padded, window_index, rotary_pos_emb, cu_seqlens, cu_window_seqlens, num_tokens = self.pad_inputs(
-                x, window_index, rotary_pos_emb, cu_seqlens, cu_window_seqlens)
-
-            hidden_states = self.encode_padded_jit(x_padded, window_index,
-                                                   rotary_pos_emb, cu_seqlens,
-                                                   cu_window_seqlens)
-            return hidden_states[:num_tokens]
-
-        else:
-            return self.encode_jit(x, grid_thw)
-
 
 class Qwen2_5_VLForConditionalGeneration(nnx.Module):
 
@@ -957,6 +888,10 @@ class Qwen2_5_VLForConditionalGeneration(nnx.Module):
         # "video"] = self._parse_and_validate_video_input(**kwargs)
         return mm_input_by_modality
 
+    @partial(
+        jax.jit,
+        static_argnames=("image_grid_thw", ),
+    )
     def get_single_image_embedding(self, image_pixel_values, image_grid_thw):
         return self.visual(image_pixel_values, (image_grid_thw, ))
 
@@ -1126,8 +1061,7 @@ class Qwen2_5_VLForConditionalGeneration(nnx.Module):
         "lm_head": "language_model.model.lm_head",
         })
 
-        metadata_map = get_default_maps(self.vllm_config.
-                                        self.mesh, mappings)
+        metadata_map = get_default_maps(self.vllm_config, self.mesh, mappings)
         load_hf_weights(vllm_config=self.vllm_config,
                         model=self,
                         metadata_map=metadata_map,
@@ -1137,82 +1071,33 @@ class Qwen2_5_VLForConditionalGeneration(nnx.Module):
         self,
         run_compilation_fn: Callable,
     ) -> None:
+        image_shapes = []
+        if (warmup_config := self.vllm_config.additional_config.get(
+                "vision_warmup_config")):
+            image_shapes = warmup_config.get("image_shapes")
+
         vc = self.vllm_config.model_config.hf_config.vision_config
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        for num_patches in num_patches_paddings:
-            dummy_x_padded = jnp.ones(
-                (num_patches, patch_input_dim),
-                dtype=self.vllm_config.model_config.dtype)
-
-            num_tokens = num_patches // spatial_merge_unit
-            dummy_window_index = jnp.arange(num_tokens, dtype=jnp.int32)
-
-            dummy_rotary_pos_emb = jnp.ones(
-                (num_patches, rotary_dim),
-                dtype=self.vllm_config.model_config.dtype)
-
-            dummy_cu_seqlens = jnp.array([0, num_patches, num_patches],
-                                         dtype=jnp.int32)
-
-            max_windows = (num_tokens // vit_merger_window_size) + 2
-            patches_per_window = (vit_merger_window_size**
-                                  2) * spatial_merge_unit
-            dummy_cu_window_seqlens = jnp.arange(
-                max_windows + 1, dtype=jnp.int32) * patches_per_window
-            dummy_cu_window_seqlens = jnp.minimum(dummy_cu_window_seqlens,
-                                                  num_patches)
-
-            run_compilation_fn("vision_encoder_padded",
-                               self.visual.encode_padded_jit,
-                               dummy_x_padded,
-                               dummy_window_index,
-                               dummy_rotary_pos_emb,
-                               dummy_cu_seqlens,
-                               dummy_cu_window_seqlens,
-                               num_patches=num_patches)
-        else:
-            image_shapes = []
-            if (warmup_config := self.vllm_config.additional_config.get(
-                    "vision_warmup_config")):
-                image_shapes = warmup_config.get("image_shapes")
-
-            factor = vc.patch_size * vc.spatial_merge_size
-            for input_hw in image_shapes:
-                if not isinstance(input_hw, list) or len(input_hw) != 2:
-                    logger.warning(f"Skipping invalid shape {input_hw}.")
-                    continue
-                h_input, w_input = input_hw
-                h_processed = round(h_input / factor) * factor
-                w_processed = round(w_input / factor) * factor
-                t, h, w = 1, h_processed // vc.patch_size, w_processed // vc.patch_size
-                grid_thw = (t, h, w)
-                num_patches = t * h * w
-
-                dummy_pixel_values = jnp.ones(
-                    (num_patches, patch_input_dim),
-                    self.vllm_config.model_config.dtype,
-                )
-                dummy_grid_thw = (grid_thw, )
+        factor = vc.patch_size * vc.spatial_merge_size
+        for input_hw in image_shapes:
+            if not isinstance(input_hw, list) or len(input_hw) != 2:
+                logger.warning(f"Skipping invalid shape {input_hw}.")
+                continue
+            h_input, w_input = input_hw
+            h_processed = round(h_input / factor) * factor
+            w_processed = round(w_input / factor) * factor
+            t, h, w = 1, h_processed // vc.patch_size, w_processed // vc.patch_size
+            grid_thw = (t, h, w)
+            num_patches = t * h * w
+            patch_input_dim = vc.in_channels * vc.temporal_patch_size * vc.patch_size * vc.patch_size
+
+            dummy_pixel_values = jnp.ones(
+                (num_patches, patch_input_dim),
+                self.vllm_config.model_config.dtype,
+            )
+            dummy_grid_thw = grid_thw
 
-
-
-
-
-
+            run_compilation_fn("single_image_encoder",
+                               self.get_single_image_embedding,
+                               dummy_pixel_values,
+                               dummy_grid_thw,
+                               image_shape=input_hw)
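The rewritten warmup path above reads image shapes from `additional_config`. Based on the accessors in the hunk, the expected layout is a dict like the following (values are illustrative, not from the diff; each entry is a [height, width] pair in pixels, and entries that are not two-element lists are skipped with a warning):

additional_config = {
    "vision_warmup_config": {
        "image_shapes": [[336, 336], [672, 1008]],
    },
}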
tpu_inference/models/jax/qwen3.py
@@ -8,8 +8,8 @@ from transformers import Qwen3Config
 from vllm.config import VllmConfig
 
 from tpu_inference import utils
-from tpu_inference.layers.common.attention_interface import attention
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.layers.jax.attention_interface import attention
 from tpu_inference.layers.jax.rope_interface import apply_rope
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.qwen2 import Qwen2DecoderLayer
@@ -295,8 +295,7 @@ class Qwen3ForCausalLM(nnx.Module):
             "lm_head": "model.lm_head",
         })
 
-        metadata_map = get_default_maps(self.vllm_config.
-                                        self.mesh, mappings)
+        metadata_map = get_default_maps(self.vllm_config, self.mesh, mappings)
         load_hf_weights(vllm_config=self.vllm_config,
                         model=self,
                         metadata_map=metadata_map,
|