tpu-inference 0.11.1.dev202511220812__py3-none-any.whl → 0.13.2.dev20251230__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of tpu-inference has been flagged as potentially problematic by the registry.
- tests/__init__.py +13 -0
- tests/core/__init__.py +13 -0
- tests/core/test_disagg_utils.py +14 -0
- tests/core/test_dp_scheduler.py +650 -768
- tests/core/test_init.py +14 -0
- tests/distributed/__init__.py +13 -0
- tests/distributed/test_distributed_utils.py +120 -0
- tests/distributed/test_tpu_connector.py +478 -0
- tests/e2e/__init__.py +13 -0
- tests/e2e/test_async_scheduler.py +211 -0
- tests/e2e/test_data_parallel.py +289 -0
- tests/e2e/test_hybrid_kvcache.py +219 -0
- tests/e2e/test_local_disagg.py +257 -0
- tests/e2e/test_model_loader.py +268 -0
- tests/e2e/test_multi_modal_inference.py +111 -0
- tests/e2e/test_pipeline_parallel.py +265 -0
- tests/e2e/test_runai_model_streamer_loader.py +104 -0
- tests/e2e/test_sampling_params.py +269 -0
- tests/e2e/test_speculative_decoding.py +311 -0
- tests/e2e/test_structured_decoding.py +46 -0
- tests/executors/__init__.py +13 -0
- tests/executors/test_ray_distributed_executor.py +199 -0
- tests/experimental/__init__.py +13 -0
- tests/experimental/test_llama3_jax_stashed.py +208 -0
- tests/kernels/__init__.py +13 -0
- tests/kernels/collectives/__init__.py +13 -0
- tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
- tests/kernels/fused_moe_v1_test.py +317 -34
- tests/kernels/gmm_test.py +205 -0
- tests/kernels/mla_v1_test.py +143 -41
- tests/kernels/quantized_matmul_kernel_test.py +2 -34
- tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +17 -1
- tests/kernels/ragged_paged_attention_kernel_v3_test.py +17 -1
- tests/layers/__init__.py +13 -0
- tests/layers/common/__init__.py +13 -0
- tests/layers/common/test_attention_interface.py +156 -0
- tests/layers/common/test_quantization.py +149 -0
- tests/layers/jax/__init__.py +13 -0
- tests/layers/jax/attention/__init__.py +13 -0
- tests/layers/jax/attention/test_common_attention.py +103 -0
- tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
- tests/layers/jax/attention/test_llama4_attention.py +135 -0
- tests/layers/jax/moe/__init__.py +13 -0
- tests/layers/jax/moe/test_deepseek_moe.py +235 -0
- tests/layers/jax/sample/__init__.py +13 -0
- tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
- tests/layers/jax/sample/test_sampling.py +115 -0
- tests/layers/jax/sample/test_sampling_metadata.py +254 -0
- tests/layers/jax/test_layers.py +155 -0
- tests/{test_quantization.py → layers/jax/test_qwix.py} +183 -50
- tests/layers/jax/test_rope.py +93 -0
- tests/layers/jax/test_sharding.py +159 -0
- tests/layers/jax/test_transformer_block.py +152 -0
- tests/layers/vllm/__init__.py +13 -0
- tests/layers/vllm/test_attention.py +363 -0
- tests/layers/vllm/test_awq.py +406 -0
- tests/layers/vllm/test_compressed_tensors_moe.py +199 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +441 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +443 -0
- tests/layers/vllm/test_fp8.py +17 -0
- tests/layers/vllm/test_mxfp4.py +320 -0
- tests/layers/vllm/test_unquantized.py +662 -0
- tests/layers/vllm/utils.py +87 -0
- tests/lora/__init__.py +13 -0
- tests/lora/conftest.py +14 -0
- tests/lora/test_bgmv.py +14 -0
- tests/lora/test_layers.py +26 -6
- tests/lora/test_lora.py +15 -1
- tests/lora/test_lora_perf.py +67 -0
- tests/models/__init__.py +13 -0
- tests/models/common/__init__.py +13 -0
- tests/models/common/test_model_loader.py +455 -0
- tests/models/jax/__init__.py +13 -0
- tests/models/jax/test_deepseek_v3.py +401 -0
- tests/models/jax/test_llama3.py +184 -0
- tests/models/jax/test_llama4.py +298 -0
- tests/models/jax/test_llama_eagle3.py +197 -0
- tests/models/jax/test_llama_guard_4.py +242 -0
- tests/models/jax/test_qwen2.py +172 -0
- tests/models/jax/test_qwen2_5_vl.py +605 -0
- tests/models/jax/test_qwen3.py +169 -0
- tests/models/jax/test_weight_loading.py +180 -0
- tests/models/jax/utils/__init__.py +13 -0
- tests/models/jax/utils/test_multi_modal_utils.py +212 -0
- tests/platforms/__init__.py +13 -0
- tests/platforms/test_tpu_platform.py +54 -0
- tests/runner/__init__.py +13 -0
- tests/runner/test_block_table.py +395 -0
- tests/runner/test_input_batch.py +226 -0
- tests/runner/test_kv_cache.py +220 -0
- tests/runner/test_kv_cache_manager.py +498 -0
- tests/runner/test_multimodal_manager.py +429 -0
- tests/runner/test_persistent_batch_manager.py +84 -0
- tests/runner/test_speculative_decoding_manager.py +368 -0
- tests/runner/test_structured_decoding_manager.py +220 -0
- tests/runner/test_tpu_runner.py +261 -0
- tests/runner/test_tpu_runner_dp.py +1099 -0
- tests/runner/test_tpu_runner_mesh.py +200 -0
- tests/runner/test_utils.py +411 -0
- tests/spec_decode/__init__.py +13 -0
- tests/spec_decode/test_eagle3.py +311 -0
- tests/test_base.py +14 -0
- tests/test_envs.py +110 -12
- tests/test_tpu_info.py +14 -0
- tests/test_utils.py +2 -45
- tests/worker/__init__.py +13 -0
- tests/worker/tpu_worker_test.py +414 -0
- tpu_inference/__init__.py +14 -0
- tpu_inference/core/__init__.py +13 -0
- tpu_inference/core/sched/__init__.py +13 -0
- tpu_inference/core/sched/dp_scheduler.py +372 -56
- tpu_inference/distributed/__init__.py +13 -0
- tpu_inference/distributed/jax_parallel_state.py +14 -0
- tpu_inference/distributed/tpu_connector.py +15 -10
- tpu_inference/distributed/utils.py +56 -4
- tpu_inference/envs.py +92 -8
- tpu_inference/executors/__init__.py +13 -0
- tpu_inference/executors/ray_distributed_executor.py +25 -4
- tpu_inference/experimental/__init__.py +13 -0
- tpu_inference/experimental/llama3_jax_stashed.py +14 -0
- tpu_inference/kernels/__init__.py +13 -0
- tpu_inference/kernels/collectives/__init__.py +13 -0
- tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
- tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
- tpu_inference/kernels/flash_attention/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/kernel.py +807 -230
- tpu_inference/kernels/megablox/__init__.py +13 -0
- tpu_inference/kernels/megablox/common.py +54 -0
- tpu_inference/kernels/megablox/gmm.py +646 -0
- tpu_inference/kernels/mla/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/kernel.py +117 -145
- tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
- tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
- tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +194 -101
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +218 -137
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +376 -195
- tpu_inference/kernels/ragged_paged_attention/v3/util.py +15 -1
- tpu_inference/layers/__init__.py +13 -0
- tpu_inference/layers/common/__init__.py +13 -0
- tpu_inference/layers/common/attention_interface.py +25 -12
- tpu_inference/layers/common/attention_metadata.py +14 -0
- tpu_inference/layers/common/fused_moe_gmm.py +506 -0
- tpu_inference/layers/common/quant_methods.py +15 -0
- tpu_inference/layers/common/quantization.py +282 -0
- tpu_inference/layers/common/sharding.py +32 -9
- tpu_inference/layers/common/utils.py +94 -0
- tpu_inference/layers/jax/__init__.py +13 -0
- tpu_inference/layers/jax/attention/__init__.py +13 -0
- tpu_inference/layers/jax/attention/attention.py +19 -6
- tpu_inference/layers/jax/attention/deepseek_v3_attention.py +270 -77
- tpu_inference/layers/jax/attention/gpt_oss_attention.py +24 -11
- tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
- tpu_inference/layers/jax/base.py +14 -0
- tpu_inference/layers/jax/constants.py +13 -0
- tpu_inference/layers/jax/layers.py +14 -0
- tpu_inference/layers/jax/misc.py +14 -0
- tpu_inference/layers/jax/moe/__init__.py +13 -0
- tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
- tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
- tpu_inference/layers/jax/moe/moe.py +43 -3
- tpu_inference/layers/jax/pp_utils.py +53 -0
- tpu_inference/layers/jax/rope.py +14 -0
- tpu_inference/layers/jax/rope_interface.py +14 -0
- tpu_inference/layers/jax/sample/__init__.py +13 -0
- tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
- tpu_inference/layers/jax/sample/sampling.py +15 -1
- tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
- tpu_inference/layers/jax/transformer_block.py +14 -0
- tpu_inference/layers/vllm/__init__.py +13 -0
- tpu_inference/layers/vllm/attention.py +4 -4
- tpu_inference/layers/vllm/fused_moe.py +101 -494
- tpu_inference/layers/vllm/linear.py +64 -0
- tpu_inference/layers/vllm/process_weights/__init__.py +13 -0
- tpu_inference/layers/vllm/{sharding.py → process_weights/cleanup_sharding.py} +24 -15
- tpu_inference/layers/vllm/process_weights/fused_moe_weights.py +369 -0
- tpu_inference/layers/vllm/process_weights/linear_weights.py +174 -0
- tpu_inference/layers/vllm/quantization/__init__.py +19 -3
- tpu_inference/layers/vllm/quantization/awq.py +96 -82
- tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +23 -8
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +172 -176
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +111 -91
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +79 -43
- tpu_inference/layers/vllm/quantization/{common.py → configs.py} +42 -25
- tpu_inference/layers/vllm/quantization/fp8.py +119 -0
- tpu_inference/layers/vllm/quantization/mxfp4.py +137 -178
- tpu_inference/layers/vllm/quantization/unquantized.py +157 -233
- tpu_inference/lora/__init__.py +13 -0
- tpu_inference/lora/torch_lora_ops.py +8 -13
- tpu_inference/models/__init__.py +13 -0
- tpu_inference/models/common/__init__.py +13 -0
- tpu_inference/models/common/model_loader.py +112 -35
- tpu_inference/models/jax/__init__.py +13 -0
- tpu_inference/models/jax/deepseek_v3.py +267 -157
- tpu_inference/models/jax/gpt_oss.py +26 -10
- tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
- tpu_inference/models/jax/llama3.py +99 -36
- tpu_inference/models/jax/llama4.py +14 -0
- tpu_inference/models/jax/llama_eagle3.py +18 -5
- tpu_inference/models/jax/llama_guard_4.py +15 -1
- tpu_inference/models/jax/qwen2.py +17 -2
- tpu_inference/models/jax/qwen2_5_vl.py +179 -51
- tpu_inference/models/jax/qwen3.py +17 -2
- tpu_inference/models/jax/utils/__init__.py +13 -0
- tpu_inference/models/jax/utils/file_utils.py +14 -0
- tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
- tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
- tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +92 -32
- tpu_inference/models/jax/utils/weight_utils.py +234 -155
- tpu_inference/models/vllm/__init__.py +13 -0
- tpu_inference/models/vllm/vllm_model_wrapper.py +32 -8
- tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
- tpu_inference/platforms/__init__.py +14 -0
- tpu_inference/platforms/tpu_platform.py +51 -72
- tpu_inference/runner/__init__.py +13 -0
- tpu_inference/runner/compilation_manager.py +180 -80
- tpu_inference/runner/kv_cache.py +54 -20
- tpu_inference/runner/kv_cache_manager.py +55 -33
- tpu_inference/runner/lora_utils.py +16 -1
- tpu_inference/runner/multimodal_manager.py +16 -2
- tpu_inference/runner/persistent_batch_manager.py +54 -2
- tpu_inference/runner/speculative_decoding_manager.py +14 -0
- tpu_inference/runner/structured_decoding_manager.py +16 -3
- tpu_inference/runner/tpu_runner.py +124 -61
- tpu_inference/runner/utils.py +2 -2
- tpu_inference/spec_decode/__init__.py +13 -0
- tpu_inference/spec_decode/jax/__init__.py +13 -0
- tpu_inference/spec_decode/jax/eagle3.py +84 -22
- tpu_inference/tpu_info.py +14 -0
- tpu_inference/utils.py +72 -44
- tpu_inference/worker/__init__.py +13 -0
- tpu_inference/worker/tpu_worker.py +66 -52
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/METADATA +8 -9
- tpu_inference-0.13.2.dev20251230.dist-info/RECORD +266 -0
- tpu_inference/layers/vllm/linear_common.py +0 -186
- tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
- tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
- tpu_inference-0.11.1.dev202511220812.dist-info/RECORD +0 -174
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/WHEEL +0 -0
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/top_level.txt +0 -0
tpu_inference/models/jax/qwen2_5_vl.py

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import math
 from functools import partial
 from typing import (Callable, List, Literal, NamedTuple, Optional, TypedDict,
@@ -486,6 +500,11 @@ class Qwen2_5_VisionTransformer(nnx.Module):
                                  dtype=dtype,
                                  rngs=rngs)
 
+        additional_config = getattr(vllm_config, "additional_config",
+                                    None) or {}
+        self.enable_dynamic_image_sizes = additional_config.get(
+            "enable_dynamic_image_sizes", False)
+
     def rotary_pos_emb_thw(self, t, h, w):
         hpos_ids, wpos_ids = jnp.indices((h, w))
         hpos_ids = hpos_ids.reshape(
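Note on the new `enable_dynamic_image_sizes` flag read above: it is pulled from vLLM's `additional_config`, so it would be switched on at engine construction. A minimal sketch, assuming `additional_config` is accepted as an engine argument and with an illustrative model name:

    from vllm import LLM

    llm = LLM(
        model="Qwen/Qwen2.5-VL-7B-Instruct",  # illustrative model
        additional_config={"enable_dynamic_image_sizes": True},
    )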
@@ -579,21 +598,7 @@ class Qwen2_5_VisionTransformer(nnx.Module):
         seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
         return max_seqlen, seqlens
 
-    def __call__(self, x: jax.Array, grid_thw: tuple[tuple[int, int,
-                                                           int]]) -> jax.Array:
-        # x: pixel_values: jax.Array
-        # """Shape:
-        # `(num_patches, num_channels * patch_size * patch_size)`
-        # """
-
-        # grid_thw: image_grid_thw: jax.Array
-        # """Shape: `(num_images, 3)`
-        # This should be in `(grid_t, grid_h, grid_w)` format.
-        # """
-        hidden_states = self.patch_embed(x)
-
-        # num of patches
-        seq_len = x.shape[0]
+    def compute_aux_arrays(self, grid_thw: tuple[tuple[int, int, int]]):
         # num of images/videoes
         num_grids = len(grid_thw)
 
@@ -638,6 +643,42 @@ class Qwen2_5_VisionTransformer(nnx.Module):
         cu_seqlens = jnp.pad(cu_seqlens, ((1, 0), ),
                              mode='constant',
                              constant_values=0)
+        return window_index, rotary_pos_emb, cu_seqlens, cu_window_seqlens
+
+    def pad_inputs(self, x, window_index, rotary_pos_emb, cu_seqlens,
+                   cu_window_seqlens):
+        # padding
+        num_patches = int(rotary_pos_emb.shape[0])
+        bucket_num_patches = 1 << (num_patches - 1).bit_length()
+        num_tokens = window_index.shape[0]
+        bucket_num_tokens = bucket_num_patches // self.spatial_merge_unit
+        vit_merger_window_size = (self.window_size //
+                                  self.spatial_merge_size // self.patch_size)
+        max_windows = (bucket_num_tokens // vit_merger_window_size) + 2
+
+        rotary_pos_emb = jnp.pad(rotary_pos_emb,
+                                 ((0, bucket_num_patches - num_patches),
+                                  (0, 0)))
+        window_index = jnp.concatenate([
+            window_index,
+            jnp.arange(num_tokens, bucket_num_tokens, dtype=jnp.int32)
+        ])
+        cu_window_seqlens = jnp.append(cu_window_seqlens, bucket_num_patches)
+        pad_w = max(0, max_windows + 1 - cu_window_seqlens.shape[0])
+        cu_window_seqlens = jnp.pad(cu_window_seqlens, (0, pad_w), mode='edge')
+        cu_seqlens = jnp.append(cu_seqlens, bucket_num_patches)
+
+        x_padded = jnp.pad(x, ((0, bucket_num_patches - x.shape[0]), (0, 0)))
+
+        return x_padded, window_index, rotary_pos_emb, cu_seqlens, cu_window_seqlens, num_tokens
+
+    def compute_hidden_states(self, x: jax.Array, window_index: jax.Array,
+                              rotary_pos_emb: jax.Array, cu_seqlens: jax.Array,
+                              cu_window_seqlens: jax.Array) -> jax.Array:
+        hidden_states = self.patch_embed(x)
+
+        # num of patches
+        seq_len = x.shape[0]
 
         hidden_states = hidden_states.reshape(
             seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
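The `pad_inputs` method added above buckets `num_patches` up to the next power of two via `1 << (num_patches - 1).bit_length()`, so the jitted encoder only ever sees a bounded set of shapes. A standalone sketch of the bucketing math:

    def bucket_size(num_patches: int) -> int:
        # Next power of two >= num_patches (e.g. 1000 -> 1024, 1025 -> 2048).
        return 1 << (num_patches - 1).bit_length()

    assert bucket_size(1000) == 1024
    assert bucket_size(1024) == 1024
    assert bucket_size(1025) == 2048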
@@ -664,6 +705,48 @@ class Qwen2_5_VisionTransformer(nnx.Module):
         hidden_states = hidden_states[reverse_indices, :]
         return hidden_states
 
+    @jax.jit
+    def encode_padded_jit(self, x_padded, window_index, rotary_pos_emb,
+                          cu_seqlens, cu_window_seqlens):
+        return self.compute_hidden_states(x_padded, window_index,
+                                          rotary_pos_emb, cu_seqlens,
+                                          cu_window_seqlens)
+
+    @partial(
+        jax.jit,
+        static_argnames=("grid_thw", ),
+    )
+    def encode_jit(self, x, grid_thw):
+        window_index, rotary_pos_emb, cu_seqlens, cu_window_seqlens = self.compute_aux_arrays(
+            grid_thw)
+        return self.compute_hidden_states(x, window_index, rotary_pos_emb,
+                                          cu_seqlens, cu_window_seqlens)
+
+    def __call__(self, x: jax.Array, grid_thw: tuple[tuple[int, int,
+                                                           int]]) -> jax.Array:
+        # x: pixel_values: jax.Array
+        # """Shape:
+        # `(num_patches, num_channels * patch_size * patch_size)`
+        # """
+
+        # grid_thw: image_grid_thw: jax.Array
+        # """Shape: `(num_images, 3)`
+        # This should be in `(grid_t, grid_h, grid_w)` format.
+        # """
+        if self.enable_dynamic_image_sizes:
+            window_index, rotary_pos_emb, cu_seqlens, cu_window_seqlens = self.compute_aux_arrays(
+                grid_thw)
+            x_padded, window_index, rotary_pos_emb, cu_seqlens, cu_window_seqlens, num_tokens = self.pad_inputs(
+                x, window_index, rotary_pos_emb, cu_seqlens, cu_window_seqlens)
+
+            hidden_states = self.encode_padded_jit(x_padded, window_index,
+                                                   rotary_pos_emb, cu_seqlens,
+                                                   cu_window_seqlens)
+            return hidden_states[:num_tokens]
+
+        else:
+            return self.encode_jit(x, grid_thw)
+
 
 class Qwen2_5_VLForConditionalGeneration(nnx.Module):
 
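The two jitted entry points above trade off differently: `encode_jit` marks `grid_thw` static, so every distinct image grid triggers a fresh trace and compile, while `encode_padded_jit` takes only array arguments whose shapes are already bucketed. A minimal sketch of the static-argument behavior (standard JAX, not code from this package):

    from functools import partial

    import jax
    import jax.numpy as jnp

    @partial(jax.jit, static_argnames=("grid_thw", ))
    def f(x, grid_thw):
        return x * len(grid_thw)

    f(jnp.ones(4), ((1, 2, 2), ))   # compiles
    f(jnp.ones(4), ((1, 4, 4), ))   # new static value -> recompiles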
@@ -888,10 +971,6 @@ class Qwen2_5_VLForConditionalGeneration(nnx.Module):
         # "video"] = self._parse_and_validate_video_input(**kwargs)
         return mm_input_by_modality
 
-    @partial(
-        jax.jit,
-        static_argnames=("image_grid_thw", ),
-    )
     def get_single_image_embedding(self, image_pixel_values, image_grid_thw):
         return self.visual(image_pixel_values, (image_grid_thw, ))
 
@@ -931,9 +1010,9 @@ class Qwen2_5_VLForConditionalGeneration(nnx.Module):
         split_indices = np.cumsum(sizes)[:-1]
         return tuple(jnp.split(image_embeds, split_indices))
 
-    def
-
-
+    def embed_multimodal(self, image_grid_thw: tuple[tuple[int, int, int],
+                                                     ...],
+                         **kwargs: object) -> MultiModalEmbeddings:
 
         mm_input_by_modality = self._parse_and_validate_multimodal_inputs(
             image_grid_thw, **kwargs)
@@ -957,7 +1036,7 @@ class Qwen2_5_VLForConditionalGeneration(nnx.Module):
 
         return multimodal_embeddings
 
-    def
+    def embed_input_ids(
             self, input_ids: jax.Array,
             multimodal_embeddings: Optional[jax.Array]) -> jax.Array:
 
|
|
|
1072
1151
|
self,
|
|
1073
1152
|
run_compilation_fn: Callable,
|
|
1074
1153
|
) -> None:
|
|
1075
|
-
image_shapes = []
|
|
1076
|
-
if (warmup_config := self.vllm_config.additional_config.get(
|
|
1077
|
-
"vision_warmup_config")):
|
|
1078
|
-
image_shapes = warmup_config.get("image_shapes")
|
|
1079
|
-
|
|
1080
1154
|
vc = self.vllm_config.model_config.hf_config.vision_config
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1155
|
+
patch_input_dim = vc.in_channels * vc.temporal_patch_size * vc.patch_size * vc.patch_size
|
|
1156
|
+
if self.visual.enable_dynamic_image_sizes:
|
|
1157
|
+
spatial_merge_unit = vc.spatial_merge_size**2
|
|
1158
|
+
max_num_batched_tokens = self.vllm_config.scheduler_config.max_num_batched_tokens
|
|
1159
|
+
mm_kwargs = self.vllm_config.model_config.multimodal_config.mm_processor_kwargs or {}
|
|
1160
|
+
limit_pixels = float(mm_kwargs.get("max_pixels", float('inf')))
|
|
1161
|
+
|
|
1162
|
+
max_patches = int(
|
|
1163
|
+
min(max_num_batched_tokens * spatial_merge_unit,
|
|
1164
|
+
limit_pixels / (vc.patch_size**2)))
|
|
1165
|
+
|
|
1166
|
+
num_patches_paddings = [
|
|
1167
|
+
1 << i for i in range(4, (max_patches - 1).bit_length() + 1)
|
|
1168
|
+
]
|
|
1169
|
+
rotary_dim = vc.hidden_size // vc.num_heads // 2
|
|
1170
|
+
vit_merger_window_size = (vc.window_size //
|
|
1171
|
+
vc.spatial_merge_size // vc.patch_size)
|
|
1172
|
+
|
|
1173
|
+
for num_patches in num_patches_paddings:
|
|
1174
|
+
dummy_x_padded = jnp.ones(
|
|
1175
|
+
(num_patches, patch_input_dim),
|
|
1176
|
+
dtype=self.vllm_config.model_config.dtype)
|
|
1177
|
+
|
|
1178
|
+
num_tokens = num_patches // spatial_merge_unit
|
|
1179
|
+
dummy_window_index = jnp.arange(num_tokens, dtype=jnp.int32)
|
|
1180
|
+
|
|
1181
|
+
dummy_rotary_pos_emb = jnp.ones(
|
|
1182
|
+
(num_patches, rotary_dim),
|
|
1183
|
+
dtype=self.vllm_config.model_config.dtype)
|
|
1184
|
+
|
|
1185
|
+
dummy_cu_seqlens = jnp.array([0, num_patches, num_patches],
|
|
1186
|
+
dtype=jnp.int32)
|
|
1187
|
+
|
|
1188
|
+
max_windows = (num_tokens // vit_merger_window_size) + 2
|
|
1189
|
+
patches_per_window = (vit_merger_window_size**
|
|
1190
|
+
2) * spatial_merge_unit
|
|
1191
|
+
dummy_cu_window_seqlens = jnp.arange(
|
|
1192
|
+
max_windows + 1, dtype=jnp.int32) * patches_per_window
|
|
1193
|
+
dummy_cu_window_seqlens = jnp.minimum(dummy_cu_window_seqlens,
|
|
1194
|
+
num_patches)
|
|
1195
|
+
|
|
1196
|
+
run_compilation_fn("vision_encoder_padded",
|
|
1197
|
+
self.visual.encode_padded_jit,
|
|
1198
|
+
dummy_x_padded,
|
|
1199
|
+
dummy_window_index,
|
|
1200
|
+
dummy_rotary_pos_emb,
|
|
1201
|
+
dummy_cu_seqlens,
|
|
1202
|
+
dummy_cu_window_seqlens,
|
|
1203
|
+
num_patches=num_patches)
|
|
1204
|
+
else:
|
|
1205
|
+
image_shapes = []
|
|
1206
|
+
if (warmup_config := self.vllm_config.additional_config.get(
|
|
1207
|
+
"vision_warmup_config")):
|
|
1208
|
+
image_shapes = warmup_config.get("image_shapes")
|
|
1209
|
+
|
|
1210
|
+
factor = vc.patch_size * vc.spatial_merge_size
|
|
1211
|
+
for input_hw in image_shapes:
|
|
1212
|
+
if not isinstance(input_hw, list) or len(input_hw) != 2:
|
|
1213
|
+
logger.warning(f"Skipping invalid shape {input_hw}.")
|
|
1214
|
+
continue
|
|
1215
|
+
h_input, w_input = input_hw
|
|
1216
|
+
h_processed = round(h_input / factor) * factor
|
|
1217
|
+
w_processed = round(w_input / factor) * factor
|
|
1218
|
+
t, h, w = 1, h_processed // vc.patch_size, w_processed // vc.patch_size
|
|
1219
|
+
grid_thw = (t, h, w)
|
|
1220
|
+
num_patches = t * h * w
|
|
1221
|
+
|
|
1222
|
+
dummy_pixel_values = jnp.ones(
|
|
1223
|
+
(num_patches, patch_input_dim),
|
|
1224
|
+
self.vllm_config.model_config.dtype,
|
|
1225
|
+
)
|
|
1226
|
+
dummy_grid_thw = (grid_thw, )
|
|
1099
1227
|
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1228
|
+
run_compilation_fn("vision_encoder",
|
|
1229
|
+
self.visual.encode_jit,
|
|
1230
|
+
dummy_pixel_values,
|
|
1231
|
+
dummy_grid_thw,
|
|
1232
|
+
image_shape=input_hw)
|
|
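The warmup above precompiles one program per power-of-two bucket from 16 up to the patch budget, so runtime images never hit an unseen shape. A worked example of the enumeration, with a hypothetical `max_patches`:

    max_patches = 5000  # hypothetical bound from the scheduler/pixel limits
    num_patches_paddings = [
        1 << i for i in range(4, (max_patches - 1).bit_length() + 1)
    ]
    # -> [16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]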
tpu_inference/models/jax/qwen3.py

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import List, Optional, Tuple
 
 import jax
@@ -10,6 +24,7 @@ from vllm.config import VllmConfig
 from tpu_inference import utils
 from tpu_inference.layers.common.attention_interface import attention
 from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.layers.common.quantization import quantize_kv
 from tpu_inference.layers.jax.rope_interface import apply_rope
 from tpu_inference.logger import init_logger
 from tpu_inference.models.jax.qwen2 import Qwen2DecoderLayer
@@ -125,8 +140,8 @@ class Qwen3Attention(nnx.Module):
         # q_scale = self._q_scale
         k_scale = self._k_scale
         v_scale = self._v_scale
-        k, v =
-
+        k, v = quantize_kv(self.kv_cache_quantized_dtype, k, v, k_scale,
+                           v_scale)
         new_kv_cache, outputs = attention(
             kv_cache,
             q,
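`quantize_kv` comes from the new `tpu_inference/layers/common/quantization.py` module (+282 lines in this release); its body is not shown in this diff. A hedged sketch of what a helper with this signature plausibly does — scale, clamp to the target dtype's range, and cast; the packaged implementation may differ:

    import jax.numpy as jnp

    def quantize_kv_sketch(dtype, k, v, k_scale, v_scale):
        # Hypothetical body, not the packaged implementation.
        info = jnp.finfo(dtype)
        k = jnp.clip(k / k_scale, info.min, info.max).astype(dtype)
        v = jnp.clip(v / v_scale, info.min, info.max).astype(dtype)
        return k, v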
tpu_inference/models/jax/utils/__init__.py

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tpu_inference/models/jax/utils/file_utils.py

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import glob
 import hashlib
 import os
tpu_inference/models/jax/utils/multi_modal_utils.py

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import Union
 
 import jax
@@ -29,25 +43,25 @@ def sanity_check_mm_encoder_outputs(
 ) -> None:
     """
     Perform sanity checks for the result of
-    [`vllm.model_executor.models.SupportsMultiModal.
+    [`vllm.model_executor.models.SupportsMultiModal.embed_multimodal`][].
     """
     assert isinstance(mm_embeddings, (list, tuple, jax.Array)), (
         "Expected multimodal embeddings to be a list/tuple of 2D tensors, "
         f"or a single 3D tensor, but got {type(mm_embeddings)} "
         "instead. This is most likely due to incorrect implementation "
-        "of the model's `
+        "of the model's `embed_multimodal` method.")
 
     assert len(mm_embeddings) == expected_num_items, (
         "Expected number of multimodal embeddings to match number of "
         f"input items: {expected_num_items}, but got {len(mm_embeddings)=} "
         "instead. This is most likely due to incorrect implementation "
-        "of the model's `
+        "of the model's `embed_multimodal` method.")
 
     assert all(e.ndim == 2 for e in mm_embeddings), (
         "Expected multimodal embeddings to be a sequence of 2D tensors, "
         f"but got tensors with shapes {[e.shape for e in mm_embeddings]} "
         "instead. This is most likely due to incorrect implementation "
-        "of the model's `
+        "of the model's `embed_multimodal` method.")
 
 
 def flatten_embeddings(embeddings: NestedTensors) -> jax.Array:
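For orientation, a call that satisfies all three assertions in `sanity_check_mm_encoder_outputs` (argument names taken from the signature fragments above; shapes illustrative):

    import jax.numpy as jnp

    mm_embeddings = (jnp.zeros((16, 1024)), jnp.zeros((64, 1024)))  # one 2D tensor per image
    sanity_check_mm_encoder_outputs(mm_embeddings, expected_num_items=2)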
tpu_inference/models/jax/utils/qwix/__init__.py

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tpu_inference/models/jax/utils/qwix/qwix_utils.py

@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import copy
 import functools
 import os
 from typing import TYPE_CHECKING, Callable, List
@@ -34,17 +35,43 @@ DEFAULT_NUM_TOKENS_FOR_MODEL_INPUTS = 512
 DEFAULT_MAX_NUM_SEQS_FOR_MODEL_INPUTS = 256
 DEFAULT_MAX_NUM_BLOCKS_PER_REQ = 16
 
-
+DEFAULT_DEEPSEEK_FP4_MLP_MOE_FP8_ATTN_CONFIG = {
     "qwix": {
         "use_abstract_model":
         True,
         "scale_dtype":
         "bfloat16",
         "rules": [
+            # Exclude router from quantization
             {
                 "module_path": ".*.custom_module.router.*",
                 "weight_qtype": None,
             },
+            # Avoid the combine expert ops
+            {
+                "module_path": ".*combine_experts.*",
+                "weight_qtype": None,
+            },
+            # Attention layers: keep FP8 for weights and activations
+            {
+                "module_path": ".*.attn.*",
+                "weight_qtype": "float8_e4m3fn",
+                "act_qtype": "float8_e4m3fn",
+            },
+            # MoE experts: use FP4 for expert weights
+            {
+                "module_path": ".*.custom_module.*",
+                "weight_qtype": "float4_e2m1fn",
+                "act_qtype": "float8_e4m3fn",
+                "tile_size": 256,
+            },
+            # Shared experts: also FP4
+            {
+                "module_path": ".*.shared_experts.*",
+                "weight_qtype": "float4_e2m1fn",
+                "act_qtype": "float8_e4m3fn",
+                "tile_size": 256,
+            },
             {
                 "module_path": ".*",
                 "weight_qtype": "float8_e4m3fn",
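The rule list above is order-sensitive: the router and `combine_experts` exclusions must precede the catch-all `.*` FP8 rule or they would never apply. A sketch of first-match resolution, assuming Qwix walks `module_path` regexes in order (semantics assumed, not shown in this diff):

    import re

    def first_matching_rule(rules, module_path):
        # Assumption: the first rule whose module_path regex matches wins.
        for rule in rules:
            if re.fullmatch(rule["module_path"], module_path):
                return rule
        return None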
@@ -154,12 +181,9 @@ def qwix_quantize_nnx_model(model: nnx.Module, qwix_config: List[dict],
     logger.info(f"Memory usage before applying quantization of params: "
                 f"hbm={utils.hbm_usage_gb(jax.local_devices())}Gb")
 
-
-
-
-    # Handle the case where kv_cache_dtype is "auto"
-    if kv_cache_jnp_dtype is None:
-        assert kv_cache_dtype == "auto", "kv_cache_dtype must be 'auto' if kv_cache_jnp_dtype is None"
+    if kv_cache_dtype != "auto":
+        kv_cache_jnp_dtype = utils.to_jax_dtype(kv_cache_dtype)
+    else:
         kv_cache_jnp_dtype = DEFAULT_KV_CACHE_DTYPE
 
     kv_caches = create_kv_caches(
@@ -169,9 +193,11 @@ def qwix_quantize_nnx_model(model: nnx.Module, qwix_config: List[dict],
         head_size=kv_cache_head_size,
         mesh=mesh,
         layer_names=[f"layer.{i}" for i in range(num_hidden_layers)],
-        cache_dtype=kv_cache_jnp_dtype
+        cache_dtype=kv_cache_jnp_dtype,
+        use_mla=model.vllm_config.model_config.use_mla,
+    )
 
-    dp_size =
+    dp_size = model.vllm_config.sharding_config.total_dp_size
 
     # NOTE: the inputs don't need to match the actual ones, as long as the consumed weights are the same
     input_ids = jax.random.randint(rng,
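The rewritten branch above resolves `kv_cache_dtype` strings eagerly: anything other than "auto" goes through `utils.to_jax_dtype`, and "auto" keeps the package default. A sketch with an assumed name mapping (the real mapping lives in `tpu_inference/utils.py` and is not shown in this diff):

    import jax.numpy as jnp

    DEFAULT_KV_CACHE_DTYPE = jnp.bfloat16  # illustrative default

    def resolve_kv_cache_dtype(kv_cache_dtype: str):
        mapping = {"fp8_e4m3": jnp.float8_e4m3fn, "bfloat16": jnp.bfloat16}  # assumed names
        if kv_cache_dtype != "auto":
            return mapping[kv_cache_dtype]
        return DEFAULT_KV_CACHE_DTYPE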
@@ -399,8 +425,7 @@ def apply_qwix_on_abstract_model(vllm_config: "VllmConfig") -> bool:
 
 
 def get_default_qwix_quantization_config(
-
-        skip_quantization: bool) -> dict | None:
+        hf_config: dict, skip_quantization: bool) -> dict | None:
     """
     Some models are pre-quantized and in those cases, we want to return a default set of
     Qwix quantization rules (instead of forcing the user to pass in a quantization config each time).
@@ -418,9 +443,42 @@ def get_default_qwix_quantization_config(
     """
     if skip_quantization:
         return None
-
+    model_type = hf_config.model_type.lower() if hasattr(
+        hf_config, "model_type") else None
+    quant_method = hf_config.quantization_config["quant_method"] if hasattr(
+        hf_config, "quantization_config") else None
+    # TODO (jacobplatin): remove this so that we can support various quantization types + make
+    # more flexible
+    # NOTE (jacobplatin): we'll default to mixed FP8 (attention) + FP4 (MoE experts)
+    # for DeepSeek
     if model_type == "deepseek_v3" and quant_method == "fp8":
-
+        config = copy.deepcopy(DEFAULT_DEEPSEEK_FP4_MLP_MOE_FP8_ATTN_CONFIG)
+
+        # Dynamically fetch block size from HF config if available
+        # Config fmt: 'weight_block_size': [1, 512] -> we want the 2nd dim for tile_size
+        # NOTE: if the checkpoint is not 1D subchannel, we will throw an error
+        hf_quant_config = hf_config.quantization_config
+        assert "weight_block_size" in hf_quant_config, "Expected weight_block_size in quantization_config"
+        block_size = hf_quant_config["weight_block_size"]
+        if isinstance(block_size, (list, tuple)) and len(block_size) == 2:
+            assert block_size[
+                0] == 1, f"Expected first dimension to be 1 (unchanneled), but got {block_size[0]}! If you are trying to run quantized DeepSeek, we currently only support 1D-subchannel quantization and those models can be found here: https://huggingface.co/collections/jrplatin/deepseek-r1-1d-subchannel"
+            tile_size = block_size[1]
+            assert tile_size > 1, f"Expected tile_size > 1 for DeepSeek, but got {tile_size}"
+            logger.info(
+                f"Detected DeepSeek tile_size from config: {tile_size}")
+
+            # Update tile_size in the rules, since we might not always use a 1D subchannel size of
+            # 256
+            for rule in config["qwix"]["rules"]:
+                if "tile_size" in rule:
+                    rule["tile_size"] = tile_size
+        else:
+            raise ValueError(
+                f"Invalid weight_block_size config: {block_size}, expected a list/tuple of length 2"
+            )
+
+        return config
     elif model_type == "llama4" and quant_method == "compressed-tensors":
         return DEFAULT_LLAMA4_FP8_CONFIG
     # MXFP4 (GPT-OSS): provide a default configuration to quantize MoE experts via Qwix
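A worked example of the `weight_block_size` handling added above: a checkpoint declaring `[1, 512]` passes the 1D-subchannel check and overrides the default `tile_size` of 256 in the FP4 rules:

    hf_quant_config = {"weight_block_size": [1, 512]}
    block_size = hf_quant_config["weight_block_size"]
    assert block_size[0] == 1   # must be 1D subchannel
    tile_size = block_size[1]   # -> 512, written into every rule that has a tile_size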
@@ -439,14 +497,10 @@ def update_vllm_config_for_qwix_quantization(vllm_config: "VllmConfig"):
     # Qwix quantization config accordingly
     # NOTE: if a Qwix config is provided (via the`additional_config`), we'll
     # use that instead
-
-    ) if hasattr(vllm_config.model_config.hf_config, "model_type") else None
-    quant_method = vllm_config.model_config.hf_config.quantization_config[
-        "quant_method"] if hasattr(vllm_config.model_config.hf_config,
-                                   "quantization_config") else None
+    hf_config = vllm_config.model_config.hf_config
     default_quantization_config = get_default_qwix_quantization_config(
-
-
+        hf_config, vllm_config.additional_config.get("skip_quantization",
+                                                     False))
 
     maybe_existing_quantization_config = vllm_config.additional_config.get(
         "quantization")
@@ -503,7 +557,14 @@ def get_random_sharded_array(key: PRNGKey, mesh: Mesh, param: nnx.Param,
         maxval = jnp.array(jnp.iinfo(dtype).max, dtype=dtype)
         weight = jax.random.randint(key, param_shape, minval, maxval, dtype)
     else:
-
+        # NOTE: _uniform() in random.py does not accept float4_e2m1fn
+        # Error: "TypeError: uniform only accepts 8-, 16-, 32-, or 64-bit dtypesgot float4_e2m1fn."
+        # Workaround: call function with dtype jnp.float8_e4m3fn and cast back to float4_e2m1fn
+        if dtype != "float4_e2m1fn":
+            weight = jax.random.normal(key, param_shape, dtype)
+        else:
+            weight = jax.random.normal(key, param_shape,
+                                       jnp.float8_e4m3fn).astype(dtype)
 
     def get_slice(index):
         return weight[index]
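The float4 workaround above can be reproduced standalone, assuming a JAX build whose `jax.random.normal` accepts `float8_e4m3fn` and that exposes `jnp.float4_e2m1fn` (the packaged code relies on both):

    import jax
    import jax.numpy as jnp

    key = jax.random.key(0)
    # Sample in float8, then downcast: float4_e2m1fn is rejected directly.
    w4 = jax.random.normal(key, (8, 8), jnp.float8_e4m3fn).astype(jnp.float4_e2m1fn)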
@@ -538,18 +599,16 @@ def load_random_weights_into_qwix_abstract_model(rng: PRNGKey,
     logger.info("Initializing Qwix-quantized model with random weights...")
     # TODO (jacobplatin): clean up this logic
     scale_dtype = model.weight_loader.scale_dtype
-    scale_shape_map = model.weight_loader.
+    scale_shape_map = model.weight_loader.scale_shape_map_for_random_weight_loading if hasattr(
         model.weight_loader,
-        '
+        'scale_shape_map_for_random_weight_loading') else {}
     quantization_block_sizes = quantization_config["weight_block_size"]
     assert len(
         quantization_block_sizes
     ) == 2, f"Expected only 2 quantization block sizes but got {quantization_block_sizes}"
-    quantization_block_size_n, _ = quantization_block_sizes[
-        0], quantization_block_sizes[1]
 
     # Iterate through all variables and initialize them
-
+
     for path, param in nnx.iter_graph(model):
         if not isinstance(param, nnx.Variable):
             continue
@@ -559,16 +618,17 @@ def load_random_weights_into_qwix_abstract_model(rng: PRNGKey,
         is_qwix_scale = (path[-1] == 'scale' and path[-2] == "array")
         param_dtype = scale_dtype if is_qwix_scale else param.value.dtype
         param_shape = param.value.shape
-        # TODO (jacobplatin): clean this up
         if is_qwix_scale:
-
-
-
-
+            key = f"{path[2]}.{path[3]}"
+
+            if key in scale_shape_map:
+                param_shape = scale_shape_map[key]
+            else:
+                raise ValueError(
+                    f"Scale shape for {key} not found in scale_shape_map.")
         param.value = get_random_sharded_array(
             rng, mesh, param, param_shape, param_dtype,
             ".".join([str(x) for x in path]))
-        prev_param_shape = param_shape
 
         # Handles the DeepSeek case, where this needs to be called to make the cache weights
         # concrete