tpu-inference 0.11.1.dev202511220812__py3-none-any.whl → 0.12.0.dev20251213__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (59)
  1. tests/kernels/fused_moe_v1_test.py +303 -34
  2. tests/kernels/mla_v1_test.py +129 -41
  3. tests/kernels/quantized_matmul_kernel_test.py +2 -34
  4. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +3 -1
  5. tests/kernels/ragged_paged_attention_kernel_v3_test.py +3 -1
  6. tests/lora/test_layers.py +4 -1
  7. tests/lora/test_lora_perf.py +53 -0
  8. tests/test_envs.py +110 -12
  9. tests/test_quantization.py +3 -0
  10. tests/test_utils.py +1 -2
  11. tpu_inference/distributed/tpu_connector.py +1 -1
  12. tpu_inference/envs.py +92 -8
  13. tpu_inference/executors/ray_distributed_executor.py +5 -1
  14. tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
  15. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
  16. tpu_inference/kernels/fused_moe/v1/kernel.py +712 -143
  17. tpu_inference/kernels/mla/v1/kernel.py +98 -120
  18. tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
  19. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
  20. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
  21. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +82 -32
  22. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +146 -85
  23. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +2 -1
  24. tpu_inference/kernels/ragged_paged_attention/v3/util.py +2 -1
  25. tpu_inference/layers/common/attention_interface.py +7 -1
  26. tpu_inference/layers/common/sharding.py +11 -7
  27. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +232 -64
  28. tpu_inference/layers/jax/attention/gpt_oss_attention.py +5 -5
  29. tpu_inference/layers/vllm/fused_moe.py +170 -208
  30. tpu_inference/layers/vllm/linear_common.py +43 -21
  31. tpu_inference/layers/vllm/quantization/common.py +11 -6
  32. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +4 -3
  33. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +74 -65
  34. tpu_inference/layers/vllm/quantization/mxfp4.py +140 -94
  35. tpu_inference/layers/vllm/quantization/unquantized.py +103 -80
  36. tpu_inference/models/common/model_loader.py +78 -22
  37. tpu_inference/models/jax/deepseek_v3.py +185 -64
  38. tpu_inference/models/jax/gpt_oss.py +3 -3
  39. tpu_inference/models/jax/llama_eagle3.py +4 -5
  40. tpu_inference/models/jax/qwen2_5_vl.py +161 -47
  41. tpu_inference/models/jax/utils/quantization/quantization_utils.py +7 -8
  42. tpu_inference/models/jax/utils/weight_utils.py +203 -155
  43. tpu_inference/models/vllm/vllm_model_wrapper.py +11 -5
  44. tpu_inference/platforms/tpu_platform.py +29 -48
  45. tpu_inference/runner/compilation_manager.py +112 -46
  46. tpu_inference/runner/kv_cache.py +40 -20
  47. tpu_inference/runner/kv_cache_manager.py +40 -31
  48. tpu_inference/runner/persistent_batch_manager.py +40 -2
  49. tpu_inference/runner/structured_decoding_manager.py +2 -3
  50. tpu_inference/runner/tpu_runner.py +94 -51
  51. tpu_inference/runner/utils.py +2 -2
  52. tpu_inference/spec_decode/jax/eagle3.py +71 -22
  53. tpu_inference/utils.py +41 -14
  54. tpu_inference/worker/tpu_worker.py +43 -45
  55. {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/METADATA +8 -9
  56. {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/RECORD +59 -58
  57. {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/WHEEL +0 -0
  58. {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/licenses/LICENSE +0 -0
  59. {tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.12.0.dev20251213.dist-info}/top_level.txt +0 -0
tpu_inference/models/jax/qwen2_5_vl.py

@@ -486,6 +486,11 @@ class Qwen2_5_VisionTransformer(nnx.Module):
             dtype=dtype,
             rngs=rngs)
 
+        additional_config = getattr(vllm_config, "additional_config",
+                                    None) or {}
+        self.enable_dynamic_image_sizes = additional_config.get(
+            "enable_dynamic_image_sizes", False)
+
     def rotary_pos_emb_thw(self, t, h, w):
         hpos_ids, wpos_ids = jnp.indices((h, w))
         hpos_ids = hpos_ids.reshape(
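The new flag is read from vLLM's additional_config dict and defaults to False. A minimal sketch of switching it on from user code, assuming vLLM's standard LLM/EngineArgs plumbing for additional_config (only the key name comes from this diff):

# Hedged sketch: enabling the dynamic-image-size path. LLM accepting
# additional_config is an assumption about the vLLM API surface; the
# "enable_dynamic_image_sizes" key is the one read in the hunk above.
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2.5-VL-7B-Instruct",  # hypothetical model choice
    additional_config={"enable_dynamic_image_sizes": True},
)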
@@ -579,21 +584,7 @@ class Qwen2_5_VisionTransformer(nnx.Module):
         seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
         return max_seqlen, seqlens
 
-    def __call__(self, x: jax.Array, grid_thw: tuple[tuple[int, int,
-                                                           int]]) -> jax.Array:
-        # x: pixel_values: jax.Array
-        # """Shape:
-        # `(num_patches, num_channels * patch_size * patch_size)`
-        # """
-
-        # grid_thw: image_grid_thw: jax.Array
-        # """Shape: `(num_images, 3)`
-        # This should be in `(grid_t, grid_h, grid_w)` format.
-        # """
-        hidden_states = self.patch_embed(x)
-
-        # num of patches
-        seq_len = x.shape[0]
+    def compute_aux_arrays(self, grid_thw: tuple[tuple[int, int, int]]):
         # num of images/videoes
         num_grids = len(grid_thw)
 
@@ -638,6 +629,42 @@ class Qwen2_5_VisionTransformer(nnx.Module):
         cu_seqlens = jnp.pad(cu_seqlens, ((1, 0), ),
                              mode='constant',
                              constant_values=0)
+        return window_index, rotary_pos_emb, cu_seqlens, cu_window_seqlens
+
+    def pad_inputs(self, x, window_index, rotary_pos_emb, cu_seqlens,
+                   cu_window_seqlens):
+        # padding
+        num_patches = int(rotary_pos_emb.shape[0])
+        bucket_num_patches = 1 << (num_patches - 1).bit_length()
+        num_tokens = window_index.shape[0]
+        bucket_num_tokens = bucket_num_patches // self.spatial_merge_unit
+        vit_merger_window_size = (self.window_size //
+                                  self.spatial_merge_size // self.patch_size)
+        max_windows = (bucket_num_tokens // vit_merger_window_size) + 2
+
+        rotary_pos_emb = jnp.pad(rotary_pos_emb,
+                                 ((0, bucket_num_patches - num_patches),
+                                  (0, 0)))
+        window_index = jnp.concatenate([
+            window_index,
+            jnp.arange(num_tokens, bucket_num_tokens, dtype=jnp.int32)
+        ])
+        cu_window_seqlens = jnp.append(cu_window_seqlens, bucket_num_patches)
+        pad_w = max(0, max_windows + 1 - cu_window_seqlens.shape[0])
+        cu_window_seqlens = jnp.pad(cu_window_seqlens, (0, pad_w), mode='edge')
+        cu_seqlens = jnp.append(cu_seqlens, bucket_num_patches)
+
+        x_padded = jnp.pad(x, ((0, bucket_num_patches - x.shape[0]), (0, 0)))
+
+        return x_padded, window_index, rotary_pos_emb, cu_seqlens, cu_window_seqlens, num_tokens
+
+    def compute_hidden_states(self, x: jax.Array, window_index: jax.Array,
+                              rotary_pos_emb: jax.Array, cu_seqlens: jax.Array,
+                              cu_window_seqlens: jax.Array) -> jax.Array:
+        hidden_states = self.patch_embed(x)
+
+        # num of patches
+        seq_len = x.shape[0]
 
         hidden_states = hidden_states.reshape(
             seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
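pad_inputs rounds the patch count up to the next power of two before any jitted work, so XLA sees at most O(log N) distinct shapes rather than one compilation per image size. A standalone sketch of the bucketing rule used above:

# Power-of-two bucketing, as in pad_inputs above (pure-Python sketch).
def bucket_size(num_patches: int) -> int:
    # (n - 1).bit_length() is ceil(log2(n)) for n >= 1, so this rounds
    # n up to the next power of two; exact powers map to themselves.
    return 1 << (num_patches - 1).bit_length()

assert bucket_size(1000) == 1024
assert bucket_size(1024) == 1024
assert bucket_size(1025) == 2048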
@@ -664,6 +691,48 @@ class Qwen2_5_VisionTransformer(nnx.Module):
         hidden_states = hidden_states[reverse_indices, :]
         return hidden_states
 
+    @jax.jit
+    def encode_padded_jit(self, x_padded, window_index, rotary_pos_emb,
+                          cu_seqlens, cu_window_seqlens):
+        return self.compute_hidden_states(x_padded, window_index,
+                                          rotary_pos_emb, cu_seqlens,
+                                          cu_window_seqlens)
+
+    @partial(
+        jax.jit,
+        static_argnames=("grid_thw", ),
+    )
+    def encode_jit(self, x, grid_thw):
+        window_index, rotary_pos_emb, cu_seqlens, cu_window_seqlens = self.compute_aux_arrays(
+            grid_thw)
+        return self.compute_hidden_states(x, window_index, rotary_pos_emb,
+                                          cu_seqlens, cu_window_seqlens)
+
+    def __call__(self, x: jax.Array, grid_thw: tuple[tuple[int, int,
+                                                           int]]) -> jax.Array:
+        # x: pixel_values: jax.Array
+        # """Shape:
+        # `(num_patches, num_channels * patch_size * patch_size)`
+        # """
+
+        # grid_thw: image_grid_thw: jax.Array
+        # """Shape: `(num_images, 3)`
+        # This should be in `(grid_t, grid_h, grid_w)` format.
+        # """
+        if self.enable_dynamic_image_sizes:
+            window_index, rotary_pos_emb, cu_seqlens, cu_window_seqlens = self.compute_aux_arrays(
+                grid_thw)
+            x_padded, window_index, rotary_pos_emb, cu_seqlens, cu_window_seqlens, num_tokens = self.pad_inputs(
+                x, window_index, rotary_pos_emb, cu_seqlens, cu_window_seqlens)
+
+            hidden_states = self.encode_padded_jit(x_padded, window_index,
+                                                   rotary_pos_emb, cu_seqlens,
+                                                   cu_window_seqlens)
+            return hidden_states[:num_tokens]
+
+        else:
+            return self.encode_jit(x, grid_thw)
+
 
 class Qwen2_5_VLForConditionalGeneration(nnx.Module):
 
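The two entry points trade compilation count against padding overhead: encode_jit marks grid_thw static, so every new grid shape triggers a fresh XLA compilation, while encode_padded_jit only ever sees bucketed shapes. A toy illustration of that difference, independent of the model code:

# Toy contrast (not from the diff) between static-argument jitting and
# fixed padded shapes.
from functools import partial

import jax
import jax.numpy as jnp

@partial(jax.jit, static_argnames=("n", ))
def head_static(x, n):
    return x[:n]  # recompiled for every distinct value of n

@jax.jit
def head_padded(x, mask):
    return jnp.where(mask, x, 0)  # one compilation per padded x.shape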
@@ -888,10 +957,6 @@ class Qwen2_5_VLForConditionalGeneration(nnx.Module):
         #     "video"] = self._parse_and_validate_video_input(**kwargs)
         return mm_input_by_modality
 
-    @partial(
-        jax.jit,
-        static_argnames=("image_grid_thw", ),
-    )
     def get_single_image_embedding(self, image_pixel_values, image_grid_thw):
         return self.visual(image_pixel_values, (image_grid_thw, ))
 
@@ -1072,33 +1137,82 @@ class Qwen2_5_VLForConditionalGeneration(nnx.Module):
         self,
         run_compilation_fn: Callable,
     ) -> None:
-        image_shapes = []
-        if (warmup_config := self.vllm_config.additional_config.get(
-                "vision_warmup_config")):
-            image_shapes = warmup_config.get("image_shapes")
-
         vc = self.vllm_config.model_config.hf_config.vision_config
-        factor = vc.patch_size * vc.spatial_merge_size
-        for input_hw in image_shapes:
-            if not isinstance(input_hw, list) or len(input_hw) != 2:
-                logger.warning(f"Skipping invalid shape {input_hw}.")
-                continue
-            h_input, w_input = input_hw
-            h_processed = round(h_input / factor) * factor
-            w_processed = round(w_input / factor) * factor
-            t, h, w = 1, h_processed // vc.patch_size, w_processed // vc.patch_size
-            grid_thw = (t, h, w)
-            num_patches = t * h * w
-            patch_input_dim = vc.in_channels * vc.temporal_patch_size * vc.patch_size * vc.patch_size
-
-            dummy_pixel_values = jnp.ones(
-                (num_patches, patch_input_dim),
-                self.vllm_config.model_config.dtype,
-            )
-            dummy_grid_thw = grid_thw
+        patch_input_dim = vc.in_channels * vc.temporal_patch_size * vc.patch_size * vc.patch_size
+        if self.visual.enable_dynamic_image_sizes:
+            spatial_merge_unit = vc.spatial_merge_size**2
+            max_num_batched_tokens = self.vllm_config.scheduler_config.max_num_batched_tokens
+            mm_kwargs = self.vllm_config.model_config.multimodal_config.mm_processor_kwargs or {}
+            limit_pixels = float(mm_kwargs.get("max_pixels", float('inf')))
+
+            max_patches = int(
+                min(max_num_batched_tokens * spatial_merge_unit,
+                    limit_pixels / (vc.patch_size**2)))
+
+            num_patches_paddings = [
+                1 << i for i in range(4, (max_patches - 1).bit_length() + 1)
+            ]
+            rotary_dim = vc.hidden_size // vc.num_heads // 2
+            vit_merger_window_size = (vc.window_size //
+                                      vc.spatial_merge_size // vc.patch_size)
+
+            for num_patches in num_patches_paddings:
+                dummy_x_padded = jnp.ones(
+                    (num_patches, patch_input_dim),
+                    dtype=self.vllm_config.model_config.dtype)
+
+                num_tokens = num_patches // spatial_merge_unit
+                dummy_window_index = jnp.arange(num_tokens, dtype=jnp.int32)
+
+                dummy_rotary_pos_emb = jnp.ones(
+                    (num_patches, rotary_dim),
+                    dtype=self.vllm_config.model_config.dtype)
+
+                dummy_cu_seqlens = jnp.array([0, num_patches, num_patches],
+                                             dtype=jnp.int32)
+
+                max_windows = (num_tokens // vit_merger_window_size) + 2
+                patches_per_window = (vit_merger_window_size**
+                                      2) * spatial_merge_unit
+                dummy_cu_window_seqlens = jnp.arange(
+                    max_windows + 1, dtype=jnp.int32) * patches_per_window
+                dummy_cu_window_seqlens = jnp.minimum(dummy_cu_window_seqlens,
+                                                      num_patches)
+
+                run_compilation_fn("vision_encoder_padded",
+                                   self.visual.encode_padded_jit,
+                                   dummy_x_padded,
+                                   dummy_window_index,
+                                   dummy_rotary_pos_emb,
+                                   dummy_cu_seqlens,
+                                   dummy_cu_window_seqlens,
+                                   num_patches=num_patches)
+        else:
+            image_shapes = []
+            if (warmup_config := self.vllm_config.additional_config.get(
+                    "vision_warmup_config")):
+                image_shapes = warmup_config.get("image_shapes")
+
+            factor = vc.patch_size * vc.spatial_merge_size
+            for input_hw in image_shapes:
+                if not isinstance(input_hw, list) or len(input_hw) != 2:
+                    logger.warning(f"Skipping invalid shape {input_hw}.")
+                    continue
+                h_input, w_input = input_hw
+                h_processed = round(h_input / factor) * factor
+                w_processed = round(w_input / factor) * factor
+                t, h, w = 1, h_processed // vc.patch_size, w_processed // vc.patch_size
+                grid_thw = (t, h, w)
+                num_patches = t * h * w
+
+                dummy_pixel_values = jnp.ones(
+                    (num_patches, patch_input_dim),
+                    self.vllm_config.model_config.dtype,
+                )
+                dummy_grid_thw = (grid_thw, )
 
-            run_compilation_fn("single_image_encoder",
-                               self.get_single_image_embedding,
-                               dummy_pixel_values,
-                               dummy_grid_thw,
-                               image_shape=input_hw)
+                run_compilation_fn("vision_encoder",
+                                   self.visual.encode_jit,
+                                   dummy_pixel_values,
+                                   dummy_grid_thw,
+                                   image_shape=input_hw)
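With the dynamic path enabled, warmup precompiles the padded encoder once per power-of-two bucket from 16 up to the derived max_patches. A worked example under assumed config values (max_num_batched_tokens=2048, spatial_merge_size=2, no max_pixels cap):

# Worked example of num_patches_paddings above; the config values are
# assumptions, the formula is the one in the diff.
spatial_merge_unit = 2**2                   # spatial_merge_size ** 2
max_patches = 2048 * spatial_merge_unit     # 8192 (no max_pixels cap)
buckets = [1 << i for i in range(4, (max_patches - 1).bit_length() + 1)]
print(buckets)  # [16, 32, 64, ..., 4096, 8192]: 10 warmup compilations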
tpu_inference/models/jax/utils/quantization/quantization_utils.py

@@ -154,12 +154,9 @@ def qwix_quantize_nnx_model(model: nnx.Module, qwix_config: List[dict],
     logger.info(f"Memory usage before applying quantization of params: "
                 f"hbm={utils.hbm_usage_gb(jax.local_devices())}Gb")
 
-    # TODO (jacobplatin): we should refactor this to pass a dtype (or config) directly
-    kv_cache_jnp_dtype = utils.get_jax_dtype_from_str_dtype(kv_cache_dtype)
-
-    # Handle the case where kv_cache_dtype is "auto"
-    if kv_cache_jnp_dtype is None:
-        assert kv_cache_dtype == "auto", "kv_cache_dtype must be 'auto' if kv_cache_jnp_dtype is None"
+    if kv_cache_dtype != "auto":
+        kv_cache_jnp_dtype = utils.to_jax_dtype(kv_cache_dtype)
+    else:
         kv_cache_jnp_dtype = DEFAULT_KV_CACHE_DTYPE
 
     kv_caches = create_kv_caches(
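The rewrite resolves the KV-cache dtype in one branch: anything other than "auto" goes through utils.to_jax_dtype, and "auto" falls straight to DEFAULT_KV_CACHE_DTYPE instead of round-tripping through a None return plus assert. A minimal sketch of the control flow, with an assumed mapping standing in for to_jax_dtype, whose definition is not shown in this diff:

import jax.numpy as jnp

# Assumed stand-ins for utils.to_jax_dtype and the module-level default.
_STR_TO_JAX_DTYPE = {"bfloat16": jnp.bfloat16, "float32": jnp.float32}
DEFAULT_KV_CACHE_DTYPE = jnp.bfloat16

def resolve_kv_cache_dtype(kv_cache_dtype: str):
    if kv_cache_dtype != "auto":
        return _STR_TO_JAX_DTYPE[kv_cache_dtype]
    return DEFAULT_KV_CACHE_DTYPE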
@@ -169,9 +166,11 @@ def qwix_quantize_nnx_model(model: nnx.Module, qwix_config: List[dict],
         head_size=kv_cache_head_size,
         mesh=mesh,
         layer_names=[f"layer.{i}" for i in range(num_hidden_layers)],
-        cache_dtype=kv_cache_jnp_dtype)
+        cache_dtype=kv_cache_jnp_dtype,
+        use_mla=model.vllm_config.model_config.use_mla,
+    )
 
-    dp_size = mesh.shape.get("data", 1) * mesh.shape.get("attn", 1)
+    dp_size = model.vllm_config.sharding_config.total_dp_size
 
     # NOTE: the inputs don't need to match the actual ones, as long as the consumed weights are the same
     input_ids = jax.random.randint(rng,
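dp_size now comes from a single authoritative field on the sharding config rather than being re-derived from mesh axes. A toy contrast under an assumed mesh shape; only the old formula and the new attribute name come from the diff:

# Assumed mesh axes for illustration.
mesh_shape = {"data": 2, "attn": 2, "model": 4}
old_dp_size = mesh_shape.get("data", 1) * mesh_shape.get("attn", 1)  # 4
# New: dp_size = model.vllm_config.sharding_config.total_dp_size,
# which is expected to match the mesh-derived product.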