tpu-inference 0.11.1.dev202511150811__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.

Potentially problematic release: this version of tpu-inference might be problematic.

Files changed (179)
  1. tests/__init__.py +0 -0
  2. tests/core/__init__.py +0 -0
  3. tests/core/test_core_tpu.py +513 -0
  4. tests/core/test_disagg_executor.py +60 -0
  5. tests/core/test_disagg_utils.py +53 -0
  6. tests/core/test_dp_scheduler.py +899 -0
  7. tests/core/test_init.py +49 -0
  8. tests/kernels/__init__.py +0 -0
  9. tests/kernels/fused_moe_v1_test.py +105 -0
  10. tests/kernels/mla_v1_test.py +396 -0
  11. tests/kernels/quantized_matmul_kernel_test.py +191 -0
  12. tests/kernels/ragged_kv_cache_update_v2_test.py +234 -0
  13. tests/kernels/ragged_paged_attention_kernel_v2_test.py +400 -0
  14. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +549 -0
  15. tests/kernels/ragged_paged_attention_kernel_v3_test.py +504 -0
  16. tests/lora/__init__.py +0 -0
  17. tests/lora/conftest.py +32 -0
  18. tests/lora/test_bgmv.py +43 -0
  19. tests/lora/test_layers.py +654 -0
  20. tests/lora/test_lora.py +133 -0
  21. tests/lora/utils.py +96 -0
  22. tests/test_base.py +201 -0
  23. tests/test_envs.py +182 -0
  24. tests/test_quantization.py +836 -0
  25. tests/test_tpu_info.py +120 -0
  26. tests/test_utils.py +236 -0
  27. tpu_inference/__init__.py +34 -0
  28. tpu_inference/core/__init__.py +0 -0
  29. tpu_inference/core/core_tpu.py +786 -0
  30. tpu_inference/core/disagg_executor.py +118 -0
  31. tpu_inference/core/disagg_utils.py +51 -0
  32. tpu_inference/core/sched/__init__.py +0 -0
  33. tpu_inference/core/sched/dp_scheduler.py +523 -0
  34. tpu_inference/distributed/__init__.py +0 -0
  35. tpu_inference/distributed/jax_parallel_state.py +67 -0
  36. tpu_inference/distributed/tpu_connector.py +728 -0
  37. tpu_inference/distributed/utils.py +59 -0
  38. tpu_inference/env_override.py +9 -0
  39. tpu_inference/envs.py +107 -0
  40. tpu_inference/executors/__init__.py +0 -0
  41. tpu_inference/executors/ray_distributed_executor.py +362 -0
  42. tpu_inference/experimental/__init__.py +0 -0
  43. tpu_inference/experimental/llama3_jax_stashed.py +258 -0
  44. tpu_inference/kernels/__init__.py +0 -0
  45. tpu_inference/kernels/collectives/__init__.py +0 -0
  46. tpu_inference/kernels/collectives/all_gather_matmul.py +735 -0
  47. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +60 -0
  48. tpu_inference/kernels/collectives/util.py +47 -0
  49. tpu_inference/kernels/flash_attention/__init__.py +0 -0
  50. tpu_inference/kernels/flash_attention/kernel.py +772 -0
  51. tpu_inference/kernels/fused_moe/__init__.py +0 -0
  52. tpu_inference/kernels/fused_moe/v1/__init__.py +0 -0
  53. tpu_inference/kernels/fused_moe/v1/kernel.py +1035 -0
  54. tpu_inference/kernels/mla/__init__.py +0 -0
  55. tpu_inference/kernels/mla/v1/__init__.py +0 -0
  56. tpu_inference/kernels/mla/v1/kernel.py +1349 -0
  57. tpu_inference/kernels/quantized_matmul/__init__.py +0 -0
  58. tpu_inference/kernels/quantized_matmul/kernel.py +395 -0
  59. tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
  60. tpu_inference/kernels/quantized_matmul/util.py +58 -0
  61. tpu_inference/kernels/ragged_paged_attention/__init__.py +0 -0
  62. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +0 -0
  63. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +875 -0
  64. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +287 -0
  65. tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
  66. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +0 -0
  67. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1478 -0
  68. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +1482 -0
  69. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +4147 -0
  70. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +367 -0
  71. tpu_inference/kernels/ragged_paged_attention/v3/util.py +51 -0
  72. tpu_inference/layers/__init__.py +0 -0
  73. tpu_inference/layers/common/__init__.py +0 -0
  74. tpu_inference/layers/common/attention_interface.py +390 -0
  75. tpu_inference/layers/common/attention_metadata.py +34 -0
  76. tpu_inference/layers/common/binary_search.py +295 -0
  77. tpu_inference/layers/common/quant_methods.py +8 -0
  78. tpu_inference/layers/common/sharding.py +582 -0
  79. tpu_inference/layers/jax/__init__.py +0 -0
  80. tpu_inference/layers/jax/attention/__init__.py +0 -0
  81. tpu_inference/layers/jax/attention/attention.py +255 -0
  82. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +354 -0
  83. tpu_inference/layers/jax/attention/gpt_oss_attention.py +262 -0
  84. tpu_inference/layers/jax/attention/llama4_attention.py +153 -0
  85. tpu_inference/layers/jax/base.py +151 -0
  86. tpu_inference/layers/jax/constants.py +88 -0
  87. tpu_inference/layers/jax/layers.py +301 -0
  88. tpu_inference/layers/jax/misc.py +16 -0
  89. tpu_inference/layers/jax/moe/__init__.py +0 -0
  90. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +608 -0
  91. tpu_inference/layers/jax/moe/gpt_oss_moe.py +185 -0
  92. tpu_inference/layers/jax/moe/moe.py +209 -0
  93. tpu_inference/layers/jax/rope.py +280 -0
  94. tpu_inference/layers/jax/rope_interface.py +214 -0
  95. tpu_inference/layers/jax/sample/__init__.py +0 -0
  96. tpu_inference/layers/jax/sample/rejection_sampler.py +515 -0
  97. tpu_inference/layers/jax/sample/sampling.py +96 -0
  98. tpu_inference/layers/jax/sample/sampling_metadata.py +76 -0
  99. tpu_inference/layers/jax/transformer_block.py +107 -0
  100. tpu_inference/layers/vllm/__init__.py +0 -0
  101. tpu_inference/layers/vllm/attention.py +221 -0
  102. tpu_inference/layers/vllm/fused_moe.py +507 -0
  103. tpu_inference/layers/vllm/linear_common.py +186 -0
  104. tpu_inference/layers/vllm/quantization/__init__.py +39 -0
  105. tpu_inference/layers/vllm/quantization/awq.py +207 -0
  106. tpu_inference/layers/vllm/quantization/common.py +105 -0
  107. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +0 -0
  108. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +120 -0
  109. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +203 -0
  110. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +0 -0
  111. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +208 -0
  112. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +136 -0
  113. tpu_inference/layers/vllm/quantization/mxfp4.py +266 -0
  114. tpu_inference/layers/vllm/quantization/unquantized.py +386 -0
  115. tpu_inference/layers/vllm/sharding.py +230 -0
  116. tpu_inference/logger.py +10 -0
  117. tpu_inference/lora/__init__.py +0 -0
  118. tpu_inference/lora/torch_lora_ops.py +103 -0
  119. tpu_inference/lora/torch_punica_tpu.py +311 -0
  120. tpu_inference/mock/__init__.py +0 -0
  121. tpu_inference/mock/vllm_config_utils.py +28 -0
  122. tpu_inference/mock/vllm_envs.py +1219 -0
  123. tpu_inference/mock/vllm_logger.py +212 -0
  124. tpu_inference/mock/vllm_logging_utils.py +15 -0
  125. tpu_inference/models/__init__.py +0 -0
  126. tpu_inference/models/common/__init__.py +0 -0
  127. tpu_inference/models/common/model_loader.py +444 -0
  128. tpu_inference/models/jax/__init__.py +0 -0
  129. tpu_inference/models/jax/deepseek_v3.py +868 -0
  130. tpu_inference/models/jax/gpt_oss.py +492 -0
  131. tpu_inference/models/jax/jax_intermediate_tensor.py +79 -0
  132. tpu_inference/models/jax/llama3.py +375 -0
  133. tpu_inference/models/jax/llama4.py +629 -0
  134. tpu_inference/models/jax/llama_eagle3.py +333 -0
  135. tpu_inference/models/jax/phi3.py +376 -0
  136. tpu_inference/models/jax/qwen2.py +375 -0
  137. tpu_inference/models/jax/qwen2_5_vl.py +1103 -0
  138. tpu_inference/models/jax/qwen3.py +302 -0
  139. tpu_inference/models/jax/utils/__init__.py +0 -0
  140. tpu_inference/models/jax/utils/file_utils.py +96 -0
  141. tpu_inference/models/jax/utils/multi_modal_utils.py +163 -0
  142. tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
  143. tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +5 -0
  144. tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +6 -0
  145. tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +5 -0
  146. tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +6 -0
  147. tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +105 -0
  148. tpu_inference/models/jax/utils/quantization/quantization_utils.py +653 -0
  149. tpu_inference/models/jax/utils/weight_utils.py +529 -0
  150. tpu_inference/models/vllm/__init__.py +0 -0
  151. tpu_inference/models/vllm/vllm_model_wrapper.py +286 -0
  152. tpu_inference/models/vllm/vllm_model_wrapper_context.py +45 -0
  153. tpu_inference/platforms/__init__.py +2 -0
  154. tpu_inference/platforms/tpu_platform.py +269 -0
  155. tpu_inference/runner/__init__.py +0 -0
  156. tpu_inference/runner/block_table.py +122 -0
  157. tpu_inference/runner/compilation_manager.py +780 -0
  158. tpu_inference/runner/input_batch.py +435 -0
  159. tpu_inference/runner/kv_cache.py +132 -0
  160. tpu_inference/runner/kv_cache_manager.py +479 -0
  161. tpu_inference/runner/lora_utils.py +92 -0
  162. tpu_inference/runner/multimodal_manager.py +217 -0
  163. tpu_inference/runner/persistent_batch_manager.py +244 -0
  164. tpu_inference/runner/speculative_decoding_manager.py +248 -0
  165. tpu_inference/runner/structured_decoding_manager.py +88 -0
  166. tpu_inference/runner/tpu_runner.py +1620 -0
  167. tpu_inference/runner/utils.py +426 -0
  168. tpu_inference/spec_decode/__init__.py +0 -0
  169. tpu_inference/spec_decode/jax/__init__.py +0 -0
  170. tpu_inference/spec_decode/jax/eagle3.py +367 -0
  171. tpu_inference/tpu_info.py +77 -0
  172. tpu_inference/utils.py +317 -0
  173. tpu_inference/worker/__init__.py +0 -0
  174. tpu_inference/worker/tpu_worker.py +321 -0
  175. tpu_inference-0.11.1.dev202511150811.dist-info/METADATA +107 -0
  176. tpu_inference-0.11.1.dev202511150811.dist-info/RECORD +179 -0
  177. tpu_inference-0.11.1.dev202511150811.dist-info/WHEEL +5 -0
  178. tpu_inference-0.11.1.dev202511150811.dist-info/licenses/LICENSE +201 -0
  179. tpu_inference-0.11.1.dev202511150811.dist-info/top_level.txt +2 -0
tpu_inference/models/jax/qwen3.py
@@ -0,0 +1,302 @@
+ from typing import List, Optional, Tuple
+
+ import jax
+ import jax.numpy as jnp
+ from flax import nnx
+ from jax.sharding import Mesh
+ from transformers import Qwen3Config
+ from vllm.config import VllmConfig
+
+ from tpu_inference import utils
+ from tpu_inference.layers.common.attention_interface import attention
+ from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+ from tpu_inference.layers.jax.rope_interface import apply_rope
+ from tpu_inference.logger import init_logger
+ from tpu_inference.models.jax.qwen2 import Qwen2DecoderLayer
+ from tpu_inference.models.jax.qwen2 import Qwen2MLP as Qwen3MLP
+ from tpu_inference.models.jax.qwen2 import Qwen2Model
+ from tpu_inference.models.jax.utils.weight_utils import (get_default_maps,
+                                                          load_hf_weights)
+
+ logger = init_logger(__name__)
+
+ init_fn = nnx.initializers.uniform()
+
+
+ class Qwen3Attention(nnx.Module):
+
+     def __init__(self, config: Qwen3Config, dtype: jnp.dtype, rng: nnx.Rngs,
+                  mesh: Mesh, kv_cache_dtype: str):
+         self.hidden_size = config.hidden_size
+         self.num_heads = config.num_attention_heads
+         self.num_kv_heads = config.num_key_value_heads
+         self.rope_theta = config.rope_theta
+         self.rope_scaling = getattr(config, "rope_scaling", None)
+         self.rms_norm_eps = config.rms_norm_eps
+
+         self.head_dim_original = getattr(config, "head_dim",
+                                          self.hidden_size // self.num_heads)
+         self.head_dim = utils.get_padded_head_dim(self.head_dim_original)
+
+         sharding_size = mesh.shape["model"]
+         self.num_heads = utils.get_padded_num_heads(self.num_heads,
+                                                     sharding_size)
+         self.num_kv_heads = utils.get_padded_num_heads(self.num_kv_heads,
+                                                        sharding_size)
+
+         self.mesh = mesh
+
+         self.q_proj = nnx.Einsum(
+             "TD,DNH->TNH",
+             (self.hidden_size, self.num_heads, self.head_dim),
+             param_dtype=dtype,
+             kernel_init=nnx.with_partitioning(init_fn, (None, "model", None)),
+             rngs=rng,
+         )
+         self.q_norm = nnx.RMSNorm(
+             self.head_dim,
+             epsilon=self.rms_norm_eps,
+             param_dtype=dtype,
+             scale_init=nnx.with_partitioning(init_fn, (None, )),
+             rngs=rng,
+         )
+         self.k_proj = nnx.Einsum(
+             "TD,DKH->TKH",
+             (self.hidden_size, self.num_kv_heads, self.head_dim),
+             param_dtype=dtype,
+             kernel_init=nnx.with_partitioning(init_fn, (None, "model", None)),
+             rngs=rng,
+         )
+         self.k_norm = nnx.RMSNorm(
+             self.head_dim,
+             epsilon=self.rms_norm_eps,
+             param_dtype=dtype,
+             scale_init=nnx.with_partitioning(init_fn, (None, )),
+             rngs=rng,
+         )
+         self.v_proj = nnx.Einsum(
+             "TD,DKH->TKH",
+             (self.hidden_size, self.num_kv_heads, self.head_dim),
+             param_dtype=dtype,
+             kernel_init=nnx.with_partitioning(init_fn, (None, "model", None)),
+             rngs=rng,
+         )
+         self.o_proj = nnx.Einsum(
+             "TNH,NHD->TD",
+             (self.num_heads, self.head_dim, self.hidden_size),
+             param_dtype=dtype,
+             kernel_init=nnx.with_partitioning(init_fn, ("model", None, None)),
+             rngs=rng,
+         )
+
+         self._q_scale = 1.0
+         self._k_scale = 1.0
+         self._v_scale = 1.0
+         self.kv_cache_quantized_dtype = None
+         if kv_cache_dtype != "auto":
+             self.kv_cache_quantized_dtype = utils.get_jax_dtype_from_str_dtype(
+                 kv_cache_dtype)
+
+     def __call__(
+         self,
+         kv_cache: Optional[jax.Array],
+         x: jax.Array,
+         attention_metadata: AttentionMetadata,
+     ) -> Tuple[jax.Array, jax.Array]:
+         md = attention_metadata
+         # q: (T, N, H)
+         q = self.q_proj(x)
+         q = self.q_norm(q)
+         q = apply_rope(q, md.input_positions, self.head_dim_original,
+                        self.rope_theta, self.rope_scaling)
+
+         # k: (T, K, H)
+         k = self.k_proj(x)
+         k = self.k_norm(k)
+         k = apply_rope(k, md.input_positions, self.head_dim_original,
+                        self.rope_theta, self.rope_scaling)
+
+         # v: (T, K, H)
+         v = self.v_proj(x)
+         # o: (T, N, H)
+         q_scale = k_scale = v_scale = None
+         if self.kv_cache_quantized_dtype:
+             # TODO(kyuyeunk/jacobplatin): Enable w8a8 when VREG spill issue is resolved.
+             # q_scale = self._q_scale
+             k_scale = self._k_scale
+             v_scale = self._v_scale
+             k, v = utils.quantize_kv(k, v, self.kv_cache_quantized_dtype,
+                                      k_scale, v_scale)
+         new_kv_cache, outputs = attention(
+             kv_cache,
+             q,
+             k,
+             v,
+             attention_metadata,
+             self.mesh,
+             self.head_dim_original,
+             q_scale=q_scale,
+             k_scale=k_scale,
+             v_scale=v_scale,
+         )
+         # (T, D)
+         o = self.o_proj(outputs)
+         return new_kv_cache, o
+
+
+ class Qwen3DecoderLayer(Qwen2DecoderLayer):
+
+     def __init__(self, config: Qwen3Config, dtype: jnp.dtype, rng: nnx.Rngs,
+                  mesh: Mesh, kv_cache_dtype: str):
+         rms_norm_eps = config.rms_norm_eps
+         hidden_size = config.hidden_size
+
+         self.input_layernorm = nnx.RMSNorm(
+             hidden_size,
+             epsilon=rms_norm_eps,
+             param_dtype=dtype,
+             scale_init=nnx.with_partitioning(init_fn, (None, )),
+             rngs=rng,
+         )
+         self.self_attn = Qwen3Attention(config=config,
+                                         dtype=dtype,
+                                         rng=rng,
+                                         mesh=mesh,
+                                         kv_cache_dtype=kv_cache_dtype)
+         self.post_attention_layernorm = nnx.RMSNorm(
+             hidden_size,
+             epsilon=rms_norm_eps,
+             param_dtype=dtype,
+             scale_init=nnx.with_partitioning(init_fn, (None, )),
+             rngs=rng,
+         )
+         self.mlp = Qwen3MLP(
+             config=config,
+             dtype=dtype,
+             rng=rng,
+         )
+
+
+ class Qwen3Model(Qwen2Model):
+
+     def __init__(self, vllm_config: VllmConfig, rng: nnx.Rngs,
+                  mesh: Mesh) -> None:
+         model_config = vllm_config.model_config
+         hf_config = model_config.hf_config
+         vocab_size = model_config.get_vocab_size()
+         dtype = model_config.dtype
+         rms_norm_eps = hf_config.rms_norm_eps
+         hidden_size = hf_config.hidden_size
+
+         self.embed = nnx.Embed(
+             num_embeddings=vocab_size,
+             features=hidden_size,
+             param_dtype=dtype,
+             embedding_init=nnx.with_partitioning(init_fn, ("model", None)),
+             rngs=rng,
+         )
+         self.layers = [
+             Qwen3DecoderLayer(
+                 config=hf_config,
+                 dtype=dtype,
+                 rng=rng,
+                 mesh=mesh,
+                 # TODO (jacobplatin): we should refactor this to pass a dtype (or config) directly
+                 kv_cache_dtype=vllm_config.cache_config.cache_dtype)
+             for _ in range(hf_config.num_hidden_layers)
+         ]
+         self.norm = nnx.RMSNorm(
+             hidden_size,
+             epsilon=rms_norm_eps,
+             param_dtype=dtype,
+             scale_init=nnx.with_partitioning(init_fn, (None, )),
+             rngs=rng,
+         )
+         if model_config.hf_config.tie_word_embeddings:
+             self.lm_head = self.embed.embedding
+         else:
+             self.lm_head = nnx.Param(
+                 init_fn(rng.params(), (hidden_size, vocab_size), dtype),
+                 sharding=(None, "model"),
+             )
+
+
+ class Qwen3ForCausalLM(nnx.Module):
+
+     def __init__(self, vllm_config: VllmConfig, rng_key: jax.Array,
+                  mesh: Mesh) -> None:
+         self.vllm_config = vllm_config
+         self.rng = nnx.Rngs(rng_key)
+         self.mesh = mesh
+
+         self.model = Qwen3Model(
+             vllm_config=vllm_config,
+             rng=self.rng,
+             mesh=mesh,
+         )
+
+     def __call__(
+         self,
+         kv_caches: List[jax.Array],
+         input_ids: jax.Array,
+         attention_metadata: AttentionMetadata,
+         *args,
+     ) -> Tuple[List[jax.Array], jax.Array, List[jax.Array]]:
+         kv_caches, x = self.model(
+             kv_caches,
+             input_ids,
+             attention_metadata,
+         )
+         return kv_caches, x, []
+
+     def compute_logits(self, hidden_states: jax.Array) -> jax.Array:
+         if self.vllm_config.model_config.hf_config.tie_word_embeddings:
+             logits = jnp.dot(hidden_states, self.model.lm_head.value.T)
+         else:
+             logits = jnp.dot(hidden_states, self.model.lm_head.value)
+         return logits
+
+     def load_weights(self, rng_key: jax.Array):
+         # NOTE: Since we are using nnx.eval_shape to init the model,
+         # we have to pass dynamic arrays here for __call__'s usage.
+         self.rng = nnx.Rngs(rng_key)
+
+         # Key: path to a HF layer weight
+         # Value: path to an nnx layer weight
+         mappings = {
+             "model.embed_tokens": "model.embed.embedding",
+             "model.layers.*.input_layernorm":
+             "model.layers.*.input_layernorm.scale",
+             "model.layers.*.mlp.down_proj":
+             "model.layers.*.mlp.down_proj.kernel",
+             "model.layers.*.mlp.gate_proj":
+             "model.layers.*.mlp.gate_proj.kernel",
+             "model.layers.*.mlp.up_proj": "model.layers.*.mlp.up_proj.kernel",
+             "model.layers.*.post_attention_layernorm":
+             "model.layers.*.post_attention_layernorm.scale",
+             "model.layers.*.self_attn.k_norm":
+             "model.layers.*.self_attn.k_norm.scale",
+             "model.layers.*.self_attn.k_proj":
+             "model.layers.*.self_attn.k_proj.kernel",
+             "model.layers.*.self_attn.o_proj":
+             "model.layers.*.self_attn.o_proj.kernel",
+             "model.layers.*.self_attn.q_norm":
+             "model.layers.*.self_attn.q_norm.scale",
+             "model.layers.*.self_attn.q_proj":
+             "model.layers.*.self_attn.q_proj.kernel",
+             "model.layers.*.self_attn.v_proj":
+             "model.layers.*.self_attn.v_proj.kernel",
+             "model.norm": "model.norm.scale",
+         }
+
+         # Add the lm_head mapping only if it is not tied to the embeddings.
+         if not self.vllm_config.model_config.hf_config.tie_word_embeddings:
+             mappings.update({
+                 "lm_head": "model.lm_head",
+             })
+
+         metadata_map = get_default_maps(self.vllm_config, self.mesh, mappings)
+         load_hf_weights(vllm_config=self.vllm_config,
+                         model=self,
+                         metadata_map=metadata_map,
+                         mesh=self.mesh)
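The einsum layout above keeps the head dimension explicit: q_proj maps tokens of shape (T, D) to (T, N, H), and o_proj maps back, with the head count and head dim padded (via get_padded_num_heads/get_padded_head_dim) so they divide evenly across the "model" mesh axis. A minimal sketch of that shape contract with plain jnp.einsum, using toy dimensions of my own choosing rather than real Qwen3 config values:

import jax
import jax.numpy as jnp

# Toy dimensions (hypothetical, not from any real Qwen3 config):
# T tokens, D hidden size, N query heads, H (padded) head dim.
T, D, N, H = 4, 32, 8, 16

kx, kq, ko = jax.random.split(jax.random.PRNGKey(0), 3)
x = jax.random.normal(kx, (T, D))
w_q = jax.random.normal(kq, (D, N, H))  # same layout as q_proj's kernel
w_o = jax.random.normal(ko, (N, H, D))  # same layout as o_proj's kernel

q = jnp.einsum("TD,DNH->TNH", x, w_q)   # per-head query projection
o = jnp.einsum("TNH,NHD->TD", q, w_o)   # merge heads back into the hidden size

assert q.shape == (T, N, H) and o.shape == (T, D)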
tpu_inference/models/jax/utils/__init__.py: File without changes
tpu_inference/models/jax/utils/file_utils.py
@@ -0,0 +1,96 @@
+ import glob
+ import hashlib
+ import os
+ import shutil
+ import subprocess
+ from typing import List, Optional
+
+ import filelock
+ import huggingface_hub.constants
+ from huggingface_hub import HfFileSystem, snapshot_download
+ from tqdm.auto import tqdm
+
+ from tpu_inference.logger import init_logger
+
+ logger = init_logger(__name__)
+ # Do not set the HuggingFace token here; it should be set via the env var `HF_TOKEN`.
+ hfs = HfFileSystem()
+
+ LOCK_DIR = "/tmp/lock"
+
+ ##### Local file utils #####
+
+
+ def run_cmd(cmd: str, *args, **kwargs) -> subprocess.CompletedProcess:
+     return subprocess.run(cmd.split(), *args, **kwargs)
+
+
+ def delete_file(path: str) -> None:
+     if os.path.isfile(path):
+         os.remove(path)
+     else:
+         logger.error(f"Trying to delete non-existing file: {path}")
+
+
+ def list_files(dir: str, pattern: str = "*") -> List[str]:
+     files = glob.glob(os.path.join(dir, pattern))
+     return files
+
+
+ def get_lock(model_name_or_path: str):
+     lock_dir = LOCK_DIR
+     model_name_or_path = str(model_name_or_path)
+     # Create the lock directory itself, not just its parent.
+     os.makedirs(lock_dir, exist_ok=True)
+     model_name = model_name_or_path.replace("/", "-")
+     hash_name = hashlib.sha256(model_name.encode()).hexdigest()
+     # add hash to avoid conflict with old users' lock files
+     lock_file_name = hash_name + model_name + ".lock"
+     # mode 0o666 is required for the filelock to be shared across users
+     lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name),
+                              mode=0o666)
+     return lock
+
+
+ def get_free_disk_size(path: str = "/") -> int:
+     free_bytes = shutil.disk_usage(path).free
+     return free_bytes
+
+
+ ##### HuggingFace file utils #####
+
+
+ def is_hf_repo(repo_id: str) -> bool:
+     return hfs.exists(repo_id)
+
+
+ def list_hf_repo(repo_id: str, pattern: str = "**") -> List[str]:
+     repo_files = hfs.glob(os.path.join(repo_id, pattern))
+     return repo_files
+
+
+ def get_hf_model_weights_size(repo_id: str, weights_format: str) -> int:
+     weights_paths = list_hf_repo(repo_id, weights_format)
+     weights_size = 0
+     for weights_path in weights_paths:
+         weights_size += int(hfs.info(weights_path)["size"])
+     return weights_size
+
+
+ class DisabledTqdm(tqdm):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs, disable=True)
+
+
+ def download_model_weights_from_hf(model_path: str, cache_dir: Optional[str],
+                                    weights_format: str) -> List[str]:
+     with get_lock(model_path):
+         local_dir = snapshot_download(
+             model_path,
+             cache_dir=cache_dir,  # can be specified by HF_HOME or HF_HUB_CACHE
+             allow_patterns=weights_format,
+             tqdm_class=DisabledTqdm,
+             local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+         )
+     local_files = list_files(local_dir, weights_format)
+     return local_files
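Taken together, these helpers gate a snapshot_download behind a per-model file lock so concurrent workers do not race on the same cache entry. A usage sketch under assumed conditions (a hypothetical repo id, safetensors weights at the repo root, and either network access or a warm HF cache):

from tpu_inference.models.jax.utils.file_utils import (
    download_model_weights_from_hf, get_free_disk_size,
    get_hf_model_weights_size)

repo_id = "Qwen/Qwen3-0.6B"   # hypothetical example repo
pattern = "*.safetensors"     # weights_format glob passed to allow_patterns

# Optionally check that the weights fit on disk before downloading.
if get_hf_model_weights_size(repo_id, pattern) < get_free_disk_size("/"):
    local_files = download_model_weights_from_hf(repo_id, cache_dir=None,
                                                 weights_format=pattern)
    print(local_files)  # local paths of the downloaded weight shards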
tpu_inference/models/jax/utils/multi_modal_utils.py
@@ -0,0 +1,163 @@
+ from typing import Union
+
+ import jax
+ import jax.numpy as jnp
+ from typing_extensions import TypeAlias
+ from vllm.logger import init_logger
+
+ logger = init_logger(__name__)
+
+ NestedTensors: TypeAlias = Union[list["NestedTensors"], list["jax.Array"],
+                                  "jax.Array", tuple["jax.Array", ...]]
+ """
+ Uses a list instead of a tensor if the dimensions of each element do not match.
+ """
+
+ MultiModalEmbeddings = Union[list[jax.Array], jax.Array, tuple[jax.Array, ...]]
+ """
+ The output embeddings must be one of the following formats:
+
+ - A list or tuple of 2D tensors, where each tensor corresponds to
+   one input multimodal data item (e.g., an image).
+ - A single 3D tensor, with the batch dimension grouping the 2D tensors.
+ """
+
+
+ def sanity_check_mm_encoder_outputs(
+     mm_embeddings: MultiModalEmbeddings,
+     expected_num_items: int,
+ ) -> None:
+     """
+     Perform sanity checks for the result of
+     [`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`][].
+     """
+     assert isinstance(mm_embeddings, (list, tuple, jax.Array)), (
+         "Expected multimodal embeddings to be a list/tuple of 2D tensors, "
+         f"or a single 3D tensor, but got {type(mm_embeddings)} "
+         "instead. This is most likely due to an incorrect implementation "
+         "of the model's `get_multimodal_embeddings` method.")
+
+     assert len(mm_embeddings) == expected_num_items, (
+         "Expected the number of multimodal embeddings to match the number of "
+         f"input items: {expected_num_items}, but got {len(mm_embeddings)=} "
+         "instead. This is most likely due to an incorrect implementation "
+         "of the model's `get_multimodal_embeddings` method.")
+
+     assert all(e.ndim == 2 for e in mm_embeddings), (
+         "Expected multimodal embeddings to be a sequence of 2D tensors, "
+         f"but got tensors with shapes {[e.shape for e in mm_embeddings]} "
+         "instead. This is most likely due to an incorrect implementation "
+         "of the model's `get_multimodal_embeddings` method.")
+
+
+ def flatten_embeddings(embeddings: NestedTensors) -> jax.Array:
+     """
+     Recursively flattens and concatenates NestedTensors on all but the last
+     dimension.
+     """
+
+     if isinstance(embeddings, jax.Array):
+         return embeddings.reshape(-1, embeddings.shape[-1])
+
+     return jnp.concatenate([flatten_embeddings(t) for t in embeddings], axis=0)
+
+
+ def _embedding_count_expression(embeddings: NestedTensors) -> str:
+     """
+     Constructs a debugging representation of the number of embeddings in the
+     NestedTensors.
+     """
+
+     if isinstance(embeddings, jax.Array):
+         return " x ".join([str(dim) for dim in embeddings.shape[:-1]])
+
+     return " + ".join(
+         _embedding_count_expression(inner) for inner in embeddings)
+
+
+ def _merge_multimodal_embeddings(
+     inputs_embeds: jax.Array,
+     is_multimodal: jax.Array,
+     multimodal_embeddings: jax.Array,
+ ) -> jax.Array:
+     """
+     Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the
+     positions in ``inputs_embeds`` that correspond to placeholder tokens in
+     ``input_ids``.
+
+     Note:
+         This returns a new array with the updated values.
+     """
+     # The check for a matching number of tokens is omitted because it is not
+     # JIT-compatible. If the shapes mismatch, JAX will raise an error during
+     # execution anyway; the user-friendly error message is sacrificed for
+     # JIT compatibility.
+
+     # JIT-compatible implementation using jnp.where to avoid
+     # NonConcreteBooleanIndexError.
+     # Create a dummy row to handle indices for non-multimodal tokens.
+     # The content of the dummy row does not matter as it will be masked out.
+     dummy_row = jnp.zeros_like(multimodal_embeddings[0:1])
+
+     # Prepend the dummy row to the flattened embeddings.
+     flattened_padded = jnp.concatenate([dummy_row, multimodal_embeddings],
+                                        axis=0)
+
+     # Create gather indices. For each token in the input sequence, this gives
+     # the index into `flattened_padded`.
+     # For non-multimodal tokens, the index will be 0 (pointing to the dummy
+     # row). For the k-th multimodal token, the index will be k.
+     gather_indices = jnp.cumsum(is_multimodal)
+
+     # Gather the embeddings to be placed.
+     update_values = flattened_padded[gather_indices]
+
+     # Use jnp.where to select between original and new embeddings.
+     condition = jnp.expand_dims(is_multimodal, axis=-1)
+     return jnp.where(condition, update_values, inputs_embeds)
+
+
+ def merge_multimodal_embeddings(
+     input_ids: jax.Array,
+     inputs_embeds: jax.Array,
+     multimodal_embeddings: jax.Array,
+     placeholder_token_id: Union[int, list[int]],
+ ) -> jax.Array:
+     """
+     Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the
+     positions in ``inputs_embeds`` that correspond to placeholder tokens in
+     ``input_ids``.
+
+     ``placeholder_token_id`` can be a list of token ids (e.g., the token ids
+     of img_start, img_break, and img_end tokens) when needed: this means
+     the order of these tokens in the ``input_ids`` MUST MATCH the order of
+     their embeddings in ``multimodal_embeddings`` since we need to
+     slice-merge instead of individually scattering.
+
+     For example, if input_ids is "TTTTTSIIIBIIIBIIIETTT", where
+     - T is a text token
+     - S is an image start token
+     - I is an image embedding token
+     - B is an image break token
+     - E is an image end token,
+
+     then the image embeddings (those corresponding to I's) from the vision
+     encoder must be padded with embeddings of S, B, and E in the same order
+     as in input_ids for a correct embedding merge.
+
+     This returns a new array with the updated values.
+     """
+     if isinstance(placeholder_token_id, list):
+         placeholder_token_id = jnp.array(placeholder_token_id)
+         return _merge_multimodal_embeddings(
+             inputs_embeds,
+             jnp.isin(input_ids, placeholder_token_id),
+             multimodal_embeddings,
+         )
+
+     return _merge_multimodal_embeddings(
+         inputs_embeds,
+         (input_ids == placeholder_token_id),
+         multimodal_embeddings,
+     )
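The dummy-row trick above is worth seeing on concrete numbers: prepending a zero row lets cumsum send every non-multimodal position to index 0, and the final where masks those rows out again. A self-contained toy run of the same gather-and-mask logic, with toy ids and embeddings of my own choosing:

import jax.numpy as jnp

input_ids = jnp.array([1, 2, 9, 9, 9, 3])              # 9 is the placeholder id
inputs_embeds = jnp.zeros((6, 2))                       # text embeddings (toy)
mm_embeds = jnp.array([[1., 1.], [2., 2.], [3., 3.]])   # one row per placeholder

is_mm = input_ids == 9                                  # [F, F, T, T, T, F]
gather_idx = jnp.cumsum(is_mm)                          # [0, 0, 1, 2, 3, 3]
padded = jnp.concatenate([jnp.zeros((1, 2)), mm_embeds])  # dummy row at index 0
merged = jnp.where(is_mm[:, None], padded[gather_idx], inputs_embeds)

# Rows 2-4 now hold [1, 1], [2, 2], [3, 3]; every other row keeps inputs_embeds.
print(merged)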
tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml
@@ -0,0 +1,5 @@
+ qwix:
+   rules:
+     # NOTE: each entry corresponds to a qwix.QuantizationRule
+     - module_path: '.*'
+       weight_qtype: 'float8_e4m3fn'
tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml
@@ -0,0 +1,6 @@
+ qwix:
+   rules:
+     # NOTE: each entry corresponds to a qwix.QuantizationRule
+     - module_path: '.*'
+       weight_qtype: 'float8_e4m3fn'
+       act_qtype: 'float8_e4m3fn'
tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml
@@ -0,0 +1,5 @@
+ qwix:
+   rules:
+     # NOTE: each entry corresponds to a qwix.QuantizationRule
+     - module_path: '.*'
+       weight_qtype: 'int8'
tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml
@@ -0,0 +1,6 @@
+ qwix:
+   rules:
+     # NOTE: each entry corresponds to a qwix.QuantizationRule
+     - module_path: '.*'
+       weight_qtype: 'int8'
+       act_qtype: 'int8'
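The four configs differ only in the quantized dtype and in whether activations are quantized alongside weights (the *_w_only variants omit act_qtype). A minimal sketch of reading one of them with PyYAML; how the fields are interpreted is up to qwix, so the loop below only inspects them:

import yaml  # PyYAML

cfg_text = """
qwix:
  rules:
    # NOTE: each entry corresponds to a qwix.QuantizationRule
    - module_path: '.*'
      weight_qtype: 'int8'
      act_qtype: 'int8'
"""

cfg = yaml.safe_load(cfg_text)
for rule in cfg["qwix"]["rules"]:
    # '.*' matches every module path; act_qtype is absent in the w_only configs.
    print(rule["module_path"], rule["weight_qtype"], rule.get("act_qtype"))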