tpu-inference 0.0.1rc1__py3-none-any.whl → 0.11.1.dev202511130813__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of tpu-inference might be problematic.
- tests/kernels/fused_moe_v1_test.py +34 -303
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +2 -2
- tests/lora/test_layers.py +6 -0
- tests/lora/utils.py +8 -0
- tests/test_utils.py +16 -24
- tpu_inference/__init__.py +3 -22
- tpu_inference/core/core_tpu.py +9 -17
- tpu_inference/core/disagg_utils.py +8 -6
- tpu_inference/distributed/tpu_connector.py +4 -3
- tpu_inference/distributed/utils.py +2 -3
- tpu_inference/envs.py +8 -61
- tpu_inference/executors/ray_distributed_executor.py +11 -31
- tpu_inference/kernels/fused_moe/v1/kernel.py +110 -641
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +54 -77
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +143 -287
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +0 -7
- tpu_inference/layers/jax/attention/attention.py +1 -1
- tpu_inference/layers/{common → jax}/attention_interface.py +2 -8
- tpu_inference/layers/jax/sample/rejection_sampler.py +1 -1
- tpu_inference/layers/jax/sample/sampling.py +2 -2
- tpu_inference/layers/{common → jax}/sharding.py +5 -5
- tpu_inference/layers/vllm/attention.py +1 -1
- tpu_inference/layers/vllm/fused_moe.py +208 -170
- tpu_inference/layers/vllm/quantization/__init__.py +3 -7
- tpu_inference/layers/vllm/quantization/awq.py +3 -4
- tpu_inference/layers/vllm/quantization/common.py +1 -6
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +2 -4
- tpu_inference/layers/vllm/quantization/unquantized.py +67 -62
- tpu_inference/layers/vllm/sharding.py +2 -2
- tpu_inference/lora/torch_punica_tpu.py +2 -1
- tpu_inference/mock/__init__.py +0 -0
- tpu_inference/mock/vllm_config_utils.py +28 -0
- tpu_inference/mock/vllm_envs.py +1219 -0
- tpu_inference/mock/vllm_logger.py +212 -0
- tpu_inference/mock/vllm_logging_utils.py +15 -0
- tpu_inference/models/common/model_loader.py +12 -46
- tpu_inference/models/jax/llama3.py +3 -4
- tpu_inference/models/jax/llama_eagle3.py +5 -8
- tpu_inference/models/jax/phi3.py +376 -0
- tpu_inference/models/jax/qwen2.py +2 -3
- tpu_inference/models/jax/qwen2_5_vl.py +50 -165
- tpu_inference/models/jax/qwen3.py +2 -3
- tpu_inference/models/jax/utils/quantization/quantization_utils.py +6 -3
- tpu_inference/models/jax/utils/weight_utils.py +143 -198
- tpu_inference/models/vllm/vllm_model_wrapper.py +14 -32
- tpu_inference/platforms/tpu_platform.py +34 -47
- tpu_inference/runner/compilation_manager.py +60 -145
- tpu_inference/runner/kv_cache.py +2 -2
- tpu_inference/runner/kv_cache_manager.py +18 -17
- tpu_inference/runner/persistent_batch_manager.py +2 -40
- tpu_inference/runner/structured_decoding_manager.py +3 -2
- tpu_inference/runner/tpu_runner.py +135 -283
- tpu_inference/runner/utils.py +2 -2
- tpu_inference/spec_decode/jax/eagle3.py +21 -71
- tpu_inference/tpu_info.py +3 -4
- tpu_inference/utils.py +15 -38
- tpu_inference/worker/tpu_worker.py +26 -163
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/METADATA +3 -4
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/RECORD +63 -61
- tests/test_envs.py +0 -203
- tpu_inference/layers/common/quant_methods.py +0 -8
- tpu_inference/layers/vllm/quantization/mxfp4.py +0 -331
- tpu_inference/models/jax/llama_guard_4.py +0 -361
- /tpu_inference/layers/{common → jax}/binary_search.py +0 -0
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/WHEEL +0 -0
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.0.1rc1.dist-info → tpu_inference-0.11.1.dev202511130813.dist-info}/top_level.txt +0 -0
tests/kernels/fused_moe_v1_test.py CHANGED

@@ -1,7 +1,7 @@
 import jax
 import jax.numpy as jnp
 import numpy as np
-from absl.testing import absltest
+from absl.testing import absltest
 from jax._src import test_util as jtu
 from jax.sharding import Mesh
 
@@ -10,15 +10,6 @@ from tpu_inference.kernels.fused_moe.v1.kernel import fused_ep_moe, ref_moe
 jax.config.parse_flags_with_absl()
 
 
-def cdiv(a, b):
-    assert b != 0
-    return (a + b - 1) // b
-
-
-def align_to(x, a):
-    return cdiv(x, a) * a
-
-
 def gen_moe_inputs(
     dtype,
     top_k,
@@ -28,14 +19,11 @@ def gen_moe_inputs(
     num_tokens,
     *,
     seed=1234,
-    has_bias=False,
 ):
     key = jax.random.key(seed)
-    k0, k1, k2,
-
+    k0, k1, k2, k4, k5 = jax.random.split(key, 5)
     a = jax.random.normal(k0, (num_tokens, hidden_size),
                           dtype=jnp.float32).astype(dtype) / 10
-
     w1 = (jax.random.normal(
         k1,
         (num_experts, 2, hidden_size, intermediate_size),
@@ -43,54 +31,21 @@ def gen_moe_inputs(
     ) / 10).astype(dtype)
     w2 = (jax.random.normal(k2, (num_experts, intermediate_size, hidden_size),
                             dtype=jnp.float32) / 10).astype(dtype)
-
-    if has_bias:
-        b1 = (jax.random.normal(k3, (num_experts, 2, intermediate_size),
-                                dtype=jnp.float32) / 10).astype(dtype)
-        b2 = (jax.random.normal(k4, (num_experts, hidden_size),
-                                dtype=jnp.float32) / 10).astype(dtype)
-    else:
-        b1 = b2 = None
-
     gating_output = (
-        jax.random.normal(
+        jax.random.normal(k4, (num_tokens, num_experts), dtype=jnp.float32) +
         jnp.arange(num_tokens * num_experts, dtype=jnp.float32).reshape(
             num_tokens, num_experts) / 100)
-
     # To generate unique top-k!
-    top_k_indices = jax.random.randint(
+    top_k_indices = jax.random.randint(k5, (num_tokens, top_k),
                                        minval=0,
                                        maxval=num_experts - 1,
                                        dtype=jnp.int32)
-
     one_hot = (jnp.sum(
         jax.nn.one_hot(top_k_indices, num_experts, dtype=jnp.float32),
         axis=1,
-    ) *
-
+    ) * 10)
     gating_output = (gating_output + one_hot).astype(dtype)
-
-    return a, w1, w2, b1, b2, gating_output
-
-
-def sub_channel_quantize(x, quant_dtype, wsz=256):
-    """Quantizes x with sub-channel quantization on the 2nd minor."""
-    if jnp.issubdtype(quant_dtype, jnp.floating):
-        dtype_info = jnp.finfo(quant_dtype)
-    else:
-        dtype_info = jnp.iinfo(quant_dtype)
-    dtype_max = float(dtype_info.max)
-    w_lst, scale_lst = [], []
-    assert len(x.shape) >= 2
-    assert x.shape[-2] % wsz == 0
-    for i in range(0, x.shape[-2], wsz):
-        y = x[..., i:i + wsz, :]
-        abs_max = jnp.abs(y).max(axis=-2, keepdims=True)
-        scale = (abs_max / dtype_max).astype(jnp.float32)
-        w = (y / scale).astype(quant_dtype)
-        w_lst.append(w)
-        scale_lst.append(scale)
-    return jnp.concat(w_lst, axis=-2), jnp.concat(scale_lst, axis=-2)
+    return a, w1, w2, gating_output
 
 
 @jtu.with_config(jax_numpy_dtype_promotion="standard")
@@ -108,266 +63,42 @@ class MoEKernelTest(jtu.JaxTestCase):
         self.mesh = Mesh(np.array(self.mesh_devices).reshape(1, -1),
                          axis_names=("data", "model"))
 
-    def
-
-
-
-
-
-
-
-
-        renormalize_topk_logits,
-        bt,
-        bf,
-        bd1,
-        bd2,
-        btc,
-        bfc,
-        bd1c,
-        bd2c,
-        act_fn="silu",
-        w_dtype=None,
-        subc_quant_wsz=None,
-        has_bias=False,
-        atol=2e-1,
-        rtol=2e-1,
-    ):
-        a, w1, w2, b1, b2, gating_output = gen_moe_inputs(
+    def test_basic(self):
+        dtype = jnp.bfloat16
+        top_k = 2
+        num_experts = 16
+        hidden_size = 256
+        intermediate_size = 256
+        num_tokens = 8 * 2
+
+        a, w1, w2, gating_output = gen_moe_inputs(
            dtype,
            top_k,
            num_experts,
            hidden_size,
            intermediate_size,
            num_tokens,
-           seed=seed,
-           has_bias=has_bias,
        )
-        w1_scale = None
-        w2_scale = None
-        if w_dtype is not None:
-            if subc_quant_wsz is None:
-                subc_quant_wsz = 256
-            w1, w1_scale = sub_channel_quantize(w1, w_dtype, subc_quant_wsz)
-            w2, w2_scale = sub_channel_quantize(w2, w_dtype, subc_quant_wsz)
 
-        actual =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            bfc=bfc,
-            bd1c=bd1c,
-            bd2c=bd2c,
-        )
-        expected = ref_moe(
-            a,
-            w1,
-            w2,
-            gating_output,
-            top_k,
-            b1=b1,
-            b2=b2,
-            renormalize_topk_logits=renormalize_topk_logits,
-            activation=act_fn,
-            subc_quant_wsz=subc_quant_wsz,
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-        )
-        self.assertAllClose(actual, expected, atol=atol, rtol=rtol)
-
-    @parameterized.product(renormalize_topk_logits=[True, False], )
-    def test_basic(self, renormalize_topk_logits):
-        dtype = jnp.bfloat16
-        top_k = 8
-        num_experts = 128
-        hidden_size = 1024
-        intermediate_size = 1024
-        num_tokens = 8 * 32
-        self._test_moe(
-            dtype=dtype,
-            top_k=top_k,
-            num_experts=num_experts,
-            hidden_size=hidden_size,
-            intermediate_size=intermediate_size,
-            num_tokens=num_tokens,
-            seed=1234,
-            renormalize_topk_logits=renormalize_topk_logits,
-            bt=32,
-            bf=1024,
-            bd1=1024,
-            bd2=1024,
-            btc=32,
-            bfc=256,
-            bd1c=256,
-            bd2c=256,
-        )
-
-    @parameterized.product(act_fn=["silu", "gelu", "swigluoai"], )
-    def test_activation(self, act_fn):
-        dtype = jnp.bfloat16
-        top_k = 8
-        num_experts = 128
-        hidden_size = 1024
-        intermediate_size = 1024
-        num_tokens = 8 * 32
-        self._test_moe(
-            dtype=dtype,
-            top_k=top_k,
-            num_experts=num_experts,
-            hidden_size=hidden_size,
-            intermediate_size=intermediate_size,
-            num_tokens=num_tokens,
-            seed=1234,
-            renormalize_topk_logits=True,
-            act_fn=act_fn,
-            bt=32,
-            bf=512,
-            bd1=512,
-            bd2=512,
-            btc=32,
-            bfc=256,
-            bd1c=256,
-            bd2c=256,
-        )
-
-    def test_benchmark_qwen_235(self):
-        num_experts = 128
-        top_k = 8
-        hidden_size = 4096
-        intermediate_size = 1536
-        dtype = jnp.bfloat16
-        num_tokens = 8 * 64
-        seed = 54321
-        renormalize_topk_logits = True
-        self._test_moe(
-            dtype=dtype,
-            top_k=top_k,
-            num_experts=num_experts,
-            hidden_size=hidden_size,
-            intermediate_size=intermediate_size,
-            num_tokens=num_tokens,
-            seed=seed,
-            renormalize_topk_logits=renormalize_topk_logits,
-            bt=64,
-            bf=768,
-            bd1=2048,
-            bd2=2048,
-            btc=64,
-            bfc=768,
-            bd1c=2048,
-            bd2c=2048,
-            act_fn="silu",
-            atol=5e-2,
-            rtol=5e-2,
-        )
-
-    def test_benchmark_qwen_30b_a3b(self):
-        num_experts = 128
-        top_k = 8
-        hidden_size = 2048
-        intermediate_size = 768
-        dtype = jnp.bfloat16
-        num_tokens = 512
-        seed = 54321
-        renormalize_topk_logits = True
-        self._test_moe(
-            dtype=dtype,
-            top_k=top_k,
-            num_experts=num_experts,
-            hidden_size=hidden_size,
-            intermediate_size=intermediate_size,
-            num_tokens=num_tokens,
-            seed=seed,
-            renormalize_topk_logits=renormalize_topk_logits,
-            bt=16,
-            bf=384,
-            bd1=512,
-            bd2=512,
-            btc=16,
-            bfc=384,
-            bd1c=256,
-            bd2c=256,
-            act_fn="silu",
-            atol=5e-2,
-            rtol=5e-2,
-        )
-
-    @parameterized.product(
-        w_dtype=[jnp.int8, jnp.float8_e5m2, jnp.float4_e2m1fn], )
-    def test_sub_channel_quantization(self, w_dtype):
-        if w_dtype in (
-                jnp.float8_e5m2,
-                jnp.float4_e2m1fn,
-        ) and not jtu.is_device_tpu_at_least(version=7):
-            self.skipTest("Expect TPUv7+")
-        dtype = jnp.bfloat16
-        top_k = 8
-        num_experts = 128
-        hidden_size = 1024
-        intermediate_size = 1024
-        num_tokens = 8 * 32
-        self._test_moe(
-            dtype=dtype,
-            top_k=top_k,
-            num_experts=num_experts,
-            hidden_size=hidden_size,
-            intermediate_size=intermediate_size,
-            num_tokens=num_tokens,
-            seed=1234,
-            renormalize_topk_logits=False,
-            w_dtype=w_dtype,
-            subc_quant_wsz=256,
-            bt=32,
-            bf=1024,
-            bd1=1024,
-            bd2=1024,
-            btc=32,
-            bfc=256,
-            bd1c=256,
-            bd2c=256,
-        )
-
-    def test_bias(self):
-        dtype = jnp.bfloat16
-        top_k = 8
-        num_experts = 128
-        hidden_size = 1024
-        intermediate_size = 1024
-        num_tokens = 8 * 32
-        self._test_moe(
-            dtype=dtype,
-            top_k=top_k,
-            num_experts=num_experts,
-            hidden_size=hidden_size,
-            intermediate_size=intermediate_size,
-            num_tokens=num_tokens,
-            seed=1234,
-            renormalize_topk_logits=False,
-            has_bias=True,
-            bt=32,
-            bf=512,
-            bd1=512,
-            bd2=512,
-            btc=32,
-            bfc=256,
-            bd1c=256,
-            bd2c=256,
-        )
+        actual = jax.block_until_ready(
+            fused_ep_moe(
+                mesh=self.mesh,
+                tokens=a,
+                w1=w1,
+                w2=w2,
+                gating_output=gating_output,
+                top_k=top_k,
+                bt=32,
+                bf=512,
+                bd1=512,
+                bd2=512,
+                btc=32,
+                bfc=256,
+                bd1c=256,
+                bd2c=256,
+            ))
+        expected = ref_moe(a, w1, w2, gating_output, top_k)
+        self.assertAllClose(expected, actual, atol=2e-2, rtol=2e-2)
 
 
 if __name__ == "__main__":
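Note on the deleted sub_channel_quantize helper above: it stores one float32 scale per wsz-row window along the second-minor axis, so the original weights are approximately recovered by re-multiplying each window by its scale row. A minimal dequantization sketch, assuming only jax.numpy and the window layout shown in the removed code (the function name is mine, not part of the package):

import jax.numpy as jnp

def sub_channel_dequantize(w, scale, wsz=256):
    # Window idx covers rows [idx*wsz, (idx+1)*wsz) of the 2nd-minor axis
    # and shares the single scale row idx produced by the removed quantizer.
    out = []
    for idx, i in enumerate(range(0, w.shape[-2], wsz)):
        y = w[..., i:i + wsz, :].astype(jnp.float32)
        out.append(y * scale[..., idx:idx + 1, :])
    return jnp.concatenate(out, axis=-2)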
tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py CHANGED

@@ -99,7 +99,7 @@ class RaggedPagedAttentionHeadDim64KernelTest(jtu.JaxTestCase):
             (0, 0),
             (0, 0),
         ),
-        constant_values=
+        constant_values=jnp.nan,
     ).reshape(
         -1,
         page_size,
@@ -122,7 +122,7 @@ class RaggedPagedAttentionHeadDim64KernelTest(jtu.JaxTestCase):
         kv_cache,
         ((0, num_pages - kv_cache.shape[0]), (0, 0), (0, 0), (0, 0),
          (0, 0)),
-        constant_values=
+        constant_values=jnp.nan,
     )
     page_indices = jnp.stack(page_indices_list, axis=0)
    page_indices = jnp.pad(
tests/lora/test_layers.py CHANGED

@@ -91,6 +91,7 @@ def populate_loras(
     index_to_id: list[Optional[int]],
     lora_layer: BaseLayerWithLoRA,
     baselayer_weights: torch.Tensor,
+    generate_embeddings_tensor: int = 0,
     repeats: int = 1,
 ) -> tuple[dict[int, LoRALayerWeights], dict[int, list[LoRALayerWeights]]]:
     """This method populates the lora weights (lora_a and lora_b) in the lora layers (BaseLayerWithLoRA).
@@ -102,6 +103,8 @@ def populate_loras(
         lora_layer: the LoRAlayer to populate.
         baselayer_weights: the PyTorch tensor containing the layer's
             weights.
+        generate_embeddings_tensor: whether to generate an
+            embeddings tensor for each LoRA.
         repeats: must only be set for column parallel packed
             layers. Indicates the number of loras to compose
             together to create a single lora layer.
@@ -128,6 +131,7 @@ def populate_loras(
                 baselayer_weights.device).init_random_lora(
                     module_name=f"fake_{i}",
                     weight=baselayer_weights,
+                    generate_embeddings_tensor=generate_embeddings_tensor,
                 )
             sublora.lora_b = sublora.lora_b[(sublora_len *
                                              i):(sublora_len * (i + 1)), :]
@@ -143,6 +147,7 @@ def populate_loras(
             slot_idx,
             lora_a=lora.lora_a,
             lora_b=lora.lora_b,
+            embeddings_tensor=lora.embeddings_tensor,
         )
 
         lora_dict[lora_id] = lora
@@ -541,6 +546,7 @@ def _update_punica_wrapper_metadata(punica_wrapper, index_mapping,
         index_to_id,
         lora_config.max_loras,
         vocab_size=512,
+        extra_vocab_size=lora_config.lora_extra_vocab_size,
     )
     assert jax_view(punica_wrapper._lora_indices_per_batch).platform(
     ) == 'tpu', 'punica_wrapper._lora_indices_per_batch should have been moved to TPU.'
tests/lora/utils.py CHANGED

@@ -24,6 +24,7 @@ class DummyLoRAManager:
         module_name: str,
         weight: torch.Tensor,
         rank: int = 8,
+        generate_embeddings_tensor: int = 0,
     ):
         lora = LoRALayerWeights(
             module_name,
@@ -36,6 +37,13 @@ class DummyLoRAManager:
                 dtype=weight.dtype,
                 device=self._device),
         )
+        if generate_embeddings_tensor:
+            lora.embeddings_tensor = torch.rand(
+                5,
+                generate_embeddings_tensor,
+                dtype=weight.dtype,
+                device=self._device,
+            )
         self.set_module_lora(module_name, lora)
 
         return lora
tests/test_utils.py CHANGED

@@ -75,34 +75,25 @@ def test_hbm_usage_bytes_pathways_enabled(mock_devices, mock_live_arrays):
     mock_device2 = MagicMock()
     devices = [mock_device1, mock_device2]
 
-    # Create mock
-
-
-
+    # Create mock device buffers
+    mock_buffer1_dev1 = MagicMock()
+    mock_buffer1_dev1.device = mock_device1
+    mock_buffer1_dev1.nbytes = 2000  # 2000 bytes on device1
 
-
-
-
+    mock_buffer1_dev2 = MagicMock()
+    mock_buffer1_dev2.device = mock_device2
+    mock_buffer1_dev2.nbytes = 2000  # 2000 bytes on device2
 
-
-
-
+    mock_buffer2_dev1 = MagicMock()
+    mock_buffer2_dev1.device = mock_device1
+    mock_buffer2_dev1.nbytes = 1000  # 1000 bytes on device1
 
-
-    mock_shard1_dev1.data = mock_data1_dev1
-
-    mock_shard1_dev2 = MagicMock()
-    mock_shard1_dev2.data = mock_data1_dev2
-
-    mock_shard2_dev1 = MagicMock()
-    mock_shard2_dev1.data = mock_data2_dev1
-
-    # Create mock arrays with addressable_shards
+    # Create mock arrays with device buffers
     mock_array1 = MagicMock()
-    mock_array1.
+    mock_array1.device_buffers = [mock_buffer1_dev1, mock_buffer1_dev2]
 
     mock_array2 = MagicMock()
-    mock_array2.
+    mock_array2.device_buffers = [mock_buffer2_dev1]
 
     mock_live_arrays.return_value = [mock_array1, mock_array2]
 
@@ -168,7 +159,7 @@ def test_hbm_usage_bytes_pathways_no_arrays(mock_devices, mock_live_arrays):
     "head_dim, expected_padded_head_dim",
     [
         (1, 128),
-        (64,
+        (64, 128),
         (127, 128),
         (128, 128),
         (129, 256),
@@ -231,5 +222,6 @@ def test_get_jax_dtype_from_str_dtype():
     assert get_jax_dtype_from_str_dtype("int8") == jnp.int8
     assert get_jax_dtype_from_str_dtype("bfloat16") == jnp.bfloat16
     assert get_jax_dtype_from_str_dtype("fp8") == jnp.float8_e4m3fn
-    assert get_jax_dtype_from_str_dtype("fp8_e4m3") == jnp.
+    assert get_jax_dtype_from_str_dtype("fp8_e4m3") == jnp.float8_e4m3
     assert get_jax_dtype_from_str_dtype("fp8_e5m2") == jnp.float8_e5m2
+    assert get_jax_dtype_from_str_dtype("auto") is None
tpu_inference/__init__.py CHANGED

@@ -1,40 +1,21 @@
+import os
+
 # The environment variables override should be imported before any other
 # modules to ensure that the environment variables are set before any
 # other modules are imported.
 import tpu_inference.env_override  # noqa: F401
-from tpu_inference import envs
 from tpu_inference import tpu_info as ti
 from tpu_inference.logger import init_logger
 
 logger = init_logger(__name__)
 
-if "proxy" in
+if "proxy" in os.environ.get('JAX_PLATFORMS', '').lower():
     logger.info("Running vLLM on TPU via Pathways proxy.")
     # Must run pathwaysutils.initialize() before any JAX operations
     try:
-        import traceback
-
         import pathwaysutils
-        import vllm
-        from vllm.platforms import (resolve_current_platform_cls_qualname,
-                                    resolve_obj_by_qualname)
         pathwaysutils.initialize()
         logger.info("Module pathwaysutils is imported.")
-
-        # Pathways requires eager resolution of vllm.current_platform instead of
-        # lazy resolution in the normal code path. Since this part involves
-        # global topology discovery across multiple hosts, the platform
-        # resolution must happen before other components are loaded.
-        logger.info("Eagerly resolving vLLM current_platform for Pathways.")
-        platform_cls_qualname = resolve_current_platform_cls_qualname()
-        resolved_platform_instance = resolve_obj_by_qualname(
-            platform_cls_qualname)()
-        vllm.platforms._current_platform = resolved_platform_instance
-        vllm.platforms._init_trace = "".join(traceback.format_stack())
-        logger.info(
-            f"vLLM platform resolved to: {resolved_platform_instance.__class__.__name__}"
-        )
-
     except Exception as e:
         logger.error(
             f"Error occurred while importing pathwaysutils or logging TPU info: {e}"
tpu_inference/core/core_tpu.py CHANGED

@@ -29,7 +29,6 @@ from vllm.v1.request import Request, RequestStatus
 
 from tpu_inference import utils as common_utils
 from tpu_inference.core import disagg_executor, disagg_utils
-from tpu_inference.runner.tpu_runner import AsyncTPUModelRunnerOutput
 # ======================================================================================
 # Imports for _DisaggOrchestrator (decoupled from vLLM)
 # ======================================================================================
@@ -187,8 +186,6 @@ class _DisaggOrchestrator:
                 if model_output is None:
                     model_output = prefill_engine.model_executor.sample_tokens(
                         grammar_output)
-                if isinstance(model_output, AsyncTPUModelRunnerOutput):
-                    model_output = model_output.get_output()
 
                 if scheduler_output.total_num_scheduled_tokens > 0:
                     logger.debug(f"Prefill result: {model_output}")
@@ -221,16 +218,15 @@ class _DisaggOrchestrator:
                         f"request-{req_id}: tokens={request.all_token_ids} after prefill"
                     )
                     # Remove request from the prefill engine.
-                    if req_id in prefill_engine.scheduler.requests:
-                        request = prefill_engine.scheduler.requests[req_id]
-                        prefill_engine.scheduler.running.remove(request)
-                        prefill_engine.scheduler.encoder_cache_manager.free(
-                            request)
 
-
-
+                    request = prefill_engine.scheduler.requests[req_id]
+                    prefill_engine.scheduler.running.remove(request)
+                    prefill_engine.scheduler.encoder_cache_manager.free(
+                        request)
 
-
+                    prefill_engine.scheduler.kv_cache_manager.free(request)
+
+                    prefill_engine.scheduler.requests.pop(req_id)
 
                 for output in (engine_core_outputs.items()
                                if engine_core_outputs else ()):
@@ -339,10 +335,8 @@ class _DisaggOrchestrator:
                 new_block_ids = kv_cache_manager.get_block_ids(req_id)
                 logger.debug(
                     f"inserting {req_id} new_block_ids {new_block_ids}")
-
-
-                logger.warning("Running out of blocks in decode engine! ")
-                break
+                assert (len(new_block_ids[0]) == math.ceil(
+                    prompt_tokens / self._config.cache_config.block_size))
 
                 decode_engine.model_executor.driver_worker.model_runner.insert_request_with_kv_cache(
                     vllm_request, kv_cache, new_block_ids)
@@ -372,8 +366,6 @@ class _DisaggOrchestrator:
                 if model_output is None:
                     model_output = decode_engine.model_executor.sample_tokens(
                         grammar_output)
-                if isinstance(model_output, AsyncTPUModelRunnerOutput):
-                    model_output = model_output.get_output()
 
                 if scheduler_output.total_num_scheduled_tokens > 0:
                     logger.debug(f"Decode result: {model_output}")
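The replaced warning-and-break becomes a hard invariant: after prefill, the request's first KV-cache block list must hold exactly ceil(prompt_tokens / block_size) blocks. Worked example of the arithmetic:

import math

# A 130-token prompt with block_size=16 needs ceil(130 / 16) = 9 KV blocks.
prompt_tokens, block_size = 130, 16
assert math.ceil(prompt_tokens / block_size) == 9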
tpu_inference/core/disagg_utils.py CHANGED

@@ -1,15 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 
+import os
 from typing import Tuple
 
-
+PREFILL_SLICES = 'PREFILL_SLICES'
+DECODE_SLICES = 'DECODE_SLICES'
 
 
 def is_disagg_enabled() -> bool:
     # We triggrer our code path as long as prefill slices are set. This
     # allows us to test interleave mode effectively with the code path
     # for comparison purposes.
-    return
+    return PREFILL_SLICES in os.environ
 
 
 def _parse_slices(slices_str: str) -> Tuple[int, ...]:
@@ -38,12 +40,12 @@ def _parse_slices(slices_str: str) -> Tuple[int, ...]:
 
 
 def get_prefill_slices() -> Tuple[int, ...]:
-    if not
+    if PREFILL_SLICES not in os.environ:
         return ()
-    return _parse_slices(
+    return _parse_slices(os.environ[PREFILL_SLICES])
 
 
 def get_decode_slices() -> Tuple[int, ...]:
-    if not
+    if DECODE_SLICES not in os.environ:
         return ()
-    return _parse_slices(
+    return _parse_slices(os.environ[DECODE_SLICES])
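Usage sketch for the restored environment-variable plumbing; the slice-string format accepted by _parse_slices is elided above, so the value below is illustrative only:

import os

os.environ["PREFILL_SLICES"] = "2x2"   # format assumed; see _parse_slices
from tpu_inference.core import disagg_utils

assert disagg_utils.is_disagg_enabled()
assert disagg_utils.get_decode_slices() == ()   # DECODE_SLICES unset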