tpu-inference 0.12.0.dev20251219__tar.gz → 0.12.0rc1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tpu_inference-0.12.0.dev20251219/tpu_inference.egg-info → tpu_inference-0.12.0rc1}/PKG-INFO +8 -6
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/README.md +6 -4
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/requirements.txt +1 -1
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/kernels/mla_v1_test.py +41 -129
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/kernels/quantized_matmul_kernel_test.py +34 -2
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +1 -3
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/kernels/ragged_paged_attention_kernel_v3_test.py +1 -3
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/lora/test_layers.py +3 -7
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/lora/test_lora.py +1 -1
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/test_envs.py +1 -78
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/test_quantization.py +0 -3
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/distributed/tpu_connector.py +3 -3
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/envs.py +7 -38
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/executors/ray_distributed_executor.py +0 -3
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/collectives/all_gather_matmul.py +6 -12
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +2 -7
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/fused_moe/v1/kernel.py +324 -357
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/mla/v1/kernel.py +120 -98
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/quantized_matmul/kernel.py +8 -69
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +1 -2
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +1 -2
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +101 -181
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +78 -82
- tpu_inference-0.12.0rc1/tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +4147 -0
- tpu_inference-0.12.0rc1/tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +367 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v3/util.py +1 -2
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/common/attention_interface.py +7 -1
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/common/quant_methods.py +0 -1
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/common/sharding.py +2 -6
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/attention/deepseek_v3_attention.py +64 -232
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/attention/gpt_oss_attention.py +5 -5
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/fused_moe.py +247 -180
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/linear_common.py +21 -43
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/__init__.py +0 -2
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/awq.py +1 -1
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/common.py +5 -5
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +3 -4
- tpu_inference-0.12.0rc1/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +203 -0
- tpu_inference-0.12.0rc1/tpu_inference/layers/vllm/quantization/mxfp4.py +341 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/unquantized.py +81 -105
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/lora/torch_lora_ops.py +13 -8
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/common/model_loader.py +20 -48
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/deepseek_v3.py +64 -185
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/gpt_oss.py +3 -3
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/llama3.py +33 -79
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/quantization_utils.py +2 -4
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/weight_utils.py +2 -26
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/vllm/vllm_model_wrapper.py +1 -1
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/platforms/tpu_platform.py +37 -15
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/runner/compilation_manager.py +2 -3
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/runner/kv_cache.py +20 -40
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/runner/kv_cache_manager.py +15 -31
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/runner/tpu_runner.py +7 -14
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/utils.py +6 -11
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/worker/tpu_worker.py +44 -44
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1/tpu_inference.egg-info}/PKG-INFO +8 -6
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference.egg-info/SOURCES.txt +0 -7
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference.egg-info/requires.txt +1 -1
- tpu_inference-0.12.0.dev20251219/tests/kernels/gmm_test.py +0 -191
- tpu_inference-0.12.0.dev20251219/tests/lora/test_lora_perf.py +0 -53
- tpu_inference-0.12.0.dev20251219/tpu_inference/kernels/megablox/common.py +0 -41
- tpu_inference-0.12.0.dev20251219/tpu_inference/kernels/megablox/gmm.py +0 -633
- tpu_inference-0.12.0.dev20251219/tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +0 -4447
- tpu_inference-0.12.0.dev20251219/tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +0 -535
- tpu_inference-0.12.0.dev20251219/tpu_inference/layers/jax/pp_utils.py +0 -39
- tpu_inference-0.12.0.dev20251219/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +0 -252
- tpu_inference-0.12.0.dev20251219/tpu_inference/layers/vllm/quantization/fp8.py +0 -104
- tpu_inference-0.12.0.dev20251219/tpu_inference/layers/vllm/quantization/mxfp4.py +0 -448
- tpu_inference-0.12.0.dev20251219/tpu_inference/worker/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/LICENSE +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/MANIFEST.in +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/pyproject.toml +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/setup.cfg +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/setup.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/core/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/core/test_core_tpu.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/core/test_disagg_executor.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/core/test_disagg_utils.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/core/test_dp_scheduler.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/core/test_init.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/kernels/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/kernels/fused_moe_v1_test.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/kernels/ragged_kv_cache_update_v2_test.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/kernels/ragged_paged_attention_kernel_v2_test.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/lora/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/lora/conftest.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/lora/test_bgmv.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/lora/utils.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/test_base.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/test_tpu_info.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/test_utils.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/core/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/core/core_tpu.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/core/disagg_executor.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/core/disagg_utils.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/core/sched/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/core/sched/dp_scheduler.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/distributed/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/distributed/jax_parallel_state.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/distributed/utils.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/env_override.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/executors/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/experimental/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/experimental/llama3_jax_stashed.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/collectives/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/collectives/util.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/flash_attention/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/flash_attention/kernel.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/fused_moe/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/fused_moe/v1/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/kernels/megablox → tpu_inference-0.12.0rc1/tpu_inference/kernels/mla}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/kernels/mla → tpu_inference-0.12.0rc1/tpu_inference/kernels/mla/v1}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/kernels/mla/v1 → tpu_inference-0.12.0rc1/tpu_inference/kernels/quantized_matmul}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/quantized_matmul/util.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/kernels/quantized_matmul → tpu_inference-0.12.0rc1/tpu_inference/kernels/ragged_paged_attention}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/kernels/ragged_paged_attention → tpu_inference-0.12.0rc1/tpu_inference/kernels/ragged_paged_attention/v2}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/kernels/ragged_paged_attention/v2 → tpu_inference-0.12.0rc1/tpu_inference/kernels/ragged_paged_attention/v3}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/kernels/ragged_paged_attention/v3 → tpu_inference-0.12.0rc1/tpu_inference/layers}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/layers → tpu_inference-0.12.0rc1/tpu_inference/layers/common}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/common/attention_metadata.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/common/binary_search.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/layers/common → tpu_inference-0.12.0rc1/tpu_inference/layers/jax}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/layers/jax → tpu_inference-0.12.0rc1/tpu_inference/layers/jax/attention}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/attention/attention.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/attention/llama4_attention.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/base.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/constants.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/layers.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/misc.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/layers/jax/attention → tpu_inference-0.12.0rc1/tpu_inference/layers/jax/moe}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/moe/deepseek_v3_moe.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/moe/gpt_oss_moe.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/moe/moe.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/rope.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/rope_interface.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/layers/jax/moe → tpu_inference-0.12.0rc1/tpu_inference/layers/jax/sample}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/sample/rejection_sampler.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/sample/sampling.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/sample/sampling_metadata.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/transformer_block.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/layers/jax/sample → tpu_inference-0.12.0rc1/tpu_inference/layers/vllm}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/attention.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/layers/vllm → tpu_inference-0.12.0rc1/tpu_inference/layers/vllm/quantization/compressed_tensors}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/layers/vllm/quantization/compressed_tensors → tpu_inference-0.12.0rc1/tpu_inference/layers/vllm/quantization/compressed_tensors/schemes}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/sharding.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/logger.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/layers/vllm/quantization/compressed_tensors/schemes → tpu_inference-0.12.0rc1/tpu_inference/lora}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/lora/torch_punica_tpu.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/lora → tpu_inference-0.12.0rc1/tpu_inference/models}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/models → tpu_inference-0.12.0rc1/tpu_inference/models/common}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/models/common → tpu_inference-0.12.0rc1/tpu_inference/models/jax}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/jax_intermediate_tensor.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/llama4.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/llama_eagle3.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/llama_guard_4.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/qwen2.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/qwen2_5_vl.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/qwen3.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/models/jax → tpu_inference-0.12.0rc1/tpu_inference/models/jax/utils}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/file_utils.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/multi_modal_utils.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/models/jax/utils → tpu_inference-0.12.0rc1/tpu_inference/models/jax/utils/quantization}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/models/jax/utils/quantization → tpu_inference-0.12.0rc1/tpu_inference/models/vllm}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/models/vllm/vllm_model_wrapper_context.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/platforms/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/models/vllm → tpu_inference-0.12.0rc1/tpu_inference/runner}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/runner/block_table.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/runner/input_batch.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/runner/lora_utils.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/runner/multimodal_manager.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/runner/persistent_batch_manager.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/runner/speculative_decoding_manager.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/runner/structured_decoding_manager.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/runner/utils.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/runner → tpu_inference-0.12.0rc1/tpu_inference/spec_decode}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/spec_decode → tpu_inference-0.12.0rc1/tpu_inference/spec_decode/jax}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/spec_decode/jax/eagle3.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/tpu_info.py +0 -0
- {tpu_inference-0.12.0.dev20251219/tpu_inference/spec_decode/jax → tpu_inference-0.12.0rc1/tpu_inference/worker}/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference.egg-info/dependency_links.txt +0 -0
- {tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference.egg-info/top_level.txt +0 -0
{tpu_inference-0.12.0.dev20251219/tpu_inference.egg-info → tpu_inference-0.12.0rc1}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tpu_inference
-Version: 0.12.0.dev20251219
+Version: 0.12.0rc1
 Author: tpu_inference Contributors
 Classifier: Development Status :: 3 - Alpha
 Classifier: Intended Audience :: Developers
@@ -25,7 +25,7 @@ Requires-Dist: jax[tpu]==0.8.0
 Requires-Dist: jaxlib==0.8.0
 Requires-Dist: jaxtyping
 Requires-Dist: flax==0.11.1
-Requires-Dist: torchax==0.0.
+Requires-Dist: torchax==0.0.7
 Requires-Dist: qwix==0.1.1
 Requires-Dist: torchvision==0.24.0
 Requires-Dist: pathwaysutils
@@ -53,11 +53,13 @@ Dynamic: requires-python
 
 ---
 
-
+_Upcoming Events_ 🔥
+
+- Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) in San Francisco!
+- Join us at [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco!
+- Join us at [JAX DevLab on November 18th](https://rsvp.withgoogle.com/events/devlab-fall-2025) in Sunnyvale!
 
-
-- Check back soon for a recording of our session at [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco!
-- Check back soon for a recording of our session at [JAX DevLab on November 18th](https://rsvp.withgoogle.com/events/devlab-fall-2025) in Sunnyvale!
+_Latest News_ 🔥
 
 - [2025/10] [vLLM TPU: A New Unified Backend Supporting PyTorch and JAX on TPU](https://blog.vllm.ai/2025/10/16/vllm-tpu.html)
 
{tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/README.md
RENAMED
@@ -11,11 +11,13 @@
 
 ---
 
-
+_Upcoming Events_ 🔥
+
+- Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) in San Francisco!
+- Join us at [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco!
+- Join us at [JAX DevLab on November 18th](https://rsvp.withgoogle.com/events/devlab-fall-2025) in Sunnyvale!
 
-
-- Check back soon for a recording of our session at [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco!
-- Check back soon for a recording of our session at [JAX DevLab on November 18th](https://rsvp.withgoogle.com/events/devlab-fall-2025) in Sunnyvale!
+_Latest News_ 🔥
 
 - [2025/10] [vLLM TPU: A New Unified Backend Supporting PyTorch and JAX on TPU](https://blog.vllm.ai/2025/10/16/vllm-tpu.html)
 
{tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/kernels/mla_v1_test.py
RENAMED
@@ -42,7 +42,6 @@ class MlaRaggedPagedAttentionKernelTest(jtu.JaxTestCase):
 
 padded_r_dim = align_to(r_dim, 128)
 padded_lkv_dim = align_to(lkv_dim, 128)
-padded_kv_dim = padded_lkv_dim + padded_r_dim
 packing = get_dtype_packing(kv_dtype)
 q_lens = [s[0] for s in seq_lens]
 kv_lens_list = [s[1] for s in seq_lens]
@@ -70,10 +69,13 @@
 new_kv_c = gen_random((total_q_len, lkv_dim), kv_dtype)
 new_k_pe = gen_random((total_q_len, r_dim), kv_dtype)
 
-
-(total_num_pages, page_size // packing, packing,
+cache_kv_c = gen_random(
+(total_num_pages, page_size // packing, packing, padded_lkv_dim),
 kv_dtype,
 )
+cache_k_pe = gen_random(
+(total_num_pages, page_size // packing, packing, padded_r_dim),
+kv_dtype)
 kv_lens = jnp.array(kv_lens_list, dtype=jnp.int32)
 page_indices = jnp.array(page_indices_list, dtype=jnp.int32)
 cu_q_lens = jnp.array(cu_q_lens_list, dtype=jnp.int32)
@@ -82,13 +84,14 @@
 ql_nope_for_kernel = ql_nope.copy()
 q_pe_for_kernel = q_pe.copy()
 
-expected_out,
+expected_out, expected_updated_kv_c, expeceted_updated_k_pe = (
 mla.ref_mla_ragged_paged_attention(
 ql_nope,
 q_pe,
 new_kv_c,
 new_k_pe,
-
+cache_kv_c.copy(),
+cache_k_pe.copy(),
 kv_lens,
 page_indices,
 cu_q_lens,
@@ -98,140 +101,49 @@
 soft_cap=soft_cap,
 ))
 
-kernel_out,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+kernel_out, kernel_updated_kv_c, kernel_updated_k_pe = (
+mla.mla_ragged_paged_attention(
+ql_nope_for_kernel,
+q_pe_for_kernel,
+new_kv_c,
+new_k_pe,
+cache_kv_c.copy(),
+cache_k_pe.copy(),
+kv_lens,
+page_indices,
+cu_q_lens,
+distribution,
+sm_scale=sm_scale,
+sliding_window=sliding_window,
+soft_cap=soft_cap,
+num_kv_pages_per_block=num_kv_pages_per_block,
+num_queries_per_block=num_queries_per_block,
+vmem_limit_bytes=vmem_limit_bytes,
+))
 
 self.assertEqual(expected_out.shape,
 (total_q_len, num_heads, padded_lkv_dim))
 self.assertEqual(
-
-(total_num_pages, page_size // packing, packing,
+expected_updated_kv_c.shape,
+(total_num_pages, page_size // packing, packing, padded_lkv_dim),
+)
+self.assertEqual(
+expeceted_updated_k_pe.shape,
+(total_num_pages, page_size // packing, packing, padded_r_dim),
 )
 self.assertEqual(expected_out.dtype, kv_dtype)
-self.assertEqual(
+self.assertEqual(expected_updated_kv_c.dtype, kv_dtype)
+self.assertEqual(expeceted_updated_k_pe.dtype, kv_dtype)
 
 self.assertAllClose(expected_out, kernel_out, atol=0.2, rtol=0.2)
-self.assertAllClose(
-
+self.assertAllClose(expected_updated_kv_c,
+kernel_updated_kv_c,
+atol=0.2,
+rtol=0.2)
+self.assertAllClose(expeceted_updated_k_pe,
+kernel_updated_k_pe,
 atol=0.2,
 rtol=0.2)
-
-def test_update_kv_cache(self):
-lkv_dim = 4
-r_dim = 4
-padded_lkv_dim = align_to(lkv_dim, 128)
-padded_r_dim = align_to(r_dim, 128)
-kv_dtype = jnp.bfloat16
-new_kv_c = jnp.arange(16, dtype=kv_dtype).reshape((4, lkv_dim))
-new_k_pe = (jnp.arange(16, dtype=kv_dtype).reshape((4, r_dim)) + 100)
-total_num_pages = 2
-page_size = 4
-cache_kv_shape = mla.get_kv_cache_shape(
-total_num_pages,
-page_size,
-padded_lkv_dim + padded_r_dim,
-kv_dtype,
-)
-cache_kv = jnp.zeros(cache_kv_shape, dtype=kv_dtype)
-
-# two sequences, first with 3 tokens, second with 1 token
-kv_lens = jnp.array([3, 1], dtype=jnp.int32)
-# first seq uses page 0, second uses page 1
-page_indices = jnp.array([0, -1, 1, -1], dtype=jnp.int32)
-# three tokens for first seq, one for second
-cu_q_lens = jnp.array([0, 3, 4], dtype=jnp.int32)
-distribution = jnp.array([0, 0, 2], dtype=jnp.int32)
-
-# manually compute the expected cache
-padded_new_kv_c = jnp.pad(new_kv_c,
-((0, 0), (0, padded_lkv_dim - lkv_dim)),
-constant_values=0)
-padded_new_k_pe = jnp.pad(new_k_pe,
-((0, 0), (0, padded_r_dim - r_dim)),
-constant_values=0)
-
-expected_cache = cache_kv
-# First sequence
-# token 0
-page_idx, row, col = 0, 0, 0
-expected_cache = expected_cache.at[page_idx, row,
-col, :padded_lkv_dim].set(
-padded_new_kv_c[0])
-expected_cache = expected_cache.at[page_idx, row, col,
-padded_lkv_dim:padded_lkv_dim +
-padded_r_dim].set(
-padded_new_k_pe[0])
-# token 1
-page_idx, row, col = 0, 0, 1
-expected_cache = expected_cache.at[page_idx, row,
-col, :padded_lkv_dim].set(
-padded_new_kv_c[1])
-expected_cache = expected_cache.at[page_idx, row, col,
-padded_lkv_dim:padded_lkv_dim +
-padded_r_dim].set(
-padded_new_k_pe[1])
-# token 2
-page_idx, row, col = 0, 1, 0
-expected_cache = expected_cache.at[page_idx, row,
-col, :padded_lkv_dim].set(
-padded_new_kv_c[2])
-expected_cache = expected_cache.at[page_idx, row, col,
-padded_lkv_dim:padded_lkv_dim +
-padded_r_dim].set(
-padded_new_k_pe[2])
-
-# Second sequence
-# token 0
-page_idx, row, col = 1, 0, 0
-expected_cache = expected_cache.at[page_idx, row,
-col, :padded_lkv_dim].set(
-padded_new_kv_c[3])
-expected_cache = expected_cache.at[page_idx, row, col,
-padded_lkv_dim:padded_lkv_dim +
-padded_r_dim].set(
-padded_new_k_pe[3])
-
-updated_cache = mla.update_kv_cache(
-new_kv_c,
-new_k_pe,
-cache_kv,
-kv_lens,
-page_indices,
-cu_q_lens,
-distribution,
-)
-
-self.assertAllClose(updated_cache, expected_cache)
-
-def test_get_kv_cache_shape(self):
-total_num_pages = 10
-page_size = 16
-lkv_dim = 128
-kv_dtype = jnp.bfloat16
-# The calculation for the expected shape is as follows:
-# kv_packing is determined by the dtype, which is 2 for bfloat16.
-# The second dimension is page_size / kv_packing = 16 / 2 = 8
-# The third dimension is kv_packing = 2
-# The fourth dimension is lkv_dim aligned to 128, which is 128
-expected_shape = (10, 8, 2, 128)
-self.assertEqual(
-mla.get_kv_cache_shape(total_num_pages, page_size, lkv_dim,
-kv_dtype), expected_shape)
 
 def test_ragged_paged_attention_basic(self):
 dtype = jnp.bfloat16
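Note on the deleted test_update_kv_cache: it hard-codes a (page, row, col) slot for every token using page_size=4 and the bfloat16 packing factor of 2. The mapping those constants encode can be written out as a small sketch; this is illustrative only, and slot_for_token is not a function in the package:

def slot_for_token(pos_in_seq, page_size=4, packing=2):
    # A sequence's pages store page_size tokens laid out as
    # (page, page_size // packing, packing, features), as in the deleted test.
    page_offset = pos_in_seq // page_size      # which of the sequence's pages
    row = (pos_in_seq % page_size) // packing  # packed row within the page
    col = pos_in_seq % packing                 # slot within the packed row
    return page_offset, row, col

# Reproduces the slots hard-coded in the deleted test:
assert slot_for_token(0) == (0, 0, 0)  # first sequence, token 0
assert slot_for_token(1) == (0, 0, 1)  # first sequence, token 1
assert slot_for_token(2) == (0, 1, 0)  # first sequence, token 2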
{tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/kernels/quantized_matmul_kernel_test.py
RENAMED
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
+import functools
+
 import jax
 import jax.numpy as jnp
 from absl.testing import absltest, parameterized
@@ -8,7 +10,6 @@ from jax._src import test_util as jtu
 from tpu_inference.kernels.quantized_matmul import (kernel, tuned_block_sizes,
 util)
 
-xla_quantized_matmul = kernel.xla_quantized_matmul
 quantized_matmul_kernel = kernel.quantized_matmul_kernel
 quantize_tensor = util.quantize_tensor
 get_tuned_block_sizes = tuned_block_sizes.get_tuned_block_sizes
@@ -16,6 +17,37 @@ get_tuned_block_sizes = tuned_block_sizes.get_tuned_block_sizes
 jax.config.parse_flags_with_absl()
 
 
+@functools.partial(jax.jit, static_argnames=["quantize_activation"])
+def reference_quantized_matmul(
+x: jax.Array,
+w_q: jax.Array,
+w_scale: jax.Array,
+quantize_activation=True,
+):
+if quantize_activation:
+acc_dtype = jnp.float32
+if quantize_activation and jnp.issubdtype(w_q.dtype, jnp.integer):
+acc_dtype = jnp.int32
+
+x_q, x_scale = quantize_tensor(x, w_q.dtype)
+out = jax.lax.dot_general(
+x_q,
+w_q,
+dimension_numbers=(((1, ), (1, )), ((), ())),
+preferred_element_type=acc_dtype,
+).astype(jnp.float32)
+out *= x_scale
+else:
+out = jax.lax.dot_general(
+x,
+w_q,
+dimension_numbers=(((1, ), (1, )), ((), ())),
+preferred_element_type=jnp.float32,
+)
+out *= jnp.expand_dims(w_scale, 0)
+return out.astype(x.dtype)
+
+
 @jtu.with_config(jax_numpy_dtype_promotion="standard")
 class QuantizedMatmulKernelTest(jtu.JaxTestCase):
 
@@ -62,7 +94,7 @@ class QuantizedMatmulKernelTest(jtu.JaxTestCase):
 x_q_dtype=x_q_dtype,
 tuned_value=tuned_value,
 )
-expected =
+expected = reference_quantized_matmul(
 x, w_q, w_scale, quantize_activation=quantize_activation)
 
 self.assertAllClose(output,
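The deleted xla_quantized_matmul reference is replaced by the reference_quantized_matmul helper defined in the test above. A rough usage sketch follows; it assumes quantize_tensor(tensor, target_dtype) returns the quantized tensor and its scales, mirroring how the helper itself calls it, and the shapes are made up for illustration:

import jax.numpy as jnp

x = jnp.ones((8, 128), dtype=jnp.bfloat16)    # activations (tokens, in_features)
w = jnp.ones((256, 128), dtype=jnp.bfloat16)  # weights (out_features, in_features)
w_q, w_scale = quantize_tensor(w, jnp.int8)   # assumed weight-quantization call
expected = reference_quantized_matmul(x, w_q, w_scale, quantize_activation=True)
# The test then checks the Pallas quantized_matmul_kernel output against
# `expected` with assertAllClose.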
{tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py
RENAMED
@@ -176,9 +176,7 @@ class RaggedPagedAttentionHeadDim64KernelTest(jtu.JaxTestCase):
 )
 output = output[:cu_q_lens[distribution[-1]]]
 
-dtype_bits =
-dtypes, "bit_width") else dtypes.itemsize_bits(
-jnp.dtype(kv_dtype)))
+dtype_bits = dtypes.bit_width(jnp.dtype(kv_dtype))
 tols = {
 32: 0.15,
 16: 0.2,
{tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/kernels/ragged_paged_attention_kernel_v3_test.py
RENAMED
@@ -162,9 +162,7 @@ class RaggedPagedAttentionKernelTest(jtu.JaxTestCase):
 )
 output = output[:cu_q_lens[distribution[-1]]]
 
-dtype_bits =
-dtypes, "bit_width") else dtypes.itemsize_bits(
-jnp.dtype(kv_dtype)))
+dtype_bits = dtypes.bit_width(jnp.dtype(kv_dtype))
 tols = {
 32: 0.15,
 16: 0.2,
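Both ragged paged attention tests now read the KV dtype's width directly from jax._src.dtypes.bit_width instead of branching on a hasattr fallback, and pick a tolerance from the tols table keyed by that width. A quick check of the values involved (illustrative, not test code):

from jax._src import dtypes
import jax.numpy as jnp

assert dtypes.bit_width(jnp.dtype(jnp.float32)) == 32   # maps to tolerance 0.15
assert dtypes.bit_width(jnp.dtype(jnp.bfloat16)) == 16  # maps to tolerance 0.2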
{tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/lora/test_layers.py
RENAMED
@@ -18,7 +18,7 @@ from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
 ReplicatedLinearWithLoRA,
 RowParallelLinearWithLoRA)
 # yapf: enable
-from vllm.lora.
+from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights
 from vllm.lora.punica_wrapper import get_punica_wrapper
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
 MergedColumnParallelLinear,
@@ -499,13 +499,9 @@ def _create_random_linear_parallel_layer(layer_type, vllm_config, mesh):
 return linear, lora_linear
 
 
-def _get_devices():
-return jax.devices()
-
-
 def _create_mesh():
 axis_names = ("data", "model")
-devices =
+devices = jax.devices()
 mesh_shape = (1, len(devices))
 mesh = jax.make_mesh(mesh_shape, axis_names, devices=devices)
 return mesh
@@ -517,7 +513,7 @@ def _verify_lora_linear_layer(linear, lora_linear):
 # BaseLinearLayerWithLoRA.weight property guarantees this.
 # if len(devices) != 1, `reorder_concatenated_tensor_for_sharding` function may reorder the out_features dimension of the weight matrix.
 # So the below check will fail.
-if len(
+if len(jax.devices()) == 1:
 assert torch.equal(linear.weight.data,
 lora_linear.weight.to('cpu'))
 
{tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/lora/test_lora.py
RENAMED
@@ -29,7 +29,7 @@ def setup_vllm(num_loras: int, tp: int = 1) -> vllm.LLM:
 
 
 # For multi-chip test, we only use TP=2 because the base model Qwen/Qwen2.5-3B-Instruct has 2 kv heads and the current attention kernel requires it to be divisible by tp_size.
-TP = [2] if os.environ.get("
+TP = [2] if os.environ.get("USE_V6E8_QUEUE", False) else [1]
 
 
 @pytest.mark.parametrize("tp", TP)
{tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/test_envs.py
RENAMED
@@ -60,7 +60,6 @@ def test_boolean_env_vars(monkeypatch: pytest.MonkeyPatch):
 monkeypatch.setenv("SKIP_JAX_PRECOMPILE", "0")
 monkeypatch.setenv("VLLM_XLA_CHECK_RECOMPILATION", "0")
 monkeypatch.setenv("NEW_MODEL_DESIGN", "0")
-monkeypatch.setenv("ENABLE_QUANTIZED_MATMUL_KERNEL", "0")
 monkeypatch.setenv("USE_MOE_EP_KERNEL", "0")
 
 # Test SKIP_JAX_PRECOMPILE (default False)
@@ -87,82 +86,6 @@ def test_boolean_env_vars(monkeypatch: pytest.MonkeyPatch):
 monkeypatch.setenv("USE_MOE_EP_KERNEL", "1")
 assert envs.USE_MOE_EP_KERNEL is True
 
-# Test ENABLE_QUANTIZED_MATMUL_KERNEL (default False)
-assert envs.ENABLE_QUANTIZED_MATMUL_KERNEL is False
-monkeypatch.setenv("ENABLE_QUANTIZED_MATMUL_KERNEL", "1")
-assert envs.ENABLE_QUANTIZED_MATMUL_KERNEL is True
-
-
-def test_boolean_env_vars_string_values(monkeypatch: pytest.MonkeyPatch):
-"""Test that boolean env vars accept string values like 'True' and 'False'"""
-
-# Test NEW_MODEL_DESIGN with string "True"
-monkeypatch.setenv("NEW_MODEL_DESIGN", "True")
-assert envs.NEW_MODEL_DESIGN is True
-
-monkeypatch.setenv("NEW_MODEL_DESIGN", "true")
-assert envs.NEW_MODEL_DESIGN is True
-
-monkeypatch.setenv("NEW_MODEL_DESIGN", "False")
-assert envs.NEW_MODEL_DESIGN is False
-
-monkeypatch.setenv("NEW_MODEL_DESIGN", "false")
-assert envs.NEW_MODEL_DESIGN is False
-
-# Test SKIP_JAX_PRECOMPILE with string values
-monkeypatch.setenv("SKIP_JAX_PRECOMPILE", "True")
-assert envs.SKIP_JAX_PRECOMPILE is True
-
-monkeypatch.setenv("SKIP_JAX_PRECOMPILE", "false")
-assert envs.SKIP_JAX_PRECOMPILE is False
-
-# Test VLLM_XLA_CHECK_RECOMPILATION with string values
-monkeypatch.setenv("VLLM_XLA_CHECK_RECOMPILATION", "TRUE")
-assert envs.VLLM_XLA_CHECK_RECOMPILATION is True
-
-monkeypatch.setenv("VLLM_XLA_CHECK_RECOMPILATION", "FALSE")
-assert envs.VLLM_XLA_CHECK_RECOMPILATION is False
-
-# Test USE_MOE_EP_KERNEL with string values
-monkeypatch.setenv("USE_MOE_EP_KERNEL", "true")
-assert envs.USE_MOE_EP_KERNEL is True
-
-monkeypatch.setenv("USE_MOE_EP_KERNEL", "False")
-assert envs.USE_MOE_EP_KERNEL is False
-
-
-def test_boolean_env_vars_invalid_values(monkeypatch: pytest.MonkeyPatch):
-"""Test that boolean env vars raise errors for invalid values"""
-
-# Test invalid value for NEW_MODEL_DESIGN
-monkeypatch.setenv("NEW_MODEL_DESIGN", "yes")
-with pytest.raises(
-ValueError,
-match="Invalid boolean value 'yes' for NEW_MODEL_DESIGN"):
-_ = envs.NEW_MODEL_DESIGN
-
-monkeypatch.setenv("NEW_MODEL_DESIGN", "2")
-with pytest.raises(ValueError,
-match="Invalid boolean value '2' for NEW_MODEL_DESIGN"):
-_ = envs.NEW_MODEL_DESIGN
-
-# Test invalid value for SKIP_JAX_PRECOMPILE
-monkeypatch.setenv("SKIP_JAX_PRECOMPILE", "invalid")
-with pytest.raises(
-ValueError,
-match="Invalid boolean value 'invalid' for SKIP_JAX_PRECOMPILE"):
-_ = envs.SKIP_JAX_PRECOMPILE
-
-
-def test_boolean_env_vars_empty_string(monkeypatch: pytest.MonkeyPatch):
-"""Test that empty string returns default value"""
-
-monkeypatch.setenv("NEW_MODEL_DESIGN", "")
-assert envs.NEW_MODEL_DESIGN is False # Should return default
-
-monkeypatch.setenv("SKIP_JAX_PRECOMPILE", "")
-assert envs.SKIP_JAX_PRECOMPILE is False # Should return default
-
 
 def test_integer_env_vars(monkeypatch: pytest.MonkeyPatch):
 # Ensure clean environment for integer vars by setting to defaults
@@ -256,7 +179,7 @@ def test_disaggregated_serving_env_vars(monkeypatch: pytest.MonkeyPatch):
 
 def test_model_impl_type_default(monkeypatch: pytest.MonkeyPatch):
 monkeypatch.delenv("MODEL_IMPL_TYPE", raising=False)
-assert envs.MODEL_IMPL_TYPE == "
+assert envs.MODEL_IMPL_TYPE == "flax_nnx"
 
 
 def test_cache_preserves_values_across_env_changes(
{tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tests/test_quantization.py
RENAMED
@@ -112,8 +112,6 @@ class TestQwixQuantizeNnxModel(unittest.TestCase):
 self.mesh = Mesh(jax.devices(), ('model', ))
 self.rng = jax.random.PRNGKey(0)
 self.model = SimpleModel(rngs=nnx.Rngs(0))
-self.model.vllm_config = MagicMock()
-self.model.vllm_config.model_config.use_mla = False
 
 self.qwix_config = [
 {
@@ -133,7 +131,6 @@ class TestQwixQuantizeNnxModel(unittest.TestCase):
 """Test that qwix.quantize_model is called with the correct arguments."""
 quantized_model_mock = MagicMock(spec=nnx.Module)
 mock_quantize_model.return_value = quantized_model_mock
-self.model.vllm_config.sharding_config.total_dp_size = 1
 
 with patch(
 "tpu_inference.models.jax.utils.quantization.quantization_utils.init_logger",
{tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/distributed/tpu_connector.py
RENAMED
@@ -694,9 +694,9 @@ class TPUConnectorWorker:
 
 def get_uuid() -> int:
 int128 = uuid4().int
-# Must be
-
-return
+# Must be 64-bit int, otherwise vllm output encoder would raise error.
+int64 = int128 >> 64
+return int64
 
 
 @jax.jit
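The rewritten get_uuid keeps only the upper 64 bits of a random 128-bit UUID so that the returned id fits in a 64-bit integer, as the new comment explains. A standalone sketch of the same arithmetic (not the package's module, just the idea):

from uuid import uuid4

def get_uuid() -> int:
    int128 = uuid4().int   # 128-bit random integer
    return int128 >> 64    # keep the top 64 bits: the result is always < 2**64

assert 0 <= get_uuid() < 2**64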
{tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/envs.py
RENAMED
@@ -16,7 +16,7 @@ if TYPE_CHECKING:
 DECODE_SLICES: str = ""
 SKIP_JAX_PRECOMPILE: bool = False
 VLLM_XLA_CHECK_RECOMPILATION: bool = False
-MODEL_IMPL_TYPE: str = "
+MODEL_IMPL_TYPE: str = "flax_nnx"
 NEW_MODEL_DESIGN: bool = False
 PHASED_PROFILING_DIR: str = ""
 PYTHON_TRACER_LEVEL: int = 1
@@ -24,7 +24,6 @@ if TYPE_CHECKING:
 NUM_SLICES: int = 1
 RAY_USAGE_STATS_ENABLED: str = "0"
 VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "shm"
-ENABLE_QUANTIZED_MATMUL_KERNEL: bool = False
 
 
 def env_with_choices(
@@ -70,34 +69,6 @@ def env_with_choices(
 return _get_validated_env
 
 
-def env_bool(env_name: str, default: bool = False) -> Callable[[], bool]:
-"""
-Accepts both numeric strings ("0", "1") and boolean strings
-("true", "false", "True", "False").
-
-Args:
-env_name: Name of the environment variable
-default: Default boolean value if not set
-"""
-
-def _get_bool_env() -> bool:
-value = os.getenv(env_name)
-if value is None or value == "":
-return default
-
-value_lower = value.lower()
-if value_lower in ("true", "1"):
-return True
-elif value_lower in ("false", "0"):
-return False
-else:
-raise ValueError(
-f"Invalid boolean value '{value}' for {env_name}. "
-f"Valid options: '0', '1', 'true', 'false', 'True', 'False'.")
-
-return _get_bool_env
-
-
 environment_variables: dict[str, Callable[[], Any]] = {
 # JAX platform selection (e.g., "tpu", "cpu", "proxy")
 "JAX_PLATFORMS":
@@ -122,17 +93,17 @@ environment_variables: dict[str, Callable[[], Any]] = {
 lambda: os.getenv("DECODE_SLICES", ""),
 # Skip JAX precompilation step during initialization
 "SKIP_JAX_PRECOMPILE":
-
+lambda: bool(int(os.getenv("SKIP_JAX_PRECOMPILE") or "0")),
 # Check for XLA recompilation during execution
 "VLLM_XLA_CHECK_RECOMPILATION":
-
+lambda: bool(int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION") or "0")),
 # Model implementation type (e.g., "flax_nnx")
 "MODEL_IMPL_TYPE":
-env_with_choices("MODEL_IMPL_TYPE", "
-["
+env_with_choices("MODEL_IMPL_TYPE", "flax_nnx",
+["vllm", "flax_nnx", "jetpack"]),
 # Enable new experimental model design
 "NEW_MODEL_DESIGN":
-
+lambda: bool(int(os.getenv("NEW_MODEL_DESIGN") or "0")),
 # Directory to store phased profiling output
 "PHASED_PROFILING_DIR":
 lambda: os.getenv("PHASED_PROFILING_DIR", ""),
@@ -141,7 +112,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
 lambda: int(os.getenv("PYTHON_TRACER_LEVEL") or "1"),
 # Use custom expert-parallel kernel for MoE (Mixture of Experts)
 "USE_MOE_EP_KERNEL":
-
+lambda: bool(int(os.getenv("USE_MOE_EP_KERNEL") or "0")),
 # Number of TPU slices for multi-slice mesh
 "NUM_SLICES":
 lambda: int(os.getenv("NUM_SLICES") or "1"),
@@ -151,8 +122,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
 # Ray compiled DAG channel type for TPU
 "VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE":
 env_with_choices("VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE", "shm", ["shm"]),
-"ENABLE_QUANTIZED_MATMUL_KERNEL":
-lambda: bool(int(os.getenv("ENABLE_QUANTIZED_MATMUL_KERNEL") or "0")),
 }
 
 
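With the env_bool helper removed, the boolean flags above (SKIP_JAX_PRECOMPILE, VLLM_XLA_CHECK_RECOMPILATION, NEW_MODEL_DESIGN, USE_MOE_EP_KERNEL) are parsed as bool(int(value or "0")), so only numeric strings are accepted. A minimal sketch of the behavioral difference, using a hypothetical FLAG variable:

import os

os.environ["FLAG"] = "1"
assert bool(int(os.getenv("FLAG") or "0")) is True   # "0"/"1" still work

os.environ["FLAG"] = "true"
try:
    bool(int(os.getenv("FLAG") or "0"))              # accepted by the old env_bool
except ValueError:
    print("string booleans now raise ValueError")    # new lambdas reject them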
{tpu_inference-0.12.0.dev20251219 → tpu_inference-0.12.0rc1}/tpu_inference/executors/ray_distributed_executor.py
RENAMED
@@ -145,9 +145,6 @@ class RayDistributedExecutor(RayDistributedExecutorV1):
 device_str: node['Resources'][device_str]
 } for node in ray_nodes]
 else:
-assert pp_size == len(
-ray_nodes
-), f"Cannot use PP across hosts, please set --pipeline-parallel-size to 1 or {len(ray_nodes)}"
 num_devices_per_pp_rank = self.vllm_config.sharding_config.total_devices
 placement_group_specs = [{
 device_str: num_devices_per_pp_rank