PyPI - tpu-inference - Versions diffs - 0.12.0.dev20251207__tar.gz → 0.12.0rc1__tar.gz - Mend

tpu-inference 0.12.0.dev20251207__tar.gz → 0.12.0rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tpu-inference might be problematic. Click here for more details.

Files changed (182) hide show

{tpu_inference-0.12.0.dev20251207/tpu_inference.egg-info → tpu_inference-0.12.0rc1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tpu_inference
-Version: 0.12.0.dev20251207
+Version: 0.12.0rc1
 Author: tpu_inference Contributors
 Classifier: Development Status :: 3 - Alpha
 Classifier: Intended Audience :: Developers
@@ -25,7 +25,7 @@ Requires-Dist: jax[tpu]==0.8.0
 Requires-Dist: jaxlib==0.8.0
 Requires-Dist: jaxtyping
 Requires-Dist: flax==0.11.1
-Requires-Dist: torchax==0.0.10
+Requires-Dist: torchax==0.0.7
 Requires-Dist: qwix==0.1.1
 Requires-Dist: torchvision==0.24.0
 Requires-Dist: pathwaysutils

{tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/requirements.txt RENAMED Viewed

@@ -9,7 +9,7 @@ jax[tpu]==0.8.0
 jaxlib==0.8.0
 jaxtyping
 flax==0.11.1
-torchax==0.0.10
+torchax==0.0.7
 qwix==0.1.1
 torchvision==0.24.0
 pathwaysutils

{tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/kernels/mla_v1_test.py RENAMED Viewed

@@ -42,7 +42,6 @@ class MlaRaggedPagedAttentionKernelTest(jtu.JaxTestCase):
         padded_r_dim = align_to(r_dim, 128)
         padded_lkv_dim = align_to(lkv_dim, 128)
-        padded_kv_dim = padded_lkv_dim + padded_r_dim
         packing = get_dtype_packing(kv_dtype)
         q_lens = [s[0] for s in seq_lens]
         kv_lens_list = [s[1] for s in seq_lens]
@@ -70,10 +69,13 @@ class MlaRaggedPagedAttentionKernelTest(jtu.JaxTestCase):
         new_kv_c = gen_random((total_q_len, lkv_dim), kv_dtype)
         new_k_pe = gen_random((total_q_len, r_dim), kv_dtype)
-        cache_kv = gen_random(
-            (total_num_pages, page_size // packing, packing, padded_kv_dim),
+        cache_kv_c = gen_random(
+            (total_num_pages, page_size // packing, packing, padded_lkv_dim),
             kv_dtype,
         )
+        cache_k_pe = gen_random(
+            (total_num_pages, page_size // packing, packing, padded_r_dim),
+            kv_dtype)
         kv_lens = jnp.array(kv_lens_list, dtype=jnp.int32)
         page_indices = jnp.array(page_indices_list, dtype=jnp.int32)
         cu_q_lens = jnp.array(cu_q_lens_list, dtype=jnp.int32)
@@ -82,13 +84,14 @@ class MlaRaggedPagedAttentionKernelTest(jtu.JaxTestCase):
         ql_nope_for_kernel = ql_nope.copy()
         q_pe_for_kernel = q_pe.copy()
-        expected_out, expected_updated_kv = (
+        expected_out, expected_updated_kv_c, expeceted_updated_k_pe = (
             mla.ref_mla_ragged_paged_attention(
                 ql_nope,
                 q_pe,
                 new_kv_c,
                 new_k_pe,
-                cache_kv.copy(),
+                cache_kv_c.copy(),
+                cache_k_pe.copy(),
                 kv_lens,
                 page_indices,
                 cu_q_lens,
@@ -98,140 +101,49 @@ class MlaRaggedPagedAttentionKernelTest(jtu.JaxTestCase):
                 soft_cap=soft_cap,
             ))
-        kernel_out, kernel_updated_kv = (mla.mla_ragged_paged_attention(
-            ql_nope_for_kernel,
-            q_pe_for_kernel,
-            new_kv_c,
-            new_k_pe,
-            cache_kv.copy(),
-            kv_lens,
-            page_indices,
-            cu_q_lens,
-            distribution,
-            sm_scale=sm_scale,
-            sliding_window=sliding_window,
-            soft_cap=soft_cap,
-            num_kv_pages_per_block=num_kv_pages_per_block,
-            num_queries_per_block=num_queries_per_block,
-            vmem_limit_bytes=vmem_limit_bytes,
-        ))
+        kernel_out, kernel_updated_kv_c, kernel_updated_k_pe = (
+            mla.mla_ragged_paged_attention(
+                ql_nope_for_kernel,
+                q_pe_for_kernel,
+                new_kv_c,
+                new_k_pe,
+                cache_kv_c.copy(),
+                cache_k_pe.copy(),
+                kv_lens,
+                page_indices,
+                cu_q_lens,
+                distribution,
+                sm_scale=sm_scale,
+                sliding_window=sliding_window,
+                soft_cap=soft_cap,
+                num_kv_pages_per_block=num_kv_pages_per_block,
+                num_queries_per_block=num_queries_per_block,
+                vmem_limit_bytes=vmem_limit_bytes,
+            ))
         self.assertEqual(expected_out.shape,
                          (total_q_len, num_heads, padded_lkv_dim))
         self.assertEqual(
-            expected_updated_kv.shape,
-            (total_num_pages, page_size // packing, packing, padded_kv_dim),
+            expected_updated_kv_c.shape,
+            (total_num_pages, page_size // packing, packing, padded_lkv_dim),
+        )
+        self.assertEqual(
+            expeceted_updated_k_pe.shape,
+            (total_num_pages, page_size // packing, packing, padded_r_dim),
         )
         self.assertEqual(expected_out.dtype, kv_dtype)
-        self.assertEqual(expected_updated_kv.dtype, kv_dtype)
+        self.assertEqual(expected_updated_kv_c.dtype, kv_dtype)
+        self.assertEqual(expeceted_updated_k_pe.dtype, kv_dtype)
         self.assertAllClose(expected_out, kernel_out, atol=0.2, rtol=0.2)
-        self.assertAllClose(expected_updated_kv,
-                            kernel_updated_kv,
+        self.assertAllClose(expected_updated_kv_c,
+                            kernel_updated_kv_c,
+                            atol=0.2,
+                            rtol=0.2)
+        self.assertAllClose(expeceted_updated_k_pe,
+                            kernel_updated_k_pe,
                             atol=0.2,
                             rtol=0.2)
-    def test_update_kv_cache(self):
-        lkv_dim = 4
-        r_dim = 4
-        padded_lkv_dim = align_to(lkv_dim, 128)
-        padded_r_dim = align_to(r_dim, 128)
-        kv_dtype = jnp.bfloat16
-        new_kv_c = jnp.arange(16, dtype=kv_dtype).reshape((4, lkv_dim))
-        new_k_pe = (jnp.arange(16, dtype=kv_dtype).reshape((4, r_dim)) + 100)
-        total_num_pages = 2
-        page_size = 4
-        cache_kv_shape = mla.get_kv_cache_shape(
-            total_num_pages,
-            page_size,
-            padded_lkv_dim + padded_r_dim,
-            kv_dtype,
-        )
-        cache_kv = jnp.zeros(cache_kv_shape, dtype=kv_dtype)
-        # two sequences, first with 3 tokens, second with 1 token
-        kv_lens = jnp.array([3, 1], dtype=jnp.int32)
-        # first seq uses page 0, second uses page 1
-        page_indices = jnp.array([0, -1, 1, -1], dtype=jnp.int32)
-        # three tokens for first seq, one for second
-        cu_q_lens = jnp.array([0, 3, 4], dtype=jnp.int32)
-        distribution = jnp.array([0, 0, 2], dtype=jnp.int32)
-        # manually compute the expected cache
-        padded_new_kv_c = jnp.pad(new_kv_c,
-                                  ((0, 0), (0, padded_lkv_dim - lkv_dim)),
-                                  constant_values=0)
-        padded_new_k_pe = jnp.pad(new_k_pe,
-                                  ((0, 0), (0, padded_r_dim - r_dim)),
-                                  constant_values=0)
-        expected_cache = cache_kv
-        # First sequence
-        # token 0
-        page_idx, row, col = 0, 0, 0
-        expected_cache = expected_cache.at[page_idx, row,
-                                           col, :padded_lkv_dim].set(
-                                               padded_new_kv_c[0])
-        expected_cache = expected_cache.at[page_idx, row, col,
-                                           padded_lkv_dim:padded_lkv_dim +
-                                           padded_r_dim].set(
-                                               padded_new_k_pe[0])
-        # token 1
-        page_idx, row, col = 0, 0, 1
-        expected_cache = expected_cache.at[page_idx, row,
-                                           col, :padded_lkv_dim].set(
-                                               padded_new_kv_c[1])
-        expected_cache = expected_cache.at[page_idx, row, col,
-                                           padded_lkv_dim:padded_lkv_dim +
-                                           padded_r_dim].set(
-                                               padded_new_k_pe[1])
-        # token 2
-        page_idx, row, col = 0, 1, 0
-        expected_cache = expected_cache.at[page_idx, row,
-                                           col, :padded_lkv_dim].set(
-                                               padded_new_kv_c[2])
-        expected_cache = expected_cache.at[page_idx, row, col,
-                                           padded_lkv_dim:padded_lkv_dim +
-                                           padded_r_dim].set(
-                                               padded_new_k_pe[2])
-        # Second sequence
-        # token 0
-        page_idx, row, col = 1, 0, 0
-        expected_cache = expected_cache.at[page_idx, row,
-                                           col, :padded_lkv_dim].set(
-                                               padded_new_kv_c[3])
-        expected_cache = expected_cache.at[page_idx, row, col,
-                                           padded_lkv_dim:padded_lkv_dim +
-                                           padded_r_dim].set(
-                                               padded_new_k_pe[3])
-        updated_cache = mla.update_kv_cache(
-            new_kv_c,
-            new_k_pe,
-            cache_kv,
-            kv_lens,
-            page_indices,
-            cu_q_lens,
-            distribution,
-        )
-        self.assertAllClose(updated_cache, expected_cache)
-    def test_get_kv_cache_shape(self):
-        total_num_pages = 10
-        page_size = 16
-        lkv_dim = 128
-        kv_dtype = jnp.bfloat16
-        # The calculation for the expected shape is as follows:
-        # kv_packing is determined by the dtype, which is 2 for bfloat16.
-        # The second dimension is page_size / kv_packing = 16 / 2 = 8
-        # The third dimension is kv_packing = 2
-        # The fourth dimension is lkv_dim aligned to 128, which is 128
-        expected_shape = (10, 8, 2, 128)
-        self.assertEqual(
-            mla.get_kv_cache_shape(total_num_pages, page_size, lkv_dim,
-                                   kv_dtype), expected_shape)
     def test_ragged_paged_attention_basic(self):
         dtype = jnp.bfloat16

{tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/test_quantization.py RENAMED Viewed

@@ -112,8 +112,6 @@ class TestQwixQuantizeNnxModel(unittest.TestCase):
         self.mesh = Mesh(jax.devices(), ('model', ))
         self.rng = jax.random.PRNGKey(0)
         self.model = SimpleModel(rngs=nnx.Rngs(0))
-        self.model.vllm_config = MagicMock()
-        self.model.vllm_config.model_config.use_mla = False
         self.qwix_config = [
             {
@@ -133,7 +131,6 @@ class TestQwixQuantizeNnxModel(unittest.TestCase):
         """Test that qwix.quantize_model is called with the correct arguments."""
         quantized_model_mock = MagicMock(spec=nnx.Module)
         mock_quantize_model.return_value = quantized_model_mock
-        self.model.vllm_config.sharding_config.total_dp_size = 1
         with patch(
                 "tpu_inference.models.jax.utils.quantization.quantization_utils.init_logger",

tpu-inference 0.12.0.dev20251207__tar.gz → 0.12.0rc1__tar.gz

Potentially problematic release.

tpu-inference 0.12.0.dev20251207__tar.gz → 0.12.0rc1__tar.gz