tpu-inference 0.11.1.dev202511270815__py3-none-any.whl → 0.13.0rc2.post7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (251):
  1. tests/__init__.py +13 -0
  2. tests/core/__init__.py +13 -0
  3. tests/core/test_disagg_utils.py +14 -0
  4. tests/core/test_dp_scheduler.py +650 -768
  5. tests/core/test_init.py +14 -0
  6. tests/distributed/__init__.py +13 -0
  7. tests/distributed/test_distributed_utils.py +120 -0
  8. tests/distributed/test_tpu_connector.py +478 -0
  9. tests/e2e/__init__.py +13 -0
  10. tests/e2e/test_async_scheduler.py +211 -0
  11. tests/e2e/test_data_parallel.py +289 -0
  12. tests/e2e/test_hybrid_kvcache.py +219 -0
  13. tests/e2e/test_local_disagg.py +257 -0
  14. tests/e2e/test_model_loader.py +268 -0
  15. tests/e2e/test_multi_modal_inference.py +111 -0
  16. tests/e2e/test_pipeline_parallel.py +265 -0
  17. tests/e2e/test_runai_model_streamer_loader.py +104 -0
  18. tests/e2e/test_sampling_params.py +269 -0
  19. tests/e2e/test_speculative_decoding.py +311 -0
  20. tests/e2e/test_structured_decoding.py +46 -0
  21. tests/executors/__init__.py +13 -0
  22. tests/executors/test_ray_distributed_executor.py +199 -0
  23. tests/experimental/__init__.py +13 -0
  24. tests/experimental/test_llama3_jax_stashed.py +208 -0
  25. tests/kernels/__init__.py +13 -0
  26. tests/kernels/collectives/__init__.py +13 -0
  27. tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
  28. tests/kernels/fused_moe_v1_test.py +14 -0
  29. tests/kernels/gmm_test.py +205 -0
  30. tests/kernels/mla_v1_test.py +143 -41
  31. tests/kernels/quantized_matmul_kernel_test.py +2 -34
  32. tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
  33. tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
  34. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +17 -1
  35. tests/kernels/ragged_paged_attention_kernel_v3_test.py +17 -1
  36. tests/layers/__init__.py +13 -0
  37. tests/layers/common/__init__.py +13 -0
  38. tests/layers/common/test_attention_interface.py +156 -0
  39. tests/layers/common/test_quantization.py +149 -0
  40. tests/layers/jax/__init__.py +13 -0
  41. tests/layers/jax/attention/__init__.py +13 -0
  42. tests/layers/jax/attention/test_common_attention.py +103 -0
  43. tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
  44. tests/layers/jax/attention/test_llama4_attention.py +135 -0
  45. tests/layers/jax/moe/__init__.py +13 -0
  46. tests/layers/jax/moe/test_deepseek_moe.py +235 -0
  47. tests/layers/jax/sample/__init__.py +13 -0
  48. tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
  49. tests/layers/jax/sample/test_sampling.py +115 -0
  50. tests/layers/jax/sample/test_sampling_metadata.py +254 -0
  51. tests/layers/jax/test_layers.py +155 -0
  52. tests/{test_quantization.py → layers/jax/test_qwix.py} +183 -50
  53. tests/layers/jax/test_rope.py +93 -0
  54. tests/layers/jax/test_sharding.py +159 -0
  55. tests/layers/jax/test_transformer_block.py +152 -0
  56. tests/layers/vllm/__init__.py +13 -0
  57. tests/layers/vllm/test_attention.py +363 -0
  58. tests/layers/vllm/test_awq.py +405 -0
  59. tests/layers/vllm/test_compressed_tensors_moe.py +202 -0
  60. tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +418 -0
  61. tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +441 -0
  62. tests/layers/vllm/test_fp8.py +17 -0
  63. tests/layers/vllm/test_mxfp4.py +312 -0
  64. tests/layers/vllm/test_unquantized.py +651 -0
  65. tests/layers/vllm/utils.py +87 -0
  66. tests/lora/__init__.py +13 -0
  67. tests/lora/conftest.py +14 -0
  68. tests/lora/test_bgmv.py +14 -0
  69. tests/lora/test_layers.py +21 -3
  70. tests/lora/test_lora.py +15 -1
  71. tests/lora/test_lora_perf.py +67 -0
  72. tests/models/__init__.py +13 -0
  73. tests/models/common/__init__.py +13 -0
  74. tests/models/common/test_model_loader.py +455 -0
  75. tests/models/jax/__init__.py +13 -0
  76. tests/models/jax/test_deepseek_v3.py +401 -0
  77. tests/models/jax/test_llama3.py +184 -0
  78. tests/models/jax/test_llama4.py +298 -0
  79. tests/models/jax/test_llama_eagle3.py +197 -0
  80. tests/models/jax/test_llama_guard_4.py +242 -0
  81. tests/models/jax/test_qwen2.py +172 -0
  82. tests/models/jax/test_qwen2_5_vl.py +605 -0
  83. tests/models/jax/test_qwen3.py +169 -0
  84. tests/models/jax/test_weight_loading.py +180 -0
  85. tests/models/jax/utils/__init__.py +13 -0
  86. tests/models/jax/utils/test_multi_modal_utils.py +212 -0
  87. tests/platforms/__init__.py +13 -0
  88. tests/platforms/test_tpu_platform.py +54 -0
  89. tests/runner/__init__.py +13 -0
  90. tests/runner/test_block_table.py +395 -0
  91. tests/runner/test_input_batch.py +226 -0
  92. tests/runner/test_kv_cache.py +220 -0
  93. tests/runner/test_kv_cache_manager.py +498 -0
  94. tests/runner/test_multimodal_manager.py +429 -0
  95. tests/runner/test_persistent_batch_manager.py +84 -0
  96. tests/runner/test_speculative_decoding_manager.py +368 -0
  97. tests/runner/test_structured_decoding_manager.py +220 -0
  98. tests/runner/test_tpu_runner.py +261 -0
  99. tests/runner/test_tpu_runner_dp.py +1099 -0
  100. tests/runner/test_tpu_runner_mesh.py +200 -0
  101. tests/runner/test_utils.py +411 -0
  102. tests/spec_decode/__init__.py +13 -0
  103. tests/spec_decode/test_eagle3.py +311 -0
  104. tests/test_base.py +14 -0
  105. tests/test_envs.py +110 -12
  106. tests/test_tpu_info.py +14 -0
  107. tests/test_utils.py +2 -45
  108. tests/worker/__init__.py +13 -0
  109. tests/worker/tpu_worker_test.py +414 -0
  110. tpu_inference/__init__.py +14 -0
  111. tpu_inference/core/__init__.py +13 -0
  112. tpu_inference/core/sched/__init__.py +13 -0
  113. tpu_inference/core/sched/dp_scheduler.py +372 -56
  114. tpu_inference/distributed/__init__.py +13 -0
  115. tpu_inference/distributed/jax_parallel_state.py +14 -0
  116. tpu_inference/distributed/tpu_connector.py +15 -10
  117. tpu_inference/distributed/utils.py +56 -4
  118. tpu_inference/envs.py +92 -8
  119. tpu_inference/executors/__init__.py +13 -0
  120. tpu_inference/executors/ray_distributed_executor.py +22 -1
  121. tpu_inference/experimental/__init__.py +13 -0
  122. tpu_inference/experimental/llama3_jax_stashed.py +14 -0
  123. tpu_inference/kernels/__init__.py +13 -0
  124. tpu_inference/kernels/collectives/__init__.py +13 -0
  125. tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
  126. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
  127. tpu_inference/kernels/flash_attention/__init__.py +13 -0
  128. tpu_inference/kernels/fused_moe/__init__.py +13 -0
  129. tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
  130. tpu_inference/kernels/fused_moe/v1/kernel.py +370 -324
  131. tpu_inference/kernels/megablox/__init__.py +13 -0
  132. tpu_inference/kernels/megablox/common.py +54 -0
  133. tpu_inference/kernels/megablox/gmm.py +646 -0
  134. tpu_inference/kernels/mla/__init__.py +13 -0
  135. tpu_inference/kernels/mla/v1/__init__.py +13 -0
  136. tpu_inference/kernels/mla/v1/kernel.py +117 -145
  137. tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
  138. tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
  139. tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
  140. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
  141. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
  142. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
  143. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
  144. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +194 -101
  145. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +167 -97
  146. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
  147. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +376 -195
  148. tpu_inference/kernels/ragged_paged_attention/v3/util.py +15 -1
  149. tpu_inference/layers/__init__.py +13 -0
  150. tpu_inference/layers/common/__init__.py +13 -0
  151. tpu_inference/layers/common/attention_interface.py +26 -19
  152. tpu_inference/layers/common/attention_metadata.py +14 -0
  153. tpu_inference/layers/common/quant_methods.py +15 -0
  154. tpu_inference/layers/common/quantization.py +270 -0
  155. tpu_inference/layers/common/sharding.py +31 -9
  156. tpu_inference/layers/jax/__init__.py +13 -0
  157. tpu_inference/layers/jax/attention/__init__.py +13 -0
  158. tpu_inference/layers/jax/attention/attention.py +19 -6
  159. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +270 -77
  160. tpu_inference/layers/jax/attention/gpt_oss_attention.py +24 -11
  161. tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
  162. tpu_inference/layers/jax/base.py +14 -0
  163. tpu_inference/layers/jax/constants.py +13 -0
  164. tpu_inference/layers/jax/layers.py +14 -0
  165. tpu_inference/layers/jax/misc.py +14 -0
  166. tpu_inference/layers/jax/moe/__init__.py +13 -0
  167. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
  168. tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
  169. tpu_inference/layers/jax/moe/moe.py +43 -3
  170. tpu_inference/layers/jax/pp_utils.py +53 -0
  171. tpu_inference/layers/jax/rope.py +14 -0
  172. tpu_inference/layers/jax/rope_interface.py +14 -0
  173. tpu_inference/layers/jax/sample/__init__.py +13 -0
  174. tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
  175. tpu_inference/layers/jax/sample/sampling.py +15 -1
  176. tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
  177. tpu_inference/layers/jax/transformer_block.py +14 -0
  178. tpu_inference/layers/vllm/__init__.py +13 -0
  179. tpu_inference/layers/vllm/attention.py +4 -4
  180. tpu_inference/layers/vllm/fused_moe.py +210 -260
  181. tpu_inference/layers/vllm/linear_common.py +57 -22
  182. tpu_inference/layers/vllm/quantization/__init__.py +16 -0
  183. tpu_inference/layers/vllm/quantization/awq.py +15 -1
  184. tpu_inference/layers/vllm/quantization/common.py +33 -18
  185. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
  186. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +18 -3
  187. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +211 -148
  188. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
  189. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +14 -0
  190. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +14 -0
  191. tpu_inference/layers/vllm/quantization/fp8.py +118 -0
  192. tpu_inference/layers/vllm/quantization/mxfp4.py +280 -210
  193. tpu_inference/layers/vllm/quantization/unquantized.py +134 -86
  194. tpu_inference/layers/vllm/sharding.py +21 -4
  195. tpu_inference/lora/__init__.py +13 -0
  196. tpu_inference/lora/torch_lora_ops.py +8 -13
  197. tpu_inference/models/__init__.py +13 -0
  198. tpu_inference/models/common/__init__.py +13 -0
  199. tpu_inference/models/common/model_loader.py +77 -36
  200. tpu_inference/models/jax/__init__.py +13 -0
  201. tpu_inference/models/jax/deepseek_v3.py +267 -157
  202. tpu_inference/models/jax/gpt_oss.py +26 -10
  203. tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
  204. tpu_inference/models/jax/llama3.py +99 -36
  205. tpu_inference/models/jax/llama4.py +14 -0
  206. tpu_inference/models/jax/llama_eagle3.py +14 -0
  207. tpu_inference/models/jax/llama_guard_4.py +15 -1
  208. tpu_inference/models/jax/qwen2.py +17 -2
  209. tpu_inference/models/jax/qwen2_5_vl.py +18 -4
  210. tpu_inference/models/jax/qwen3.py +17 -2
  211. tpu_inference/models/jax/utils/__init__.py +13 -0
  212. tpu_inference/models/jax/utils/file_utils.py +14 -0
  213. tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
  214. tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
  215. tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +91 -31
  216. tpu_inference/models/jax/utils/weight_utils.py +39 -2
  217. tpu_inference/models/vllm/__init__.py +13 -0
  218. tpu_inference/models/vllm/vllm_model_wrapper.py +20 -4
  219. tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
  220. tpu_inference/platforms/__init__.py +14 -0
  221. tpu_inference/platforms/tpu_platform.py +47 -71
  222. tpu_inference/runner/__init__.py +13 -0
  223. tpu_inference/runner/compilation_manager.py +158 -63
  224. tpu_inference/runner/kv_cache.py +54 -20
  225. tpu_inference/runner/kv_cache_manager.py +53 -30
  226. tpu_inference/runner/lora_utils.py +14 -0
  227. tpu_inference/runner/multimodal_manager.py +15 -1
  228. tpu_inference/runner/persistent_batch_manager.py +54 -2
  229. tpu_inference/runner/speculative_decoding_manager.py +14 -0
  230. tpu_inference/runner/structured_decoding_manager.py +14 -0
  231. tpu_inference/runner/tpu_runner.py +105 -57
  232. tpu_inference/runner/utils.py +2 -2
  233. tpu_inference/spec_decode/__init__.py +13 -0
  234. tpu_inference/spec_decode/jax/__init__.py +13 -0
  235. tpu_inference/spec_decode/jax/eagle3.py +65 -19
  236. tpu_inference/tpu_info.py +14 -0
  237. tpu_inference/utils.py +72 -44
  238. tpu_inference/worker/__init__.py +13 -0
  239. tpu_inference/worker/tpu_worker.py +65 -52
  240. {tpu_inference-0.11.1.dev202511270815.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/METADATA +11 -9
  241. tpu_inference-0.13.0rc2.post7.dist-info/RECORD +261 -0
  242. tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
  243. tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
  244. tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
  245. tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
  246. tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
  247. tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
  248. tpu_inference-0.11.1.dev202511270815.dist-info/RECORD +0 -174
  249. {tpu_inference-0.11.1.dev202511270815.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/WHEEL +0 -0
  250. {tpu_inference-0.11.1.dev202511270815.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/licenses/LICENSE +0 -0
  251. {tpu_inference-0.11.1.dev202511270815.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/top_level.txt +0 -0
tests/{test_quantization.py → layers/jax/test_qwix.py} (renamed, +183 -50)

@@ -11,9 +11,9 @@ from jax.sharding import Mesh, NamedSharding
 from jax.sharding import PartitionSpec as P
 from qwix._src.providers import ptq
 
-import tpu_inference.models.jax.utils.quantization.quantization_utils as quantize_qwix  # noqa: E402
+import tpu_inference.models.jax.utils.qwix.qwix_utils as quantize_qwix  # noqa: E402
 from tpu_inference.models.common.model_loader import apply_qwix_quantization
-from tpu_inference.models.jax.utils.quantization.quantization_utils import (
+from tpu_inference.models.jax.utils.qwix.qwix_utils import (
     DEFAULT_MAX_NUM_BLOCKS_PER_REQ, DEFAULT_MAX_NUM_SEQS_FOR_MODEL_INPUTS,
     DEFAULT_NUM_TOKENS_FOR_MODEL_INPUTS)
 
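This rename tracks the source move in entry 215 above (models/jax/utils/quantization/quantization_utils.py → models/jax/utils/qwix/qwix_utils.py), so any downstream code importing the old path breaks in this release. A minimal before/after sketch of the migration, using a symbol that appears in the hunks below:

# Before (0.11.1.dev*): the module lived under utils.quantization.
# from tpu_inference.models.jax.utils.quantization.quantization_utils import (
#     qwix_quantize_nnx_model)

# After (0.13.0rc2.post7): same symbol, new module path.
from tpu_inference.models.jax.utils.qwix.qwix_utils import (
    qwix_quantize_nnx_model)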
@@ -29,8 +29,7 @@ module_mocks = {
     'vllm.config': MagicMock(),
     'tpu_inference': MagicMock(),
     'tpu_inference.logger': MagicMock(init_logger=lambda name: MagicMock()),
-    'tpu_inference.models.jax.utils.quantization.quantization_utils':
-    MagicMock(),
+    'tpu_inference.models.jax.utils.qwix.qwix_utils': MagicMock(),
 }
 
 
@@ -112,6 +111,8 @@ class TestQwixQuantizeNnxModel(unittest.TestCase):
         self.mesh = Mesh(jax.devices(), ('model', ))
         self.rng = jax.random.PRNGKey(0)
         self.model = SimpleModel(rngs=nnx.Rngs(0))
+        self.model.vllm_config = MagicMock()
+        self.model.vllm_config.model_config.use_mla = False
 
         self.qwix_config = [
             {
@@ -131,18 +132,19 @@ class TestQwixQuantizeNnxModel(unittest.TestCase):
         """Test that qwix.quantize_model is called with the correct arguments."""
         quantized_model_mock = MagicMock(spec=nnx.Module)
         mock_quantize_model.return_value = quantized_model_mock
+        self.model.vllm_config.sharding_config.total_dp_size = 1
 
         with patch(
-                "tpu_inference.models.jax.utils.quantization.quantization_utils.init_logger",
+                "tpu_inference.models.jax.utils.qwix.qwix_utils.init_logger",
                 return_value=MagicMock()
         ), patch(
                 "tpu_inference.utils.hbm_usage_gb",
                 return_value=[(0.0, 0.0), (0.0, 0.0)]
         ), patch(
-                "tpu_inference.models.jax.utils.quantization.quantization_utils.create_kv_caches",
+                "tpu_inference.models.jax.utils.qwix.qwix_utils.create_kv_caches",
                 return_value=self.mock_kv_caches
         ), patch(
-                "tpu_inference.models.jax.utils.quantization.quantization_utils.quantization_config_file_path_to_dict",
+                "tpu_inference.models.jax.utils.qwix.qwix_utils.quantization_config_file_path_to_dict",
                 return_value=self.qwix_config):
             returned_model = quantize_qwix.qwix_quantize_nnx_model(
                 model=self.model,
@@ -317,10 +319,9 @@ class TestApplyQwixQuantizationLogic(unittest.TestCase):
         self.assertIs(result2, self.mock_model)
 
     @patch(
-        'tpu_inference.models.jax.utils.quantization.quantization_utils.qwix_quantize_nnx_model'
+        'tpu_inference.models.jax.utils.qwix.qwix_utils.qwix_quantize_nnx_model'
     )
-    @patch(
-        'tpu_inference.models.jax.utils.quantization.quantization_utils.utils')
+    @patch('tpu_inference.models.jax.utils.qwix.qwix_utils.utils')
     def test_apply_to_abstract_model(self, mock_utils, mock_quantize_func):
         """Test quantization is correctly applied to an abstract model factory."""
         mock_utils.get_padded_num_heads.return_value = 8
@@ -357,10 +358,9 @@ class TestApplyQwixQuantizationLogic(unittest.TestCase):
         self.assertIs(result_model, quantized_model)
 
     @patch(
-        'tpu_inference.models.jax.utils.quantization.quantization_utils.qwix_quantize_nnx_model'
+        'tpu_inference.models.jax.utils.qwix.qwix_utils.qwix_quantize_nnx_model'
     )
-    @patch(
-        'tpu_inference.models.jax.utils.quantization.quantization_utils.utils')
+    @patch('tpu_inference.models.jax.utils.qwix.qwix_utils.utils')
     def test_apply_to_abstract_model_with_initialize_cache(
             self, mock_utils, mock_quantize_func):
         """Test abstract model quantization with 'initialize_cache' method."""
@@ -461,15 +461,13 @@ class TestLoadRandomWeightsIntoQwixAbstractModel(unittest.TestCase):
         # Mock model structure
         self.model = MagicMock(spec=['weight_loader', 'initialize_cache'])
         self.model.weight_loader = MagicMock(
-            spec=['scale_dtype', 'scale_shap_map_for_random_weight_loading'])
+            spec=['scale_dtype', 'scale_shape_map_for_random_weight_loading'])
         self.model.weight_loader.scale_dtype = jnp.float16
-        self.model.weight_loader.scale_shap_map_for_random_weight_loading = {}
+        self.model.weight_loader.scale_shape_map_for_random_weight_loading = {}
 
+    @patch('tpu_inference.models.jax.utils.qwix.qwix_utils.nnx.iter_graph')
     @patch(
-        'tpu_inference.models.jax.utils.quantization.quantization_utils.nnx.iter_graph'
-    )
-    @patch(
-        'tpu_inference.models.jax.utils.quantization.quantization_utils.get_random_sharded_array'
+        'tpu_inference.models.jax.utils.qwix.qwix_utils.get_random_sharded_array'
     )
     def test_successful_initialization(self, mock_get_random_array,
                                        mock_iter_graph):
@@ -482,6 +480,10 @@ class TestLoadRandomWeightsIntoQwixAbstractModel(unittest.TestCase):
         mock_random_array = jax.numpy.ones(1)
         mock_get_random_array.return_value = mock_random_array
 
+        self.model.weight_loader.scale_shape_map_for_random_weight_loading = {
+            'attention.wq': (1, 1)
+        }
+
         mock_iter_graph.return_value = [
             (('layers', '0', 'attention', 'wq', 'kernel'), mock_weight_param),
             (('layers', '0', 'attention', 'wq', 'array', 'scale'),
@@ -509,9 +511,7 @@ class TestLoadRandomWeightsIntoQwixAbstractModel(unittest.TestCase):
         quantize_qwix.load_random_weights_into_qwix_abstract_model(
             self.rng, self.model, self.mesh, invalid_config)
 
-    @patch(
-        'tpu_inference.models.jax.utils.quantization.quantization_utils.nnx.iter_graph'
-    )
+    @patch('tpu_inference.models.jax.utils.qwix.qwix_utils.nnx.iter_graph')
     def test_param_shape_setting_no_scale_map(self, mock_iter_graph):
         """Test correct scale shape calculation when not in the map."""
         old_weight_param_val = jnp.empty((128, 64))
@@ -525,26 +525,11 @@ class TestLoadRandomWeightsIntoQwixAbstractModel(unittest.TestCase):
             mock_scale_var),
         ]
 
-        quantize_qwix.load_random_weights_into_qwix_abstract_model(
-            self.rng, self.model, self.mesh, self.quantization_config)
-
-        new_weight_param_val = mock_weight_param.value
-        new_scale_var_val = mock_scale_var.value
-
-        expected_scale_shape = (128 // 64, 64 // 64)
-        actual_scale_shape = new_scale_var_val.shape
-
-        expected_weight_shape = (128, 64)
-        actual_weight_shape = new_weight_param_val.shape
-
-        self.assertEqual(expected_scale_shape, actual_scale_shape)
-        self.assertEqual(expected_weight_shape, actual_weight_shape)
-        self.assertNotEqual(old_scale_var_val.shape, new_scale_var_val.shape)
-        assert jnp.not_equal(old_weight_param_val, new_weight_param_val).all()
+        with self.assertRaises(ValueError):
+            quantize_qwix.load_random_weights_into_qwix_abstract_model(
+                self.rng, self.model, self.mesh, self.quantization_config)
 
-    @patch(
-        'tpu_inference.models.jax.utils.quantization.quantization_utils.nnx.iter_graph'
-    )
+    @patch('tpu_inference.models.jax.utils.qwix.qwix_utils.nnx.iter_graph')
     def test_param_shape_setting_with_scale_map(self, mock_iter_graph):
         """Test correct scale shape calculation when in the map."""
         old_weight_param_val = jnp.empty((128, 64))
@@ -554,8 +539,8 @@ class TestLoadRandomWeightsIntoQwixAbstractModel(unittest.TestCase):
 
         expected_scale_shape = (55, 34)
 
-        self.model.weight_loader.scale_shap_map_for_random_weight_loading = {
-            'wq': expected_scale_shape
+        self.model.weight_loader.scale_shape_map_for_random_weight_loading = {
+            'attention.wq': expected_scale_shape
         }
 
         mock_iter_graph.return_value = [
@@ -604,9 +589,7 @@ class TestLoadRandomWeightsIntoQwixAbstractModel(unittest.TestCase):
         mock_randint.assert_not_called()
         mock_normal.assert_called_once()
 
-    @patch(
-        "tpu_inference.models.jax.utils.quantization.quantization_utils.logger.warning"
-    )
+    @patch("tpu_inference.models.jax.utils.qwix.qwix_utils.logger.warning")
    @patch("jax.make_array_from_callback")
    def test_get_random_sharded_array_sharding_fallback(
            self, mock_make_array, mock_logger_warning):
@@ -648,7 +631,7 @@ class TestManualQwixQuantization(unittest.TestCase):
         self.calibration_method = 'max'
 
     @patch(
-        'tpu_inference.models.jax.utils.quantization.quantization_utils.ptq.create_quantized_param'
+        'tpu_inference.models.jax.utils.qwix.qwix_utils.ptq.create_quantized_param'
     )
     def test_manually_quantize_qwix_weight(self, mock_create_param):
         """Test that manually_quantize_qwix_weight calls ptq.create_quantized_param correctly."""
@@ -672,9 +655,7 @@ class TestManualQwixQuantization(unittest.TestCase):
         self.assertEqual(passed_how_to_quantize.calibration_method,
                          self.calibration_method)
 
-    @patch(
-        'tpu_inference.models.jax.utils.quantization.quantization_utils.ptq.quantize_act'
-    )
+    @patch('tpu_inference.models.jax.utils.qwix.qwix_utils.ptq.quantize_act')
     @patch('qwix.pallas.get_current_rule')
     def test_manually_quantize_qwix_activation(self, mock_get_rule,
                                                mock_quantize_act):
@@ -832,5 +813,157 @@ class TestGetQuantDtypeFromQwixConfig(unittest.TestCase):
         self.assertIsNone(quant_dtype)
 
 
+class TestGetDefaultQwixQuantizationConfig(unittest.TestCase):
+    """Tests for the get_default_qwix_quantization_config function."""
+
+    def setUp(self):
+        # Mocking the default configs that the function expects to find in the module
+        self.mock_deepseek_config = {
+            "qwix": {
+                "rules": [{
+                    "module_path": ".*",
+                    "tile_size": 0
+                }]
+            }
+        }
+        self.mock_llama_config = {"qwix": {"rules": [{"name": "llama_rule"}]}}
+        self.mock_gpt_oss_config = {"qwix": {"rules": [{"name": "gpt_rule"}]}}
+
+        # Patch the constants in the module where the function resides
+        self.patchers = [
+            patch(
+                "tpu_inference.models.jax.utils.qwix.qwix_utils.DEFAULT_DEEPSEEK_FP8_CONFIG",
+                self.mock_deepseek_config),
+            patch(
+                "tpu_inference.models.jax.utils.qwix.qwix_utils.DEFAULT_LLAMA4_FP8_CONFIG",
+                self.mock_llama_config),
+            patch(
+                "tpu_inference.models.jax.utils.qwix.qwix_utils.DEFAULT_GPT_OSS_FP4_CONFIG",
+                self.mock_gpt_oss_config),
+            patch("tpu_inference.models.jax.utils.qwix.qwix_utils.logger",
+                  MagicMock())
+        ]
+        for p in self.patchers:
+            p.start()
+
+    def tearDown(self):
+        for p in self.patchers:
+            p.stop()
+
+    def test_skip_quantization_returns_none(self):
+        """Test that skip_quantization=True returns None immediately."""
+        result = quantize_qwix.get_default_qwix_quantization_config(
+            MagicMock(), True)
+        self.assertIsNone(result)
+
+    def test_unsupported_model_returns_none(self):
+        """Test that an unknown model type returns None."""
+        hf_config = MagicMock()
+        hf_config.model_type = "unknown_model"
+        result = quantize_qwix.get_default_qwix_quantization_config(
+            hf_config, False)
+        self.assertIsNone(result)
+
+    def test_deepseek_v3_success(self):
+        """Test DeepSeek V3 config with valid weight_block_size."""
+        hf_config = MagicMock()
+        hf_config.model_type = "DeepSeek_V3"
+        hf_config.quantization_config = {
+            "quant_method": "fp8",
+            "weight_block_size": [1, 128]
+        }
+
+        result = quantize_qwix.get_default_qwix_quantization_config(
+            hf_config, False)
+
+        # Check if tile_size was updated from 0 to 128
+        self.assertEqual(result["qwix"]["rules"][0]["tile_size"], 128)
+        # Ensure it's a deep copy (original mock shouldn't change)
+        self.assertEqual(
+            self.mock_deepseek_config["qwix"]["rules"][0]["tile_size"], 0)
+
+    def test_deepseek_v3_invalid_block_size(self):
+        """Test DeepSeek V3 raises ValueError on invalid block size format."""
+        hf_config = MagicMock()
+        hf_config.model_type = "deepseek_v3"
+        hf_config.quantization_config = {
+            "quant_method": "fp8",
+            "weight_block_size": [128]
+        }
+
+        with self.assertRaisesRegex(ValueError, "Invalid weight_block_size"):
+            quantize_qwix.get_default_qwix_quantization_config(
+                hf_config, False)
+
+    def test_deepseek_v3_invalid_block_size_2d_subchannel(self):
+        """Test DeepSeek V3 asserts when the first block dimension is not 1."""
+        hf_config = MagicMock()
+        hf_config.model_type = "deepseek_v3"
+        hf_config.quantization_config = {
+            "quant_method": "fp8",
+            "weight_block_size": [512, 512]
+        }
+
+        with self.assertRaisesRegex(AssertionError,
+                                    "Expected first dimension to be 1"):
+            quantize_qwix.get_default_qwix_quantization_config(
+                hf_config, False)
+
+    def test_deepseek_v3_no_weight_block_size(self):
+        """Test DeepSeek V3 asserts when weight_block_size is missing."""
+        hf_config = MagicMock()
+        hf_config.model_type = "DeepSeek_V3"
+        hf_config.quantization_config = {
+            "quant_method": "fp8",
+        }
+
+        with self.assertRaisesRegex(
+                AssertionError,
+                "Expected weight_block_size in quantization_config"):
+
+            quantize_qwix.get_default_qwix_quantization_config(
+                hf_config, False)
+
+    def test_deepseek_v3_tile_size_assertion(self):
+        """Test DeepSeek V3 raises AssertionError if tile_size is <= 1."""
+        hf_config = MagicMock()
+        hf_config.model_type = "deepseek_v3"
+        hf_config.quantization_config = {
+            "quant_method": "fp8",
+            "weight_block_size": [1, 1]
+        }
+
+        with self.assertRaises(AssertionError):
+            quantize_qwix.get_default_qwix_quantization_config(
+                hf_config, False)
+
+    def test_llama4_success(self):
+        """Test Llama 4 default config path."""
+        hf_config = MagicMock()
+        hf_config.model_type = "llama4"
+        hf_config.quantization_config = {"quant_method": "compressed-tensors"}
+
+        result = quantize_qwix.get_default_qwix_quantization_config(
+            hf_config, False)
+        self.assertEqual(result, self.mock_llama_config)
+
+    def test_gpt_oss_success(self):
+        """Test GPT-OSS default config path."""
+        hf_config = MagicMock()
+        hf_config.model_type = "gpt_oss"
+        hf_config.quantization_config = {"quant_method": "mxfp4"}
+
+        result = quantize_qwix.get_default_qwix_quantization_config(
+            hf_config, False)
+        self.assertEqual(result, self.mock_gpt_oss_config)
+
+    def test_missing_attributes_handled(self):
+        """Test that function handles hf_config objects missing model_type safely."""
+        hf_config = object()  # No attributes
+        result = quantize_qwix.get_default_qwix_quantization_config(
+            hf_config, False)
+        self.assertIsNone(result)
+
+
 if __name__ == '__main__':
     unittest.main()
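These TestGetDefaultQwixQuantizationConfig cases pin down how a DeepSeek V3 checkpoint's weight_block_size becomes a qwix tile_size: [1, 128] yields tile_size=128 on a deep copy of the default config, a one-element list raises ValueError, a first dimension other than 1 asserts, and a tile_size of 1 asserts. A minimal sketch of validation logic consistent with those assertions (illustrative only; the actual implementation lives in tpu_inference/models/jax/utils/qwix/qwix_utils.py and may differ):

import copy

def _deepseek_tile_size_config(quantization_config: dict,
                               default_config: dict) -> dict:
    # The HF config must carry a weight_block_size entry.
    assert "weight_block_size" in quantization_config, (
        "Expected weight_block_size in quantization_config")
    block_size = quantization_config["weight_block_size"]
    # Only a 2-D block shape is a valid format.
    if len(block_size) != 2:
        raise ValueError(f"Invalid weight_block_size: {block_size}")
    # Sub-channel quantization: only the last dimension may be tiled.
    assert block_size[0] == 1, "Expected first dimension to be 1"
    tile_size = block_size[1]
    assert tile_size > 1
    # Deep-copy so the module-level default config is never mutated,
    # which test_deepseek_v3_success checks explicitly.
    config = copy.deepcopy(default_config)
    config["qwix"]["rules"][0]["tile_size"] = tile_size
    return config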
tests/layers/jax/test_rope.py (new file, +93)

@@ -0,0 +1,93 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import jax
+from jax import numpy as jnp
+from jax._src import test_util as jtu
+from jax.sharding import Mesh
+
+from tpu_inference.layers.jax.rope import (DeepseekScalingRotaryEmbedding,
+                                           RotaryEmbedding)
+
+
+class RotaryEmbeddingTest(jtu.JaxTestCase):
+
+    def test_apply_rope(self):
+        head_dim = 2
+        rope_theta = 10000
+        original_max_position_embeddings = 2
+        rope = RotaryEmbedding(
+            rotary_dim=head_dim,
+            rope_theta=rope_theta,
+            original_max_position_embeddings=original_max_position_embeddings,
+            dtype=jnp.float32)
+        rope.initialize_cache()
+        self.assertTrue(
+            rope.sin_cos_cache.shape == (original_max_position_embeddings,
+                                         head_dim))
+        expected_sin_cos = jnp.array([[1, 0], [0.5403023, 0.841471]],
+                                     dtype=jnp.float32)
+        self.assertArraysAllClose(rope.sin_cos_cache, expected_sin_cos)
+
+        num_tokens = 2
+        num_heads = 1
+        positions = jnp.arange(num_tokens)
+        x = jnp.ones((num_tokens, num_heads, head_dim))
+        x_rope = rope.apply_rope(positions, x)
+        expected_x_rope = jnp.array([[[1, 1]], [[-0.30116874, 1.3817732]]],
+                                    dtype=jnp.float32)
+        self.assertTrue(x_rope.shape == x.shape)
+        self.assertArraysAllClose(x_rope, expected_x_rope)
+
+
+class DeepseekScalingRotaryEmbeddingTest(jtu.JaxTestCase):
+
+    def test_apply_rope(self):
+        head_dim = 2
+        rope_theta = 10000
+        original_max_position_embeddings = 1
+        scaling_factor = 2
+        devices = jax.devices()
+        mesh = Mesh(devices, ('data', ))
+
+        rope = DeepseekScalingRotaryEmbedding(
+            rotary_dim=head_dim,
+            rope_theta=rope_theta,
+            original_max_position_embeddings=original_max_position_embeddings,
+            scaling_factor=scaling_factor,
+            dtype=jnp.float32)
+        rope.initialize_cache(mesh)
+        expected_padded_dim = 128
+        self.assertTrue(
+            rope.sin_cos_cache.shape == (scaling_factor *
+                                         original_max_position_embeddings,
+                                         expected_padded_dim))
+
+        valid_cache_slice = rope.sin_cos_cache[:, :head_dim]
+
+        expected_sin_cos = jnp.array([[1.0693147, 0], [0.5777532, 0.8997973]],
+                                     dtype=jnp.float32)
+
+        self.assertArraysAllClose(valid_cache_slice, expected_sin_cos)
+
+        num_tokens = 2
+        num_heads = 1
+        positions = jnp.arange(num_tokens)
+        x = jnp.ones((num_tokens, num_heads, head_dim))
+        x_rope = rope.apply_rope(positions, x)
+        expected_x_rope = jnp.array(
+            [[[1.0693147, 1.0693147]], [[-0.32204413, 1.4775505]]],
+            dtype=jnp.float32)
+        self.assertTrue(x_rope.shape == x.shape)
+        self.assertArraysAllClose(x_rope, expected_x_rope)
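The constants these RoPE tests assert can be reproduced by hand. With rotary_dim=2 there is a single frequency pair with inv_freq = rope_theta**0 = 1.0, so position p rotates by angle p; the DeepSeek variant additionally scales the cache by an attention factor, which for these expected values works out to the YaRN formula 0.1 * ln(scaling_factor) + 1.0. A quick sketch under those assumptions:

import math
import jax.numpy as jnp

# Plain RoPE cache for positions [0, 1]: rows are [cos(p), sin(p)].
angles = jnp.arange(2, dtype=jnp.float32)  # inv_freq = 1.0 for the only pair
print(jnp.stack([jnp.cos(angles), jnp.sin(angles)], axis=-1))
# [[1.0, 0.0], [0.5403023, 0.841471]]  -- matches expected_sin_cos

# Rotating x = [1, 1] at position 1:
c, s = math.cos(1.0), math.sin(1.0)
print(1 * c - 1 * s, 1 * s + 1 * c)  # -0.30116874, 1.3817732

# DeepSeek/YaRN scaling with scaling_factor = 2:
mscale = 0.1 * math.log(2.0) + 1.0
print(mscale)                  # 1.0693147 -- first cache row is [mscale, 0]
print(c * mscale, s * mscale)  # 0.5777532, 0.8997973 -- second cache row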
tests/layers/jax/test_sharding.py (new file, +159)

@@ -0,0 +1,159 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from unittest.mock import MagicMock
+
+import jax
+
+from tpu_inference.layers.common.sharding import (Sharding, ShardingConfig,
+                                                  ShardingRulesConfig,
+                                                  ShardingStrategy)
+
+
+class TestSharding(unittest.TestCase):
+    """Unit test suite for the sharding configuration logic."""
+
+    def setUp(self):
+        """Sets up the testing environment before each test."""
+
+        self.mock_devices = [MagicMock(coords=i) for i in range(8)]
+        self.original_jax_devices = jax.devices
+        jax.devices = lambda: self.mock_devices
+
+    def tearDown(self):
+        """Restores the original jax.devices function after tests."""
+        jax.devices = self.original_jax_devices
+
+    def test_sharding_strategy_init(self):
+        """Tests the initialization of the ShardingStrategy."""
+        strategy = ShardingStrategy(
+            tensor_parallelism=2,
+            expert_parallelism=4,
+            data_parallelism=1,
+            sequence_parallelism=1,
+        )
+        self.assertEqual(strategy.tensor_parallelism, 2)
+        self.assertEqual(strategy.expert_parallelism, 4)
+
+    def test_sharding_config_init(self):
+        """Tests the initialization of ShardingConfig."""
+        config = ShardingConfig()
+        self.assertIsInstance(config.prefill_rules, ShardingRulesConfig)
+        self.assertIsInstance(config.generate_rules, ShardingRulesConfig)
+
+        custom_rules = ShardingRulesConfig(activation_ffw_td=("model", None))
+        config_with_rules = ShardingConfig(prefill_rules=custom_rules)
+        self.assertEqual(config_with_rules.prefill_rules.activation_ffw_td,
+                         ("model", None))
+
+    def test_apply_overrides(self):
+        """Tests the _apply_overrides method for valid and invalid keys."""
+        sharding = Sharding(
+            prefill_rules={},
+            generate_rules={},
+        )
+        config_obj = ShardingRulesConfig()
+
+        valid_overrides = {"activation_ffw_td": ("model", None)}
+        sharding._apply_overrides(config_obj, valid_overrides)
+        self.assertEqual(config_obj.activation_ffw_td, ("model", None))
+
+        invalid_overrides = {"non_existent_attribute": (None, "model")}
+        with self.assertRaises(AttributeError):
+            sharding._apply_overrides(config_obj, invalid_overrides)
+
+    def test_default_sharding_config(self):
+        """Tests that default sharding rules are created correctly."""
+        sharding = Sharding(
+            prefill_rules={},
+            generate_rules={},
+        )
+
+        sharding_cfg = sharding.get_sharding_cfg()
+        generate_rules = sharding_cfg.generate_rules
+
+        self.assertEqual(generate_rules.ffw_weight_df, (None, "model"))
+        self.assertEqual(generate_rules.moe_router_de, (None, "model"))
+        self.assertEqual(generate_rules.attn_q_weight_dnh,
+                         (None, "model", None))
+
+    def test_sharding_init_with_overrides(self):
+        """Tests Sharding initialization with programmatic overrides."""
+        generate_overrides = {"logits_tv": ("data", "model")}
+
+        sharding = Sharding(
+            generate_rules=generate_overrides,
+            prefill_rules={},
+        )
+
+        sharding_cfg = sharding.get_sharding_cfg()
+        self.assertNotEqual(sharding_cfg.generate_rules.logits_tv,
+                            (None, "model"))
+        self.assertEqual(sharding_cfg.generate_rules.logits_tv,
+                         ("data", "model"))
+
+    def test_get_overrides_from_vllm_config(self):
+        """Tests fetching sharding overrides from a mock VllmConfig."""
+
+        mock_vllm_config_prefill = MagicMock()
+        mock_vllm_config_prefill.additional_config = {
+            "sharding": {
+                "logical_rules": {
+                    "all": {
+                        "norm_scale": ("model", )
+                    },
+                    "prefill": {
+                        "activation_ffw_td": ("data", "model")
+                    },
+                }
+            }
+        }
+        sharding_prefill = Sharding(
+            vllm_config=mock_vllm_config_prefill,
+            prefill_rules={},
+            generate_rules={},
+        )
+        prefill_overrides = sharding_prefill._get_overrides("prefill")
+
+        self.assertEqual(prefill_overrides["norm_scale"], ("model", ))
+        self.assertEqual(prefill_overrides["activation_ffw_td"],
+                         ("data", "model"))
+
+        mock_vllm_config_generate = MagicMock()
+        mock_vllm_config_generate.additional_config = {
+            "sharding": {
+                "logical_rules": {
+                    "all": {
+                        "norm_scale": ("model", )
+                    },
+                    "prefill": {
+                        "activation_ffw_td": ("data", "model")
+                    },
+                }
+            }
+        }
+        sharding_generate = Sharding(
+            vllm_config=mock_vllm_config_generate,
+            prefill_rules={},
+            generate_rules={},
+        )
+        generate_overrides = sharding_generate._get_overrides("generate")
+
+        self.assertEqual(generate_overrides["norm_scale"], ("model", ))
+        self.assertNotIn("activation_ffw_td", generate_overrides)
+
+
+if __name__ == "__main__":
+    unittest.main()
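As test_get_overrides_from_vllm_config shows, logical sharding rules can be overridden through vLLM's additional_config: keys under "all" apply to both the prefill and generate rule sets, while keys under "prefill" (or "generate") apply to that phase only. A hedged sketch of the config shape a caller would pass (key layout taken from the test above; the exact engine-side plumbing is not shown here):

# Hypothetical caller-side dict; Sharding._get_overrides reads this layout.
additional_config = {
    "sharding": {
        "logical_rules": {
            # merged into both prefill and generate rules
            "all": {"norm_scale": ("model",)},
            # merged into the prefill rules only
            "prefill": {"activation_ffw_td": ("data", "model")},
        }
    }
}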