PyPI - tpu-inference - Versions diffs - 0.11.1.dev202512030818__py3-none-any.whl → 0.13.2rc3__py3-none-any.whl - Mend

tpu-inference 0.11.1.dev202512030818__py3-none-any.whl → 0.13.2rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tpu-inference might be problematic. Click here for more details.

Files changed (250)

tests/__init__.py +13 -0
tests/core/__init__.py +13 -0
tests/core/test_disagg_utils.py +14 -0
tests/core/test_dp_scheduler.py +650 -768
tests/core/test_init.py +14 -0
tests/distributed/__init__.py +13 -0
tests/distributed/test_distributed_utils.py +120 -0
tests/distributed/test_tpu_connector.py +478 -0
tests/e2e/__init__.py +13 -0
tests/e2e/test_async_scheduler.py +211 -0
tests/e2e/test_data_parallel.py +289 -0
tests/e2e/test_hybrid_kvcache.py +219 -0
tests/e2e/test_local_disagg.py +257 -0
tests/e2e/test_model_loader.py +268 -0
tests/e2e/test_multi_modal_inference.py +111 -0
tests/e2e/test_pipeline_parallel.py +265 -0
tests/e2e/test_runai_model_streamer_loader.py +104 -0
tests/e2e/test_sampling_params.py +269 -0
tests/e2e/test_speculative_decoding.py +311 -0
tests/e2e/test_structured_decoding.py +46 -0
tests/executors/__init__.py +13 -0
tests/executors/test_ray_distributed_executor.py +199 -0
tests/experimental/__init__.py +13 -0
tests/experimental/test_llama3_jax_stashed.py +208 -0
tests/kernels/__init__.py +13 -0
tests/kernels/collectives/__init__.py +13 -0
tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
tests/kernels/fused_moe_v1_test.py +14 -0
tests/kernels/gmm_test.py +205 -0
tests/kernels/mla_v1_test.py +143 -41
tests/kernels/quantized_matmul_kernel_test.py +2 -34
tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +17 -1
tests/kernels/ragged_paged_attention_kernel_v3_test.py +17 -1
tests/layers/__init__.py +13 -0
tests/layers/common/__init__.py +13 -0
tests/layers/common/test_attention_interface.py +156 -0
tests/layers/common/test_quantization.py +149 -0
tests/layers/jax/__init__.py +13 -0
tests/layers/jax/attention/__init__.py +13 -0
tests/layers/jax/attention/test_common_attention.py +103 -0
tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
tests/layers/jax/attention/test_llama4_attention.py +135 -0
tests/layers/jax/moe/__init__.py +13 -0
tests/layers/jax/moe/test_deepseek_moe.py +235 -0
tests/layers/jax/sample/__init__.py +13 -0
tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
tests/layers/jax/sample/test_sampling.py +115 -0
tests/layers/jax/sample/test_sampling_metadata.py +254 -0
tests/layers/jax/test_layers.py +155 -0
tests/{test_quantization.py → layers/jax/test_qwix.py} +183 -50
tests/layers/jax/test_rope.py +93 -0
tests/layers/jax/test_sharding.py +159 -0
tests/layers/jax/test_transformer_block.py +152 -0
tests/layers/vllm/__init__.py +13 -0
tests/layers/vllm/test_attention.py +363 -0
tests/layers/vllm/test_awq.py +405 -0
tests/layers/vllm/test_compressed_tensors_moe.py +202 -0
tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +418 -0
tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +441 -0
tests/layers/vllm/test_fp8.py +17 -0
tests/layers/vllm/test_mxfp4.py +312 -0
tests/layers/vllm/test_unquantized.py +651 -0
tests/layers/vllm/utils.py +87 -0
tests/lora/__init__.py +13 -0
tests/lora/conftest.py +14 -0
tests/lora/test_bgmv.py +14 -0
tests/lora/test_layers.py +21 -3
tests/lora/test_lora.py +15 -1
tests/lora/test_lora_perf.py +67 -0
tests/models/__init__.py +13 -0
tests/models/common/__init__.py +13 -0
tests/models/common/test_model_loader.py +455 -0
tests/models/jax/__init__.py +13 -0
tests/models/jax/test_deepseek_v3.py +401 -0
tests/models/jax/test_llama3.py +184 -0
tests/models/jax/test_llama4.py +298 -0
tests/models/jax/test_llama_eagle3.py +197 -0
tests/models/jax/test_llama_guard_4.py +242 -0
tests/models/jax/test_qwen2.py +172 -0
tests/models/jax/test_qwen2_5_vl.py +605 -0
tests/models/jax/test_qwen3.py +169 -0
tests/models/jax/test_weight_loading.py +180 -0
tests/models/jax/utils/__init__.py +13 -0
tests/models/jax/utils/test_multi_modal_utils.py +212 -0
tests/platforms/__init__.py +13 -0
tests/platforms/test_tpu_platform.py +54 -0
tests/runner/__init__.py +13 -0
tests/runner/test_block_table.py +395 -0
tests/runner/test_input_batch.py +226 -0
tests/runner/test_kv_cache.py +220 -0
tests/runner/test_kv_cache_manager.py +498 -0
tests/runner/test_multimodal_manager.py +429 -0
tests/runner/test_persistent_batch_manager.py +84 -0
tests/runner/test_speculative_decoding_manager.py +368 -0
tests/runner/test_structured_decoding_manager.py +220 -0
tests/runner/test_tpu_runner.py +261 -0
tests/runner/test_tpu_runner_dp.py +1099 -0
tests/runner/test_tpu_runner_mesh.py +200 -0
tests/runner/test_utils.py +411 -0
tests/spec_decode/__init__.py +13 -0
tests/spec_decode/test_eagle3.py +311 -0
tests/test_base.py +14 -0
tests/test_envs.py +78 -1
tests/test_tpu_info.py +14 -0
tests/test_utils.py +1 -43
tests/worker/__init__.py +13 -0
tests/worker/tpu_worker_test.py +414 -0
tpu_inference/__init__.py +14 -0
tpu_inference/core/__init__.py +13 -0
tpu_inference/core/sched/__init__.py +13 -0
tpu_inference/core/sched/dp_scheduler.py +372 -56
tpu_inference/distributed/__init__.py +13 -0
tpu_inference/distributed/jax_parallel_state.py +14 -0
tpu_inference/distributed/tpu_connector.py +14 -9
tpu_inference/distributed/utils.py +56 -4
tpu_inference/envs.py +38 -7
tpu_inference/executors/__init__.py +13 -0
tpu_inference/executors/ray_distributed_executor.py +17 -0
tpu_inference/experimental/__init__.py +13 -0
tpu_inference/experimental/llama3_jax_stashed.py +14 -0
tpu_inference/kernels/__init__.py +13 -0
tpu_inference/kernels/collectives/__init__.py +13 -0
tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
tpu_inference/kernels/flash_attention/__init__.py +13 -0
tpu_inference/kernels/fused_moe/__init__.py +13 -0
tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
tpu_inference/kernels/fused_moe/v1/kernel.py +370 -324
tpu_inference/kernels/megablox/__init__.py +13 -0
tpu_inference/kernels/megablox/common.py +54 -0
tpu_inference/kernels/megablox/gmm.py +646 -0
tpu_inference/kernels/mla/__init__.py +13 -0
tpu_inference/kernels/mla/v1/__init__.py +13 -0
tpu_inference/kernels/mla/v1/kernel.py +117 -145
tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +194 -101
tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +95 -78
tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +376 -195
tpu_inference/kernels/ragged_paged_attention/v3/util.py +15 -1
tpu_inference/layers/__init__.py +13 -0
tpu_inference/layers/common/__init__.py +13 -0
tpu_inference/layers/common/attention_interface.py +26 -19
tpu_inference/layers/common/attention_metadata.py +14 -0
tpu_inference/layers/common/quant_methods.py +15 -0
tpu_inference/layers/common/quantization.py +270 -0
tpu_inference/layers/common/sharding.py +28 -5
tpu_inference/layers/jax/__init__.py +13 -0
tpu_inference/layers/jax/attention/__init__.py +13 -0
tpu_inference/layers/jax/attention/attention.py +19 -6
tpu_inference/layers/jax/attention/deepseek_v3_attention.py +270 -77
tpu_inference/layers/jax/attention/gpt_oss_attention.py +24 -11
tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
tpu_inference/layers/jax/base.py +14 -0
tpu_inference/layers/jax/constants.py +13 -0
tpu_inference/layers/jax/layers.py +14 -0
tpu_inference/layers/jax/misc.py +14 -0
tpu_inference/layers/jax/moe/__init__.py +13 -0
tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
tpu_inference/layers/jax/moe/moe.py +43 -3
tpu_inference/layers/jax/pp_utils.py +53 -0
tpu_inference/layers/jax/rope.py +14 -0
tpu_inference/layers/jax/rope_interface.py +14 -0
tpu_inference/layers/jax/sample/__init__.py +13 -0
tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
tpu_inference/layers/jax/sample/sampling.py +15 -1
tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
tpu_inference/layers/jax/transformer_block.py +14 -0
tpu_inference/layers/vllm/__init__.py +13 -0
tpu_inference/layers/vllm/attention.py +4 -4
tpu_inference/layers/vllm/fused_moe.py +210 -260
tpu_inference/layers/vllm/linear_common.py +57 -22
tpu_inference/layers/vllm/quantization/__init__.py +16 -0
tpu_inference/layers/vllm/quantization/awq.py +15 -1
tpu_inference/layers/vllm/quantization/common.py +33 -18
tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +18 -3
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +211 -148
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +14 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +14 -0
tpu_inference/layers/vllm/quantization/fp8.py +118 -0
tpu_inference/layers/vllm/quantization/mxfp4.py +278 -209
tpu_inference/layers/vllm/quantization/unquantized.py +134 -86
tpu_inference/layers/vllm/sharding.py +21 -4
tpu_inference/lora/__init__.py +13 -0
tpu_inference/lora/torch_lora_ops.py +8 -13
tpu_inference/models/__init__.py +13 -0
tpu_inference/models/common/__init__.py +13 -0
tpu_inference/models/common/model_loader.py +74 -35
tpu_inference/models/jax/__init__.py +13 -0
tpu_inference/models/jax/deepseek_v3.py +267 -157
tpu_inference/models/jax/gpt_oss.py +26 -10
tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
tpu_inference/models/jax/llama3.py +99 -36
tpu_inference/models/jax/llama4.py +14 -0
tpu_inference/models/jax/llama_eagle3.py +14 -0
tpu_inference/models/jax/llama_guard_4.py +15 -1
tpu_inference/models/jax/qwen2.py +17 -2
tpu_inference/models/jax/qwen2_5_vl.py +18 -4
tpu_inference/models/jax/qwen3.py +17 -2
tpu_inference/models/jax/utils/__init__.py +13 -0
tpu_inference/models/jax/utils/file_utils.py +14 -0
tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +89 -26
tpu_inference/models/jax/utils/weight_utils.py +39 -2
tpu_inference/models/vllm/__init__.py +13 -0
tpu_inference/models/vllm/vllm_model_wrapper.py +20 -3
tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
tpu_inference/platforms/__init__.py +14 -0
tpu_inference/platforms/tpu_platform.py +47 -64
tpu_inference/runner/__init__.py +13 -0
tpu_inference/runner/compilation_manager.py +72 -37
tpu_inference/runner/kv_cache.py +54 -20
tpu_inference/runner/kv_cache_manager.py +46 -17
tpu_inference/runner/lora_utils.py +14 -0
tpu_inference/runner/multimodal_manager.py +15 -1
tpu_inference/runner/persistent_batch_manager.py +14 -0
tpu_inference/runner/speculative_decoding_manager.py +14 -0
tpu_inference/runner/structured_decoding_manager.py +14 -0
tpu_inference/runner/tpu_runner.py +44 -17
tpu_inference/spec_decode/__init__.py +13 -0
tpu_inference/spec_decode/jax/__init__.py +13 -0
tpu_inference/spec_decode/jax/eagle3.py +13 -0
tpu_inference/tpu_info.py +14 -0
tpu_inference/utils.py +42 -36
tpu_inference/worker/__init__.py +13 -0
tpu_inference/worker/tpu_worker.py +63 -50
{tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.2rc3.dist-info}/METADATA +7 -9
tpu_inference-0.13.2rc3.dist-info/RECORD +261 -0
tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
tpu_inference-0.11.1.dev202512030818.dist-info/RECORD +0 -174
{tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.2rc3.dist-info}/WHEEL +0 -0
{tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.2rc3.dist-info}/licenses/LICENSE +0 -0
{tpu_inference-0.11.1.dev202512030818.dist-info → tpu_inference-0.13.2rc3.dist-info}/top_level.txt +0 -0

tests/spec_decode/test_eagle3.py ADDED Viewed

@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from unittest import mock
+import jax
+import jax.numpy as jnp
+import numpy as np
+import pytest
+from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig, SpeculativeConfig,
+                         VllmConfig)
+from vllm.config.load import LoadConfig
+from tpu_inference.layers.common.attention_metadata import AttentionMetadata
+from tpu_inference.runner import utils as runner_utils
+from tpu_inference.spec_decode.jax.eagle3 import Eagle3Proposer
+# Use a real model dir for config, but we will mock model loading/execution
+model_dir = "meta-llama/Llama-3.1-8B-Instruct"
+eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
+def _create_proposer(
+    method: str,
+    num_speculative_tokens: int,
+) -> Eagle3Proposer:
+    model_config = ModelConfig(model=model_dir,
+                               runner="generate",
+                               max_model_len=8192,
+                               seed=42)
+    speculative_config = SpeculativeConfig(
+        target_model_config=model_config,
+        target_parallel_config=ParallelConfig(),
+        model=eagle3_dir,
+        method=method,
+        num_speculative_tokens=num_speculative_tokens,
+    )
+    vllm_config = VllmConfig(model_config=model_config,
+                             cache_config=CacheConfig(block_size=16),
+                             speculative_config=speculative_config,
+                             device_config=DeviceConfig(device="tpu"),
+                             parallel_config=ParallelConfig(
+                                 pipeline_parallel_size=1,
+                                 tensor_parallel_size=1),
+                             load_config=LoadConfig(),
+                             scheduler_config=SchedulerConfig(
+                                 max_num_batched_tokens=8192,
+                                 max_num_seqs=128,
+                                 max_model_len=model_config.max_model_len,
+                                 is_encoder_decoder=False))
+    # Mock the runner, as the proposer needs it for initialization
+    mock_runner = mock.MagicMock()
+    # Create a real mesh for testing sharding-related logic
+    devices = np.array(jax.devices())
+    mock_runner.mesh = jax.sharding.Mesh(devices, axis_names=('model', ))
+    mock_runner.max_num_tokens = 8192
+    mock_runner.max_model_len = 8192
+    mock_runner.kv_cache_config.kv_cache_groups = [mock.MagicMock()]
+    mock_runner.input_batch = mock.MagicMock()
+    return Eagle3Proposer(vllm_config=vllm_config, runner=mock_runner)
+def test_prepare_inputs():
+    """
+    Mirrors the GPU test for prepare_inputs, adapted for JAX.
+    - cu_target_query_lens: [0, a, a + b, a + b + c]
+    - num_rejected_tokens: [n1, n2, n3]
+    - num_tokens_per_req: [a - n1, b - n2, c - n3]
+    - cu_num_tokens: [0, a - n1, a + b - n1 - n2, a + b + c - n1 - n2 - n3]
+    - token_indices: [0, ..., a - n1 - 1, a, ..., a + b - n2 - 1, ...]
+    """
+    proposer = _create_proposer("eagle3", 1)
+    num_reqs = 3
+    max_num_seqs = 128
+    max_num_blocks_per_req = 10  # Mock value
+    # Mock runner attributes
+    proposer.runner.input_batch.num_reqs = num_reqs
+    proposer.runner.num_tokens_paddings = runner_utils.get_token_paddings(
+        min_token_size=16, max_token_size=1024, padding_gap=0)
+    # Mocks required by _prepare_draft_inputs helper
+    proposer.combine_hidden_states_fn = lambda state, h: h  # Mock passthrough
+    proposer.state = None  # Mock state
+    proposer.runner.input_batch.block_table = [mock.MagicMock()]
+    # Mock the block table return value (2D array)
+    (proposer.runner.input_batch.block_table[0].get_cpu_tensor.return_value
+     ) = jnp.zeros((num_reqs, max_num_blocks_per_req), dtype=jnp.int32)
+    # --- Setup sequence data ---
+    qsl_cpu = np.zeros(max_num_seqs + 1, dtype=np.int32)
+    query_lens = np.zeros(max_num_seqs, dtype=np.int32)
+    query_lens[:num_reqs] = [4, 7, 5]
+    qsl_cpu[1:] = np.cumsum(query_lens)
+    sl_cpu = np.zeros(max_num_seqs, dtype=np.int32)
+    sl_cpu[:num_reqs] = [4, 7, 5]
+    # Inputs
+    total_tokens = 16
+    hidden_size = 128
+    # The input_ids should be large enough to be indexed by token_indices,
+    # which can access up to total_tokens for padded requests.
+    input_ids = jnp.arange(total_tokens + 1)
+    aux_hidden_states = (jnp.ones((total_tokens + 1, hidden_size)),
+                         jnp.ones((total_tokens + 1, hidden_size)),
+                         jnp.ones((total_tokens + 1, hidden_size)))
+    num_rejected_tokens_cpu = np.zeros(max_num_seqs, dtype=np.int32)
+    num_rejected_tokens_cpu[:num_reqs] = [1, 3, 2]
+    num_rejected_tokens = jnp.array(num_rejected_tokens_cpu)
+    # This is only used in the _prepare_input_ids helper
+    # It must be padded to max_num_seqs (128) to match the mask in jnp.where
+    next_token_ids_cpu = np.zeros(max_num_seqs, dtype=np.int32)
+    next_token_ids_cpu[:num_reqs] = [1, 2, 3]  # Valid tokens for active reqs
+    next_token_ids = jnp.array(next_token_ids_cpu)
+    attn_metadata = AttentionMetadata(
+        seq_lens=jnp.array(sl_cpu),
+        input_positions=jnp.arange(total_tokens),
+        query_start_loc=jnp.array(qsl_cpu),
+        block_tables=jnp.array([]),  # This will be replaced by the mock
+        request_distribution=None,
+    )
+    attn_metadata.query_start_loc_cpu = qsl_cpu
+    attn_metadata.seq_lens_cpu = sl_cpu
+    # Expected results
+    expected_new_qsl = np.zeros(max_num_seqs + 1, dtype=np.int32)
+    num_tokens_per_req = np.zeros(max_num_seqs, dtype=np.int32)
+    num_tokens_per_req[:num_reqs] = [3, 4, 3]
+    # The implementation sets padded query lengths to 1, and rejected tokens
+    # are 0 for padded requests.
+    num_tokens_per_req[num_reqs:] = 1
+    expected_new_qsl[1:] = np.cumsum(num_tokens_per_req)
+    expected_new_seq_lens = np.zeros(max_num_seqs, dtype=np.int32)
+    expected_new_seq_lens[:num_reqs] = [3, 4, 3]
+    expected_total_tokens = int(expected_new_qsl[-1])
+    expected_total_tokens = runner_utils.get_padded_token_len(
+        proposer.runner.num_tokens_paddings, expected_total_tokens)
+    expected_last_token_indices = jnp.array(expected_new_qsl[1:] - 1)
+    # Execute
+    target_hidden_states, input_ids, last_token_indices, updated_metadata = (
+        proposer.prepare_inputs(attn_metadata, input_ids, aux_hidden_states,
+                                next_token_ids, num_rejected_tokens))
+    # Assertions
+    assert jnp.array_equal(updated_metadata.query_start_loc,
+                           jnp.array(expected_new_qsl))
+    assert jnp.array_equal(updated_metadata.seq_lens,
+                           jnp.array(expected_new_seq_lens))
+    assert jnp.array_equal(last_token_indices, expected_last_token_indices)
+    assert input_ids.shape == (expected_total_tokens, )
+    # NOTE: We don't check the content of target_token_ids for padded requests
+    # as it's complicated to construct the expected tensor. The shape check
+    # and the qsl/seq_len checks are sufficient to validate the logic.
+    # The concatenated hidden state shape should be (..., hidden_size * 3)
+    assert target_hidden_states.shape == (expected_total_tokens,
+                                          hidden_size * 3)
+@pytest.mark.parametrize("method", ["eagle3"])
+@pytest.mark.parametrize("num_speculative_tokens", [1, 3, 8])
+def test_propose(method, num_speculative_tokens):
+    proposer = _create_proposer(method, num_speculative_tokens)
+    # Mock the JAX model functions
+    hidden_size = 128
+    vocab_size = 100
+    batch_size = 2
+    seq_len_1 = 5
+    seq_len_2 = 3
+    total_tokens = seq_len_1 + seq_len_2
+    base_token_ids = [42, 60]
+    def mock_model_fn(state, kv_caches, input_ids, target_hidden_states,
+                      attn_metadata):
+        """
+        Mock model_fn.
+        Returns: (kv_caches, hidden_states_for_logits, residual_tuple)
+        - On first call (num_tokens == total_tokens):
+          Populate hidden_states_for_logits[last_token_indices] with base_token_ids.
+          Populate residual_tuple[0][last_token_indices] with base_token_ids.
+        - On loop calls (num_tokens == batch_size):
+          Use input_ids (previous draft token) to generate new token (input_ids + 1).
+          Populate hidden_states_for_logits with (input_ids + 1).
+          Populate residual_tuple[0] with (input_ids + 1).
+        """
+        num_tokens = input_ids.shape[0]
+        # This will be used for logits (output 2)
+        hidden_states_for_logits = jnp.zeros((num_tokens, hidden_size))
+        # This will be fed into the next step (output 3, item 0)
+        residual_hidden_states = jnp.zeros((num_tokens, hidden_size))
+        if num_tokens == total_tokens:
+            # First call in propose.
+            # `propose` will select from last_token_indices.
+            last_token_indices = attn_metadata.query_start_loc[1:] - 1
+            # Set logits output
+            hidden_states_for_logits = hidden_states_for_logits.at[
+                last_token_indices, 0].set(jnp.array(base_token_ids))
+            # Set residual for next step
+            residual_hidden_states = residual_hidden_states.at[
+                last_token_indices, 0].set(jnp.array(base_token_ids))
+        else:
+            # Subsequent calls in the loop
+            # input_ids is the previous draft token (shape `batch_size`)
+            # Mock logic: next token = previous token + 1
+            next_token_ids_encoded = input_ids + 1
+            # Set logits output
+            hidden_states_for_logits = hidden_states_for_logits.at[:, 0].set(
+                next_token_ids_encoded)
+            # Set residual for next step
+            residual_hidden_states = residual_hidden_states.at[:, 0].set(
+                next_token_ids_encoded)
+        # Return (kv_caches, hidden_states, residual_tuple)
+        return kv_caches, hidden_states_for_logits, (residual_hidden_states, )
+    def mock_compute_logits_fn(state, hidden_states, lora_metadata):
+        # Create deterministic logits from hidden_states.
+        # Takes the value from hidden_states[:, 0]
+        token_ids = hidden_states[:, 0].astype(jnp.int32)
+        return jax.nn.one_hot(token_ids, vocab_size)
+    def mock_combine_hidden_states_fn(state, hidden_states):
+        # Passthrough, as the mock doesn't need combination.
+        return hidden_states
+    proposer.model_fn = mock_model_fn
+    proposer.compute_logits_fn = mock_compute_logits_fn
+    proposer.combine_hidden_states_fn = mock_combine_hidden_states_fn
+    proposer.state = None  # Mock state
+    # Inputs
+    kv_caches = [None] * 1  # Mock kv_caches
+    # Create the 2D table first, as this is what the (unused) mock expects
+    block_tables_2d = jnp.zeros((batch_size, 10), dtype=jnp.int32)
+    attn_metadata = AttentionMetadata(
+        seq_lens=jnp.array([seq_len_1, seq_len_2]),
+        input_positions=jnp.concatenate(
+            [jnp.arange(seq_len_1),
+             jnp.arange(seq_len_2)]),
+        query_start_loc=jnp.array([0, seq_len_1, total_tokens]),
+        # Pass the FLATTENED table to simulate output of prepare_inputs
+        block_tables=block_tables_2d.reshape(-1),
+        request_distribution=None,
+    )
+    # These are the inputs to `propose`
+    # input_ids (from prepare_inputs)
+    target_token_ids = jnp.zeros(total_tokens, dtype=jnp.int32)
+    # target_hidden_states (from prepare_inputs)
+    target_hidden_states = jnp.zeros((total_tokens, hidden_size))
+    # last_token_indices (from prepare_inputs)
+    last_token_indices = attn_metadata.query_start_loc[1:] - 1
+    # Mock runner for block tables
+    # This mock isn't actually used by propose(), but we'll set it
+    # to the 2D table for correctness, as that's what
+    # _prepare_draft_inputs (called by prepare_inputs) would expect.
+    proposer.runner.input_batch.num_reqs = batch_size
+    proposer.runner.input_batch.block_table = [mock.MagicMock()]
+    (proposer.runner.input_batch.block_table[0].get_device_tensor.return_value
+     ) = block_tables_2d
+    # Execute
+    _, draft_token_ids = proposer.propose(
+        kv_caches,
+        target_token_ids,
+        attn_metadata,
+        last_token_indices,
+        target_hidden_states,
+    )
+    if draft_token_ids.ndim == 1:
+        draft_token_ids = jnp.expand_dims(draft_token_ids, axis=-1)
+    # Assertions
+    assert draft_token_ids.shape == (batch_size, num_speculative_tokens)
+    # Check the generated tokens
+    # Step 0: base_token_ids [42, 60]
+    # Step 1: [43, 61]
+    # Step 2: [44, 62]
+    # ...
+    expected_tokens = np.zeros((batch_size, num_speculative_tokens),
+                               dtype=np.int64)
+    for i in range(batch_size):
+        for j in range(num_speculative_tokens):
+            expected_tokens[i, j] = base_token_ids[i] + j
+    assert jnp.array_equal(draft_token_ids, jnp.array(expected_tokens))

tests/test_base.py CHANGED Viewed

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import logging
 import unittest
 import warnings

tests/test_envs.py CHANGED Viewed

@@ -60,6 +60,7 @@ def test_boolean_env_vars(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("SKIP_JAX_PRECOMPILE", "0")
     monkeypatch.setenv("VLLM_XLA_CHECK_RECOMPILATION", "0")
     monkeypatch.setenv("NEW_MODEL_DESIGN", "0")
+    monkeypatch.setenv("ENABLE_QUANTIZED_MATMUL_KERNEL", "0")
     monkeypatch.setenv("USE_MOE_EP_KERNEL", "0")
     # Test SKIP_JAX_PRECOMPILE (default False)
@@ -86,6 +87,82 @@ def test_boolean_env_vars(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("USE_MOE_EP_KERNEL", "1")
     assert envs.USE_MOE_EP_KERNEL is True
+    # Test ENABLE_QUANTIZED_MATMUL_KERNEL (default False)
+    assert envs.ENABLE_QUANTIZED_MATMUL_KERNEL is False
+    monkeypatch.setenv("ENABLE_QUANTIZED_MATMUL_KERNEL", "1")
+    assert envs.ENABLE_QUANTIZED_MATMUL_KERNEL is True
+def test_boolean_env_vars_string_values(monkeypatch: pytest.MonkeyPatch):
+    """Test that boolean env vars accept string values like 'True' and 'False'"""
+    # Test NEW_MODEL_DESIGN with string "True"
+    monkeypatch.setenv("NEW_MODEL_DESIGN", "True")
+    assert envs.NEW_MODEL_DESIGN is True
+    monkeypatch.setenv("NEW_MODEL_DESIGN", "true")
+    assert envs.NEW_MODEL_DESIGN is True
+    monkeypatch.setenv("NEW_MODEL_DESIGN", "False")
+    assert envs.NEW_MODEL_DESIGN is False
+    monkeypatch.setenv("NEW_MODEL_DESIGN", "false")
+    assert envs.NEW_MODEL_DESIGN is False
+    # Test SKIP_JAX_PRECOMPILE with string values
+    monkeypatch.setenv("SKIP_JAX_PRECOMPILE", "True")
+    assert envs.SKIP_JAX_PRECOMPILE is True
+    monkeypatch.setenv("SKIP_JAX_PRECOMPILE", "false")
+    assert envs.SKIP_JAX_PRECOMPILE is False
+    # Test VLLM_XLA_CHECK_RECOMPILATION with string values
+    monkeypatch.setenv("VLLM_XLA_CHECK_RECOMPILATION", "TRUE")
+    assert envs.VLLM_XLA_CHECK_RECOMPILATION is True
+    monkeypatch.setenv("VLLM_XLA_CHECK_RECOMPILATION", "FALSE")
+    assert envs.VLLM_XLA_CHECK_RECOMPILATION is False
+    # Test USE_MOE_EP_KERNEL with string values
+    monkeypatch.setenv("USE_MOE_EP_KERNEL", "true")
+    assert envs.USE_MOE_EP_KERNEL is True
+    monkeypatch.setenv("USE_MOE_EP_KERNEL", "False")
+    assert envs.USE_MOE_EP_KERNEL is False
+def test_boolean_env_vars_invalid_values(monkeypatch: pytest.MonkeyPatch):
+    """Test that boolean env vars raise errors for invalid values"""
+    # Test invalid value for NEW_MODEL_DESIGN
+    monkeypatch.setenv("NEW_MODEL_DESIGN", "yes")
+    with pytest.raises(
+            ValueError,
+            match="Invalid boolean value 'yes' for NEW_MODEL_DESIGN"):
+        _ = envs.NEW_MODEL_DESIGN
+    monkeypatch.setenv("NEW_MODEL_DESIGN", "2")
+    with pytest.raises(ValueError,
+                       match="Invalid boolean value '2' for NEW_MODEL_DESIGN"):
+        _ = envs.NEW_MODEL_DESIGN
+    # Test invalid value for SKIP_JAX_PRECOMPILE
+    monkeypatch.setenv("SKIP_JAX_PRECOMPILE", "invalid")
+    with pytest.raises(
+            ValueError,
+            match="Invalid boolean value 'invalid' for SKIP_JAX_PRECOMPILE"):
+        _ = envs.SKIP_JAX_PRECOMPILE
+def test_boolean_env_vars_empty_string(monkeypatch: pytest.MonkeyPatch):
+    """Test that empty string returns default value"""
+    monkeypatch.setenv("NEW_MODEL_DESIGN", "")
+    assert envs.NEW_MODEL_DESIGN is False  # Should return default
+    monkeypatch.setenv("SKIP_JAX_PRECOMPILE", "")
+    assert envs.SKIP_JAX_PRECOMPILE is False  # Should return default
 def test_integer_env_vars(monkeypatch: pytest.MonkeyPatch):
     # Ensure clean environment for integer vars by setting to defaults
@@ -179,7 +256,7 @@ def test_disaggregated_serving_env_vars(monkeypatch: pytest.MonkeyPatch):
 def test_model_impl_type_default(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.delenv("MODEL_IMPL_TYPE", raising=False)
-    assert envs.MODEL_IMPL_TYPE == "flax_nnx"
+    assert envs.MODEL_IMPL_TYPE == "auto"
 def test_cache_preserves_values_across_env_changes(

tests/test_tpu_info.py CHANGED Viewed

@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 from unittest.mock import MagicMock, patch

tests/test_utils.py CHANGED Viewed

@@ -9,7 +9,7 @@ import pytest
 from tpu_inference.utils import (GBYTES, enable_megacore,
                                  get_jax_dtype_from_str_dtype, get_megacore,
                                  get_padded_head_dim, hbm_usage_bytes,
-                                 hbm_usage_gb, quantize_kv)
+                                 hbm_usage_gb)
 def test_enable_and_get_megacore():
@@ -182,48 +182,6 @@ def test_get_padded_head_dim(head_dim, expected_padded_head_dim):
     assert get_padded_head_dim(head_dim) == expected_padded_head_dim
-def test_quantize_kv_float8_e4m3fn():
-    """Tests the quantize_kv function with float8_e4m3fn dtype."""
-    key = jnp.array([-1.0, 0.5, 1.0, 1.5])
-    value = jnp.array([2.0, 0.0, -2.0, -3.0])
-    kv_cache_quantized_dtype = jnp.float8_e4m3fn
-    k_scale = 0.1
-    v_scale = 0.2
-    quantized_key, quantized_value = quantize_kv(key, value,
-                                                 kv_cache_quantized_dtype,
-                                                 k_scale, v_scale)
-    # Expected key: key / k_scale -> clip -> astype
-    # [-10., 5., 10., 15.] are within float8_e4m3fn range
-    expected_key = jnp.array([-10.0, 5.0, 10.0, 15.0], dtype=jnp.float8_e4m3fn)
-    # Expected value: value / v_scale -> clip -> astype
-    # [10., 0., -10., -15.] are within float8_e4m3fn range
-    expected_value = jnp.array([10.0, 0.0, -10.0, -15.0],
-                               dtype=jnp.float8_e4m3fn)
-    assert jnp.array_equal(quantized_key, expected_key)
-    assert jnp.array_equal(quantized_value, expected_value)
-    # Test clipping
-    dtype_info = jnp.finfo(kv_cache_quantized_dtype)
-    minval, maxval = float(dtype_info.min), float(dtype_info.max)
-    # Values that will be outside the range after scaling
-    key_clip = jnp.array([minval * k_scale * 2, maxval * k_scale * 2])
-    value_clip = jnp.array([maxval * v_scale * 2, minval * v_scale * 2])
-    quantized_key_clip, quantized_value_clip = quantize_kv(
-        key_clip, value_clip, kv_cache_quantized_dtype, k_scale, v_scale)
-    # Values should be clipped to the min/max of the float8 dtype
-    expected_key_clip = jnp.array([minval, maxval], dtype=jnp.float8_e4m3fn)
-    expected_value_clip = jnp.array([maxval, minval], dtype=jnp.float8_e4m3fn)
-    assert jnp.array_equal(quantized_key_clip, expected_key_clip)
-    assert jnp.array_equal(quantized_value_clip, expected_value_clip)
 def test_get_jax_dtype_from_str_dtype():
     """
     Test the get_jax_dtype_from_str_dtype function

tests/worker/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tpu-inference 0.11.1.dev202512030818__py3-none-any.whl → 0.13.2rc3__py3-none-any.whl

Potentially problematic release.

tpu-inference 0.11.1.dev202512030818__py3-none-any.whl → 0.13.2rc3__py3-none-any.whl