tpu-inference 0.12.0.dev20251207__tar.gz → 0.12.0rc1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tpu-inference might be problematic. Click here for more details.
- {tpu_inference-0.12.0.dev20251207/tpu_inference.egg-info → tpu_inference-0.12.0rc1}/PKG-INFO +2 -2
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/requirements.txt +1 -1
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/kernels/mla_v1_test.py +41 -129
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/test_quantization.py +0 -3
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/mla/v1/kernel.py +120 -98
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/common/attention_interface.py +1 -1
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/common/sharding.py +2 -6
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/attention/deepseek_v3_attention.py +64 -232
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/attention/gpt_oss_attention.py +5 -5
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/fused_moe.py +204 -117
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/mxfp4.py +71 -61
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/unquantized.py +58 -46
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/common/model_loader.py +2 -5
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/deepseek_v3.py +64 -185
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/gpt_oss.py +3 -3
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/quantization_utils.py +2 -4
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/weight_utils.py +1 -7
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/platforms/tpu_platform.py +3 -7
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/compilation_manager.py +2 -3
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/kv_cache.py +20 -38
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/kv_cache_manager.py +15 -31
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/tpu_runner.py +2 -9
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/utils.py +5 -9
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/worker/tpu_worker.py +10 -24
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1/tpu_inference.egg-info}/PKG-INFO +2 -2
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference.egg-info/requires.txt +1 -1
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/LICENSE +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/MANIFEST.in +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/README.md +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/pyproject.toml +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/setup.cfg +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/setup.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/core/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/core/test_core_tpu.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/core/test_disagg_executor.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/core/test_disagg_utils.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/core/test_dp_scheduler.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/core/test_init.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/kernels/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/kernels/fused_moe_v1_test.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/kernels/quantized_matmul_kernel_test.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/kernels/ragged_kv_cache_update_v2_test.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/kernels/ragged_paged_attention_kernel_v2_test.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/kernels/ragged_paged_attention_kernel_v3_test.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/lora/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/lora/conftest.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/lora/test_bgmv.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/lora/test_layers.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/lora/test_lora.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/lora/utils.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/test_base.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/test_envs.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/test_tpu_info.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/test_utils.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/core/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/core/core_tpu.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/core/disagg_executor.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/core/disagg_utils.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/core/sched/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/core/sched/dp_scheduler.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/distributed/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/distributed/jax_parallel_state.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/distributed/tpu_connector.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/distributed/utils.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/env_override.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/envs.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/executors/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/executors/ray_distributed_executor.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/experimental/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/experimental/llama3_jax_stashed.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/collectives/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/collectives/all_gather_matmul.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/collectives/util.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/flash_attention/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/flash_attention/kernel.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/fused_moe/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/fused_moe/v1/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/fused_moe/v1/kernel.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/mla/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/mla/v1/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/quantized_matmul/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/quantized_matmul/kernel.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/quantized_matmul/util.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v3/util.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/common/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/common/attention_metadata.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/common/binary_search.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/common/quant_methods.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/attention/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/attention/attention.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/attention/llama4_attention.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/base.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/constants.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/layers.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/misc.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/moe/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/moe/deepseek_v3_moe.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/moe/gpt_oss_moe.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/moe/moe.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/rope.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/rope_interface.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/sample/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/sample/rejection_sampler.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/sample/sampling.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/sample/sampling_metadata.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/transformer_block.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/attention.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/linear_common.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/awq.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/common.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/sharding.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/logger.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/lora/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/lora/torch_lora_ops.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/lora/torch_punica_tpu.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/common/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/jax_intermediate_tensor.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/llama3.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/llama4.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/llama_eagle3.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/llama_guard_4.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/qwen2.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/qwen2_5_vl.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/qwen3.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/file_utils.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/multi_modal_utils.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/vllm/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/vllm/vllm_model_wrapper.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/vllm/vllm_model_wrapper_context.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/platforms/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/block_table.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/input_batch.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/lora_utils.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/multimodal_manager.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/persistent_batch_manager.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/speculative_decoding_manager.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/structured_decoding_manager.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/utils.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/spec_decode/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/spec_decode/jax/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/spec_decode/jax/eagle3.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/tpu_info.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/worker/__init__.py +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference.egg-info/SOURCES.txt +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference.egg-info/dependency_links.txt +0 -0
- {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference.egg-info/top_level.txt +0 -0
{tpu_inference-0.12.0.dev20251207/tpu_inference.egg-info → tpu_inference-0.12.0rc1}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tpu_inference
|
|
3
|
-
Version: 0.12.0.dev20251207
|
|
3
|
+
Version: 0.12.0rc1
|
|
4
4
|
Author: tpu_inference Contributors
|
|
5
5
|
Classifier: Development Status :: 3 - Alpha
|
|
6
6
|
Classifier: Intended Audience :: Developers
|
|
@@ -25,7 +25,7 @@ Requires-Dist: jax[tpu]==0.8.0
|
|
|
25
25
|
Requires-Dist: jaxlib==0.8.0
|
|
26
26
|
Requires-Dist: jaxtyping
|
|
27
27
|
Requires-Dist: flax==0.11.1
|
|
28
|
-
Requires-Dist: torchax==0.0.
|
|
28
|
+
Requires-Dist: torchax==0.0.7
|
|
29
29
|
Requires-Dist: qwix==0.1.1
|
|
30
30
|
Requires-Dist: torchvision==0.24.0
|
|
31
31
|
Requires-Dist: pathwaysutils
|
|
@@ -42,7 +42,6 @@ class MlaRaggedPagedAttentionKernelTest(jtu.JaxTestCase):
|
|
|
42
42
|
|
|
43
43
|
padded_r_dim = align_to(r_dim, 128)
|
|
44
44
|
padded_lkv_dim = align_to(lkv_dim, 128)
|
|
45
|
-
padded_kv_dim = padded_lkv_dim + padded_r_dim
|
|
46
45
|
packing = get_dtype_packing(kv_dtype)
|
|
47
46
|
q_lens = [s[0] for s in seq_lens]
|
|
48
47
|
kv_lens_list = [s[1] for s in seq_lens]
|
|
@@ -70,10 +69,13 @@ class MlaRaggedPagedAttentionKernelTest(jtu.JaxTestCase):
|
|
|
70
69
|
new_kv_c = gen_random((total_q_len, lkv_dim), kv_dtype)
|
|
71
70
|
new_k_pe = gen_random((total_q_len, r_dim), kv_dtype)
|
|
72
71
|
|
|
73
|
-
|
|
74
|
-
(total_num_pages, page_size // packing, packing,
|
|
72
|
+
cache_kv_c = gen_random(
|
|
73
|
+
(total_num_pages, page_size // packing, packing, padded_lkv_dim),
|
|
75
74
|
kv_dtype,
|
|
76
75
|
)
|
|
76
|
+
cache_k_pe = gen_random(
|
|
77
|
+
(total_num_pages, page_size // packing, packing, padded_r_dim),
|
|
78
|
+
kv_dtype)
|
|
77
79
|
kv_lens = jnp.array(kv_lens_list, dtype=jnp.int32)
|
|
78
80
|
page_indices = jnp.array(page_indices_list, dtype=jnp.int32)
|
|
79
81
|
cu_q_lens = jnp.array(cu_q_lens_list, dtype=jnp.int32)
|
|
@@ -82,13 +84,14 @@ class MlaRaggedPagedAttentionKernelTest(jtu.JaxTestCase):
|
|
|
82
84
|
ql_nope_for_kernel = ql_nope.copy()
|
|
83
85
|
q_pe_for_kernel = q_pe.copy()
|
|
84
86
|
|
|
85
|
-
expected_out,
|
|
87
|
+
expected_out, expected_updated_kv_c, expeceted_updated_k_pe = (
|
|
86
88
|
mla.ref_mla_ragged_paged_attention(
|
|
87
89
|
ql_nope,
|
|
88
90
|
q_pe,
|
|
89
91
|
new_kv_c,
|
|
90
92
|
new_k_pe,
|
|
91
|
-
|
|
93
|
+
cache_kv_c.copy(),
|
|
94
|
+
cache_k_pe.copy(),
|
|
92
95
|
kv_lens,
|
|
93
96
|
page_indices,
|
|
94
97
|
cu_q_lens,
|
|
@@ -98,140 +101,49 @@ class MlaRaggedPagedAttentionKernelTest(jtu.JaxTestCase):
|
|
|
98
101
|
soft_cap=soft_cap,
|
|
99
102
|
))
|
|
100
103
|
|
|
101
|
-
kernel_out,
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
104
|
+
kernel_out, kernel_updated_kv_c, kernel_updated_k_pe = (
|
|
105
|
+
mla.mla_ragged_paged_attention(
|
|
106
|
+
ql_nope_for_kernel,
|
|
107
|
+
q_pe_for_kernel,
|
|
108
|
+
new_kv_c,
|
|
109
|
+
new_k_pe,
|
|
110
|
+
cache_kv_c.copy(),
|
|
111
|
+
cache_k_pe.copy(),
|
|
112
|
+
kv_lens,
|
|
113
|
+
page_indices,
|
|
114
|
+
cu_q_lens,
|
|
115
|
+
distribution,
|
|
116
|
+
sm_scale=sm_scale,
|
|
117
|
+
sliding_window=sliding_window,
|
|
118
|
+
soft_cap=soft_cap,
|
|
119
|
+
num_kv_pages_per_block=num_kv_pages_per_block,
|
|
120
|
+
num_queries_per_block=num_queries_per_block,
|
|
121
|
+
vmem_limit_bytes=vmem_limit_bytes,
|
|
122
|
+
))
|
|
118
123
|
|
|
119
124
|
self.assertEqual(expected_out.shape,
|
|
120
125
|
(total_q_len, num_heads, padded_lkv_dim))
|
|
121
126
|
self.assertEqual(
|
|
122
|
-
|
|
123
|
-
(total_num_pages, page_size // packing, packing,
|
|
127
|
+
expected_updated_kv_c.shape,
|
|
128
|
+
(total_num_pages, page_size // packing, packing, padded_lkv_dim),
|
|
129
|
+
)
|
|
130
|
+
self.assertEqual(
|
|
131
|
+
expeceted_updated_k_pe.shape,
|
|
132
|
+
(total_num_pages, page_size // packing, packing, padded_r_dim),
|
|
124
133
|
)
|
|
125
134
|
self.assertEqual(expected_out.dtype, kv_dtype)
|
|
126
|
-
self.assertEqual(
|
|
135
|
+
self.assertEqual(expected_updated_kv_c.dtype, kv_dtype)
|
|
136
|
+
self.assertEqual(expeceted_updated_k_pe.dtype, kv_dtype)
|
|
127
137
|
|
|
128
138
|
self.assertAllClose(expected_out, kernel_out, atol=0.2, rtol=0.2)
|
|
129
|
-
self.assertAllClose(
|
|
130
|
-
|
|
139
|
+
self.assertAllClose(expected_updated_kv_c,
|
|
140
|
+
kernel_updated_kv_c,
|
|
141
|
+
atol=0.2,
|
|
142
|
+
rtol=0.2)
|
|
143
|
+
self.assertAllClose(expeceted_updated_k_pe,
|
|
144
|
+
kernel_updated_k_pe,
|
|
131
145
|
atol=0.2,
|
|
132
146
|
rtol=0.2)
|
|
133
|
-
|
|
134
|
-
def test_update_kv_cache(self):
|
|
135
|
-
lkv_dim = 4
|
|
136
|
-
r_dim = 4
|
|
137
|
-
padded_lkv_dim = align_to(lkv_dim, 128)
|
|
138
|
-
padded_r_dim = align_to(r_dim, 128)
|
|
139
|
-
kv_dtype = jnp.bfloat16
|
|
140
|
-
new_kv_c = jnp.arange(16, dtype=kv_dtype).reshape((4, lkv_dim))
|
|
141
|
-
new_k_pe = (jnp.arange(16, dtype=kv_dtype).reshape((4, r_dim)) + 100)
|
|
142
|
-
total_num_pages = 2
|
|
143
|
-
page_size = 4
|
|
144
|
-
cache_kv_shape = mla.get_kv_cache_shape(
|
|
145
|
-
total_num_pages,
|
|
146
|
-
page_size,
|
|
147
|
-
padded_lkv_dim + padded_r_dim,
|
|
148
|
-
kv_dtype,
|
|
149
|
-
)
|
|
150
|
-
cache_kv = jnp.zeros(cache_kv_shape, dtype=kv_dtype)
|
|
151
|
-
|
|
152
|
-
# two sequences, first with 3 tokens, second with 1 token
|
|
153
|
-
kv_lens = jnp.array([3, 1], dtype=jnp.int32)
|
|
154
|
-
# first seq uses page 0, second uses page 1
|
|
155
|
-
page_indices = jnp.array([0, -1, 1, -1], dtype=jnp.int32)
|
|
156
|
-
# three tokens for first seq, one for second
|
|
157
|
-
cu_q_lens = jnp.array([0, 3, 4], dtype=jnp.int32)
|
|
158
|
-
distribution = jnp.array([0, 0, 2], dtype=jnp.int32)
|
|
159
|
-
|
|
160
|
-
# manually compute the expected cache
|
|
161
|
-
padded_new_kv_c = jnp.pad(new_kv_c,
|
|
162
|
-
((0, 0), (0, padded_lkv_dim - lkv_dim)),
|
|
163
|
-
constant_values=0)
|
|
164
|
-
padded_new_k_pe = jnp.pad(new_k_pe,
|
|
165
|
-
((0, 0), (0, padded_r_dim - r_dim)),
|
|
166
|
-
constant_values=0)
|
|
167
|
-
|
|
168
|
-
expected_cache = cache_kv
|
|
169
|
-
# First sequence
|
|
170
|
-
# token 0
|
|
171
|
-
page_idx, row, col = 0, 0, 0
|
|
172
|
-
expected_cache = expected_cache.at[page_idx, row,
|
|
173
|
-
col, :padded_lkv_dim].set(
|
|
174
|
-
padded_new_kv_c[0])
|
|
175
|
-
expected_cache = expected_cache.at[page_idx, row, col,
|
|
176
|
-
padded_lkv_dim:padded_lkv_dim +
|
|
177
|
-
padded_r_dim].set(
|
|
178
|
-
padded_new_k_pe[0])
|
|
179
|
-
# token 1
|
|
180
|
-
page_idx, row, col = 0, 0, 1
|
|
181
|
-
expected_cache = expected_cache.at[page_idx, row,
|
|
182
|
-
col, :padded_lkv_dim].set(
|
|
183
|
-
padded_new_kv_c[1])
|
|
184
|
-
expected_cache = expected_cache.at[page_idx, row, col,
|
|
185
|
-
padded_lkv_dim:padded_lkv_dim +
|
|
186
|
-
padded_r_dim].set(
|
|
187
|
-
padded_new_k_pe[1])
|
|
188
|
-
# token 2
|
|
189
|
-
page_idx, row, col = 0, 1, 0
|
|
190
|
-
expected_cache = expected_cache.at[page_idx, row,
|
|
191
|
-
col, :padded_lkv_dim].set(
|
|
192
|
-
padded_new_kv_c[2])
|
|
193
|
-
expected_cache = expected_cache.at[page_idx, row, col,
|
|
194
|
-
padded_lkv_dim:padded_lkv_dim +
|
|
195
|
-
padded_r_dim].set(
|
|
196
|
-
padded_new_k_pe[2])
|
|
197
|
-
|
|
198
|
-
# Second sequence
|
|
199
|
-
# token 0
|
|
200
|
-
page_idx, row, col = 1, 0, 0
|
|
201
|
-
expected_cache = expected_cache.at[page_idx, row,
|
|
202
|
-
col, :padded_lkv_dim].set(
|
|
203
|
-
padded_new_kv_c[3])
|
|
204
|
-
expected_cache = expected_cache.at[page_idx, row, col,
|
|
205
|
-
padded_lkv_dim:padded_lkv_dim +
|
|
206
|
-
padded_r_dim].set(
|
|
207
|
-
padded_new_k_pe[3])
|
|
208
|
-
|
|
209
|
-
updated_cache = mla.update_kv_cache(
|
|
210
|
-
new_kv_c,
|
|
211
|
-
new_k_pe,
|
|
212
|
-
cache_kv,
|
|
213
|
-
kv_lens,
|
|
214
|
-
page_indices,
|
|
215
|
-
cu_q_lens,
|
|
216
|
-
distribution,
|
|
217
|
-
)
|
|
218
|
-
|
|
219
|
-
self.assertAllClose(updated_cache, expected_cache)
|
|
220
|
-
|
|
221
|
-
def test_get_kv_cache_shape(self):
|
|
222
|
-
total_num_pages = 10
|
|
223
|
-
page_size = 16
|
|
224
|
-
lkv_dim = 128
|
|
225
|
-
kv_dtype = jnp.bfloat16
|
|
226
|
-
# The calculation for the expected shape is as follows:
|
|
227
|
-
# kv_packing is determined by the dtype, which is 2 for bfloat16.
|
|
228
|
-
# The second dimension is page_size / kv_packing = 16 / 2 = 8
|
|
229
|
-
# The third dimension is kv_packing = 2
|
|
230
|
-
# The fourth dimension is lkv_dim aligned to 128, which is 128
|
|
231
|
-
expected_shape = (10, 8, 2, 128)
|
|
232
|
-
self.assertEqual(
|
|
233
|
-
mla.get_kv_cache_shape(total_num_pages, page_size, lkv_dim,
|
|
234
|
-
kv_dtype), expected_shape)
|
|
235
147
|
|
|
236
148
|
def test_ragged_paged_attention_basic(self):
|
|
237
149
|
dtype = jnp.bfloat16
|
|
@@ -112,8 +112,6 @@ class TestQwixQuantizeNnxModel(unittest.TestCase):
|
|
|
112
112
|
self.mesh = Mesh(jax.devices(), ('model', ))
|
|
113
113
|
self.rng = jax.random.PRNGKey(0)
|
|
114
114
|
self.model = SimpleModel(rngs=nnx.Rngs(0))
|
|
115
|
-
self.model.vllm_config = MagicMock()
|
|
116
|
-
self.model.vllm_config.model_config.use_mla = False
|
|
117
115
|
|
|
118
116
|
self.qwix_config = [
|
|
119
117
|
{
|
|
@@ -133,7 +131,6 @@ class TestQwixQuantizeNnxModel(unittest.TestCase):
|
|
|
133
131
|
"""Test that qwix.quantize_model is called with the correct arguments."""
|
|
134
132
|
quantized_model_mock = MagicMock(spec=nnx.Module)
|
|
135
133
|
mock_quantize_model.return_value = quantized_model_mock
|
|
136
|
-
self.model.vllm_config.sharding_config.total_dp_size = 1
|
|
137
134
|
|
|
138
135
|
with patch(
|
|
139
136
|
"tpu_inference.models.jax.utils.quantization.quantization_utils.init_logger",
|