tpu-inference 0.12.0.dev20251207__tar.gz → 0.12.0rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tpu-inference might be problematic. Click here for more details.

Files changed (182)
  1. {tpu_inference-0.12.0.dev20251207/tpu_inference.egg-info → tpu_inference-0.12.0rc1}/PKG-INFO +2 -2
  2. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/requirements.txt +1 -1
  3. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/kernels/mla_v1_test.py +41 -129
  4. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/test_quantization.py +0 -3
  5. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/mla/v1/kernel.py +120 -98
  6. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/common/attention_interface.py +1 -1
  7. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/common/sharding.py +2 -6
  8. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/attention/deepseek_v3_attention.py +64 -232
  9. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/attention/gpt_oss_attention.py +5 -5
  10. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/fused_moe.py +204 -117
  11. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/mxfp4.py +71 -61
  12. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/unquantized.py +58 -46
  13. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/common/model_loader.py +2 -5
  14. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/deepseek_v3.py +64 -185
  15. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/gpt_oss.py +3 -3
  16. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/quantization_utils.py +2 -4
  17. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/weight_utils.py +1 -7
  18. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/platforms/tpu_platform.py +3 -7
  19. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/compilation_manager.py +2 -3
  20. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/kv_cache.py +20 -38
  21. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/kv_cache_manager.py +15 -31
  22. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/tpu_runner.py +2 -9
  23. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/utils.py +5 -9
  24. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/worker/tpu_worker.py +10 -24
  25. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1/tpu_inference.egg-info}/PKG-INFO +2 -2
  26. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference.egg-info/requires.txt +1 -1
  27. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/LICENSE +0 -0
  28. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/MANIFEST.in +0 -0
  29. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/README.md +0 -0
  30. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/pyproject.toml +0 -0
  31. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/setup.cfg +0 -0
  32. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/setup.py +0 -0
  33. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/__init__.py +0 -0
  34. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/core/__init__.py +0 -0
  35. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/core/test_core_tpu.py +0 -0
  36. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/core/test_disagg_executor.py +0 -0
  37. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/core/test_disagg_utils.py +0 -0
  38. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/core/test_dp_scheduler.py +0 -0
  39. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/core/test_init.py +0 -0
  40. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/kernels/__init__.py +0 -0
  41. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/kernels/fused_moe_v1_test.py +0 -0
  42. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/kernels/quantized_matmul_kernel_test.py +0 -0
  43. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/kernels/ragged_kv_cache_update_v2_test.py +0 -0
  44. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/kernels/ragged_paged_attention_kernel_v2_test.py +0 -0
  45. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +0 -0
  46. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/kernels/ragged_paged_attention_kernel_v3_test.py +0 -0
  47. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/lora/__init__.py +0 -0
  48. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/lora/conftest.py +0 -0
  49. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/lora/test_bgmv.py +0 -0
  50. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/lora/test_layers.py +0 -0
  51. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/lora/test_lora.py +0 -0
  52. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/lora/utils.py +0 -0
  53. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/test_base.py +0 -0
  54. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/test_envs.py +0 -0
  55. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/test_tpu_info.py +0 -0
  56. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tests/test_utils.py +0 -0
  57. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/__init__.py +0 -0
  58. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/core/__init__.py +0 -0
  59. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/core/core_tpu.py +0 -0
  60. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/core/disagg_executor.py +0 -0
  61. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/core/disagg_utils.py +0 -0
  62. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/core/sched/__init__.py +0 -0
  63. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/core/sched/dp_scheduler.py +0 -0
  64. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/distributed/__init__.py +0 -0
  65. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/distributed/jax_parallel_state.py +0 -0
  66. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/distributed/tpu_connector.py +0 -0
  67. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/distributed/utils.py +0 -0
  68. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/env_override.py +0 -0
  69. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/envs.py +0 -0
  70. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/executors/__init__.py +0 -0
  71. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/executors/ray_distributed_executor.py +0 -0
  72. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/experimental/__init__.py +0 -0
  73. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/experimental/llama3_jax_stashed.py +0 -0
  74. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/__init__.py +0 -0
  75. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/collectives/__init__.py +0 -0
  76. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/collectives/all_gather_matmul.py +0 -0
  77. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +0 -0
  78. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/collectives/util.py +0 -0
  79. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/flash_attention/__init__.py +0 -0
  80. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/flash_attention/kernel.py +0 -0
  81. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/fused_moe/__init__.py +0 -0
  82. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/fused_moe/v1/__init__.py +0 -0
  83. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/fused_moe/v1/kernel.py +0 -0
  84. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/mla/__init__.py +0 -0
  85. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/mla/v1/__init__.py +0 -0
  86. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/quantized_matmul/__init__.py +0 -0
  87. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/quantized_matmul/kernel.py +0 -0
  88. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +0 -0
  89. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/quantized_matmul/util.py +0 -0
  90. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/__init__.py +0 -0
  91. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +0 -0
  92. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +0 -0
  93. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +0 -0
  94. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +0 -0
  95. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +0 -0
  96. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +0 -0
  97. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +0 -0
  98. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +0 -0
  99. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +0 -0
  100. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/kernels/ragged_paged_attention/v3/util.py +0 -0
  101. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/__init__.py +0 -0
  102. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/common/__init__.py +0 -0
  103. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/common/attention_metadata.py +0 -0
  104. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/common/binary_search.py +0 -0
  105. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/common/quant_methods.py +0 -0
  106. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/__init__.py +0 -0
  107. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/attention/__init__.py +0 -0
  108. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/attention/attention.py +0 -0
  109. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/attention/llama4_attention.py +0 -0
  110. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/base.py +0 -0
  111. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/constants.py +0 -0
  112. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/layers.py +0 -0
  113. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/misc.py +0 -0
  114. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/moe/__init__.py +0 -0
  115. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/moe/deepseek_v3_moe.py +0 -0
  116. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/moe/gpt_oss_moe.py +0 -0
  117. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/moe/moe.py +0 -0
  118. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/rope.py +0 -0
  119. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/rope_interface.py +0 -0
  120. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/sample/__init__.py +0 -0
  121. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/sample/rejection_sampler.py +0 -0
  122. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/sample/sampling.py +0 -0
  123. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/sample/sampling_metadata.py +0 -0
  124. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/jax/transformer_block.py +0 -0
  125. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/__init__.py +0 -0
  126. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/attention.py +0 -0
  127. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/linear_common.py +0 -0
  128. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/__init__.py +0 -0
  129. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/awq.py +0 -0
  130. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/common.py +0 -0
  131. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +0 -0
  132. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +0 -0
  133. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +0 -0
  134. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +0 -0
  135. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
  136. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +0 -0
  137. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/layers/vllm/sharding.py +0 -0
  138. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/logger.py +0 -0
  139. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/lora/__init__.py +0 -0
  140. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/lora/torch_lora_ops.py +0 -0
  141. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/lora/torch_punica_tpu.py +0 -0
  142. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/__init__.py +0 -0
  143. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/common/__init__.py +0 -0
  144. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/__init__.py +0 -0
  145. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/jax_intermediate_tensor.py +0 -0
  146. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/llama3.py +0 -0
  147. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/llama4.py +0 -0
  148. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/llama_eagle3.py +0 -0
  149. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/llama_guard_4.py +0 -0
  150. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/qwen2.py +0 -0
  151. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/qwen2_5_vl.py +0 -0
  152. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/qwen3.py +0 -0
  153. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/__init__.py +0 -0
  154. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/file_utils.py +0 -0
  155. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/multi_modal_utils.py +0 -0
  156. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
  157. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -0
  158. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -0
  159. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -0
  160. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -0
  161. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -0
  162. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/vllm/__init__.py +0 -0
  163. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/vllm/vllm_model_wrapper.py +0 -0
  164. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/models/vllm/vllm_model_wrapper_context.py +0 -0
  165. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/platforms/__init__.py +0 -0
  166. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/__init__.py +0 -0
  167. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/block_table.py +0 -0
  168. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/input_batch.py +0 -0
  169. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/lora_utils.py +0 -0
  170. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/multimodal_manager.py +0 -0
  171. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/persistent_batch_manager.py +0 -0
  172. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/speculative_decoding_manager.py +0 -0
  173. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/structured_decoding_manager.py +0 -0
  174. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/runner/utils.py +0 -0
  175. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/spec_decode/__init__.py +0 -0
  176. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/spec_decode/jax/__init__.py +0 -0
  177. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/spec_decode/jax/eagle3.py +0 -0
  178. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/tpu_info.py +0 -0
  179. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference/worker/__init__.py +0 -0
  180. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference.egg-info/SOURCES.txt +0 -0
  181. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference.egg-info/dependency_links.txt +0 -0
  182. {tpu_inference-0.12.0.dev20251207 → tpu_inference-0.12.0rc1}/tpu_inference.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tpu_inference
3
- Version: 0.12.0.dev20251207
3
+ Version: 0.12.0rc1
4
4
  Author: tpu_inference Contributors
5
5
  Classifier: Development Status :: 3 - Alpha
6
6
  Classifier: Intended Audience :: Developers
@@ -25,7 +25,7 @@ Requires-Dist: jax[tpu]==0.8.0
25
25
  Requires-Dist: jaxlib==0.8.0
26
26
  Requires-Dist: jaxtyping
27
27
  Requires-Dist: flax==0.11.1
28
- Requires-Dist: torchax==0.0.10
28
+ Requires-Dist: torchax==0.0.7
29
29
  Requires-Dist: qwix==0.1.1
30
30
  Requires-Dist: torchvision==0.24.0
31
31
  Requires-Dist: pathwaysutils
@@ -9,7 +9,7 @@ jax[tpu]==0.8.0
9
9
  jaxlib==0.8.0
10
10
  jaxtyping
11
11
  flax==0.11.1
12
- torchax==0.0.10
12
+ torchax==0.0.7
13
13
  qwix==0.1.1
14
14
  torchvision==0.24.0
15
15
  pathwaysutils
@@ -42,7 +42,6 @@ class MlaRaggedPagedAttentionKernelTest(jtu.JaxTestCase):
42
42
 
43
43
  padded_r_dim = align_to(r_dim, 128)
44
44
  padded_lkv_dim = align_to(lkv_dim, 128)
45
- padded_kv_dim = padded_lkv_dim + padded_r_dim
46
45
  packing = get_dtype_packing(kv_dtype)
47
46
  q_lens = [s[0] for s in seq_lens]
48
47
  kv_lens_list = [s[1] for s in seq_lens]
@@ -70,10 +69,13 @@ class MlaRaggedPagedAttentionKernelTest(jtu.JaxTestCase):
70
69
  new_kv_c = gen_random((total_q_len, lkv_dim), kv_dtype)
71
70
  new_k_pe = gen_random((total_q_len, r_dim), kv_dtype)
72
71
 
73
- cache_kv = gen_random(
74
- (total_num_pages, page_size // packing, packing, padded_kv_dim),
72
+ cache_kv_c = gen_random(
73
+ (total_num_pages, page_size // packing, packing, padded_lkv_dim),
75
74
  kv_dtype,
76
75
  )
76
+ cache_k_pe = gen_random(
77
+ (total_num_pages, page_size // packing, packing, padded_r_dim),
78
+ kv_dtype)
77
79
  kv_lens = jnp.array(kv_lens_list, dtype=jnp.int32)
78
80
  page_indices = jnp.array(page_indices_list, dtype=jnp.int32)
79
81
  cu_q_lens = jnp.array(cu_q_lens_list, dtype=jnp.int32)
@@ -82,13 +84,14 @@ class MlaRaggedPagedAttentionKernelTest(jtu.JaxTestCase):
82
84
  ql_nope_for_kernel = ql_nope.copy()
83
85
  q_pe_for_kernel = q_pe.copy()
84
86
 
85
- expected_out, expected_updated_kv = (
87
+ expected_out, expected_updated_kv_c, expeceted_updated_k_pe = (
86
88
  mla.ref_mla_ragged_paged_attention(
87
89
  ql_nope,
88
90
  q_pe,
89
91
  new_kv_c,
90
92
  new_k_pe,
91
- cache_kv.copy(),
93
+ cache_kv_c.copy(),
94
+ cache_k_pe.copy(),
92
95
  kv_lens,
93
96
  page_indices,
94
97
  cu_q_lens,
@@ -98,140 +101,49 @@ class MlaRaggedPagedAttentionKernelTest(jtu.JaxTestCase):
98
101
  soft_cap=soft_cap,
99
102
  ))
100
103
 
101
- kernel_out, kernel_updated_kv = (mla.mla_ragged_paged_attention(
102
- ql_nope_for_kernel,
103
- q_pe_for_kernel,
104
- new_kv_c,
105
- new_k_pe,
106
- cache_kv.copy(),
107
- kv_lens,
108
- page_indices,
109
- cu_q_lens,
110
- distribution,
111
- sm_scale=sm_scale,
112
- sliding_window=sliding_window,
113
- soft_cap=soft_cap,
114
- num_kv_pages_per_block=num_kv_pages_per_block,
115
- num_queries_per_block=num_queries_per_block,
116
- vmem_limit_bytes=vmem_limit_bytes,
117
- ))
104
+ kernel_out, kernel_updated_kv_c, kernel_updated_k_pe = (
105
+ mla.mla_ragged_paged_attention(
106
+ ql_nope_for_kernel,
107
+ q_pe_for_kernel,
108
+ new_kv_c,
109
+ new_k_pe,
110
+ cache_kv_c.copy(),
111
+ cache_k_pe.copy(),
112
+ kv_lens,
113
+ page_indices,
114
+ cu_q_lens,
115
+ distribution,
116
+ sm_scale=sm_scale,
117
+ sliding_window=sliding_window,
118
+ soft_cap=soft_cap,
119
+ num_kv_pages_per_block=num_kv_pages_per_block,
120
+ num_queries_per_block=num_queries_per_block,
121
+ vmem_limit_bytes=vmem_limit_bytes,
122
+ ))
118
123
 
119
124
  self.assertEqual(expected_out.shape,
120
125
  (total_q_len, num_heads, padded_lkv_dim))
121
126
  self.assertEqual(
122
- expected_updated_kv.shape,
123
- (total_num_pages, page_size // packing, packing, padded_kv_dim),
127
+ expected_updated_kv_c.shape,
128
+ (total_num_pages, page_size // packing, packing, padded_lkv_dim),
129
+ )
130
+ self.assertEqual(
131
+ expeceted_updated_k_pe.shape,
132
+ (total_num_pages, page_size // packing, packing, padded_r_dim),
124
133
  )
125
134
  self.assertEqual(expected_out.dtype, kv_dtype)
126
- self.assertEqual(expected_updated_kv.dtype, kv_dtype)
135
+ self.assertEqual(expected_updated_kv_c.dtype, kv_dtype)
136
+ self.assertEqual(expeceted_updated_k_pe.dtype, kv_dtype)
127
137
 
128
138
  self.assertAllClose(expected_out, kernel_out, atol=0.2, rtol=0.2)
129
- self.assertAllClose(expected_updated_kv,
130
- kernel_updated_kv,
139
+ self.assertAllClose(expected_updated_kv_c,
140
+ kernel_updated_kv_c,
141
+ atol=0.2,
142
+ rtol=0.2)
143
+ self.assertAllClose(expeceted_updated_k_pe,
144
+ kernel_updated_k_pe,
131
145
  atol=0.2,
132
146
  rtol=0.2)
133
-
134
- def test_update_kv_cache(self):
135
- lkv_dim = 4
136
- r_dim = 4
137
- padded_lkv_dim = align_to(lkv_dim, 128)
138
- padded_r_dim = align_to(r_dim, 128)
139
- kv_dtype = jnp.bfloat16
140
- new_kv_c = jnp.arange(16, dtype=kv_dtype).reshape((4, lkv_dim))
141
- new_k_pe = (jnp.arange(16, dtype=kv_dtype).reshape((4, r_dim)) + 100)
142
- total_num_pages = 2
143
- page_size = 4
144
- cache_kv_shape = mla.get_kv_cache_shape(
145
- total_num_pages,
146
- page_size,
147
- padded_lkv_dim + padded_r_dim,
148
- kv_dtype,
149
- )
150
- cache_kv = jnp.zeros(cache_kv_shape, dtype=kv_dtype)
151
-
152
- # two sequences, first with 3 tokens, second with 1 token
153
- kv_lens = jnp.array([3, 1], dtype=jnp.int32)
154
- # first seq uses page 0, second uses page 1
155
- page_indices = jnp.array([0, -1, 1, -1], dtype=jnp.int32)
156
- # three tokens for first seq, one for second
157
- cu_q_lens = jnp.array([0, 3, 4], dtype=jnp.int32)
158
- distribution = jnp.array([0, 0, 2], dtype=jnp.int32)
159
-
160
- # manually compute the expected cache
161
- padded_new_kv_c = jnp.pad(new_kv_c,
162
- ((0, 0), (0, padded_lkv_dim - lkv_dim)),
163
- constant_values=0)
164
- padded_new_k_pe = jnp.pad(new_k_pe,
165
- ((0, 0), (0, padded_r_dim - r_dim)),
166
- constant_values=0)
167
-
168
- expected_cache = cache_kv
169
- # First sequence
170
- # token 0
171
- page_idx, row, col = 0, 0, 0
172
- expected_cache = expected_cache.at[page_idx, row,
173
- col, :padded_lkv_dim].set(
174
- padded_new_kv_c[0])
175
- expected_cache = expected_cache.at[page_idx, row, col,
176
- padded_lkv_dim:padded_lkv_dim +
177
- padded_r_dim].set(
178
- padded_new_k_pe[0])
179
- # token 1
180
- page_idx, row, col = 0, 0, 1
181
- expected_cache = expected_cache.at[page_idx, row,
182
- col, :padded_lkv_dim].set(
183
- padded_new_kv_c[1])
184
- expected_cache = expected_cache.at[page_idx, row, col,
185
- padded_lkv_dim:padded_lkv_dim +
186
- padded_r_dim].set(
187
- padded_new_k_pe[1])
188
- # token 2
189
- page_idx, row, col = 0, 1, 0
190
- expected_cache = expected_cache.at[page_idx, row,
191
- col, :padded_lkv_dim].set(
192
- padded_new_kv_c[2])
193
- expected_cache = expected_cache.at[page_idx, row, col,
194
- padded_lkv_dim:padded_lkv_dim +
195
- padded_r_dim].set(
196
- padded_new_k_pe[2])
197
-
198
- # Second sequence
199
- # token 0
200
- page_idx, row, col = 1, 0, 0
201
- expected_cache = expected_cache.at[page_idx, row,
202
- col, :padded_lkv_dim].set(
203
- padded_new_kv_c[3])
204
- expected_cache = expected_cache.at[page_idx, row, col,
205
- padded_lkv_dim:padded_lkv_dim +
206
- padded_r_dim].set(
207
- padded_new_k_pe[3])
208
-
209
- updated_cache = mla.update_kv_cache(
210
- new_kv_c,
211
- new_k_pe,
212
- cache_kv,
213
- kv_lens,
214
- page_indices,
215
- cu_q_lens,
216
- distribution,
217
- )
218
-
219
- self.assertAllClose(updated_cache, expected_cache)
220
-
221
- def test_get_kv_cache_shape(self):
222
- total_num_pages = 10
223
- page_size = 16
224
- lkv_dim = 128
225
- kv_dtype = jnp.bfloat16
226
- # The calculation for the expected shape is as follows:
227
- # kv_packing is determined by the dtype, which is 2 for bfloat16.
228
- # The second dimension is page_size / kv_packing = 16 / 2 = 8
229
- # The third dimension is kv_packing = 2
230
- # The fourth dimension is lkv_dim aligned to 128, which is 128
231
- expected_shape = (10, 8, 2, 128)
232
- self.assertEqual(
233
- mla.get_kv_cache_shape(total_num_pages, page_size, lkv_dim,
234
- kv_dtype), expected_shape)
235
147
 
236
148
  def test_ragged_paged_attention_basic(self):
237
149
  dtype = jnp.bfloat16
@@ -112,8 +112,6 @@ class TestQwixQuantizeNnxModel(unittest.TestCase):
112
112
  self.mesh = Mesh(jax.devices(), ('model', ))
113
113
  self.rng = jax.random.PRNGKey(0)
114
114
  self.model = SimpleModel(rngs=nnx.Rngs(0))
115
- self.model.vllm_config = MagicMock()
116
- self.model.vllm_config.model_config.use_mla = False
117
115
 
118
116
  self.qwix_config = [
119
117
  {
@@ -133,7 +131,6 @@ class TestQwixQuantizeNnxModel(unittest.TestCase):
133
131
  """Test that qwix.quantize_model is called with the correct arguments."""
134
132
  quantized_model_mock = MagicMock(spec=nnx.Module)
135
133
  mock_quantize_model.return_value = quantized_model_mock
136
- self.model.vllm_config.sharding_config.total_dp_size = 1
137
134
 
138
135
  with patch(
139
136
  "tpu_inference.models.jax.utils.quantization.quantization_utils.init_logger",