PyPI - tpu-inference - Versions diffs - 0.0.1rc1__py3-none-any.whl - Mend

tpu-inference 0.0.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tpu-inference might be problematic. Click here for more details.

Files changed (174) hide show

tests/__init__.py +0 -0
tests/core/__init__.py +0 -0
tests/core/test_core_tpu.py +513 -0
tests/core/test_disagg_executor.py +60 -0
tests/core/test_disagg_utils.py +53 -0
tests/core/test_dp_scheduler.py +899 -0
tests/core/test_init.py +49 -0
tests/kernels/__init__.py +0 -0
tests/kernels/fused_moe_v1_test.py +374 -0
tests/kernels/mla_v1_test.py +396 -0
tests/kernels/quantized_matmul_kernel_test.py +191 -0
tests/kernels/ragged_kv_cache_update_v2_test.py +234 -0
tests/kernels/ragged_paged_attention_kernel_v2_test.py +400 -0
tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +549 -0
tests/kernels/ragged_paged_attention_kernel_v3_test.py +504 -0
tests/lora/__init__.py +0 -0
tests/lora/conftest.py +32 -0
tests/lora/test_bgmv.py +43 -0
tests/lora/test_layers.py +648 -0
tests/lora/test_lora.py +133 -0
tests/lora/utils.py +88 -0
tests/test_base.py +201 -0
tests/test_envs.py +203 -0
tests/test_quantization.py +836 -0
tests/test_tpu_info.py +120 -0
tests/test_utils.py +235 -0
tpu_inference/__init__.py +53 -0
tpu_inference/core/__init__.py +0 -0
tpu_inference/core/core_tpu.py +786 -0
tpu_inference/core/disagg_executor.py +118 -0
tpu_inference/core/disagg_utils.py +49 -0
tpu_inference/core/sched/__init__.py +0 -0
tpu_inference/core/sched/dp_scheduler.py +523 -0
tpu_inference/distributed/__init__.py +0 -0
tpu_inference/distributed/jax_parallel_state.py +67 -0
tpu_inference/distributed/tpu_connector.py +727 -0
tpu_inference/distributed/utils.py +60 -0
tpu_inference/env_override.py +9 -0
tpu_inference/envs.py +160 -0
tpu_inference/executors/__init__.py +0 -0
tpu_inference/executors/ray_distributed_executor.py +382 -0
tpu_inference/experimental/__init__.py +0 -0
tpu_inference/experimental/llama3_jax_stashed.py +258 -0
tpu_inference/kernels/__init__.py +0 -0
tpu_inference/kernels/collectives/__init__.py +0 -0
tpu_inference/kernels/collectives/all_gather_matmul.py +735 -0
tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +60 -0
tpu_inference/kernels/collectives/util.py +47 -0
tpu_inference/kernels/flash_attention/__init__.py +0 -0
tpu_inference/kernels/flash_attention/kernel.py +772 -0
tpu_inference/kernels/fused_moe/__init__.py +0 -0
tpu_inference/kernels/fused_moe/v1/__init__.py +0 -0
tpu_inference/kernels/fused_moe/v1/kernel.py +1566 -0
tpu_inference/kernels/mla/__init__.py +0 -0
tpu_inference/kernels/mla/v1/__init__.py +0 -0
tpu_inference/kernels/mla/v1/kernel.py +1349 -0
tpu_inference/kernels/quantized_matmul/__init__.py +0 -0
tpu_inference/kernels/quantized_matmul/kernel.py +395 -0
tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
tpu_inference/kernels/quantized_matmul/util.py +58 -0
tpu_inference/kernels/ragged_paged_attention/__init__.py +0 -0
tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +0 -0
tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +875 -0
tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +287 -0
tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +0 -0
tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1501 -0
tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +1603 -0
tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +4147 -0
tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +367 -0
tpu_inference/kernels/ragged_paged_attention/v3/util.py +51 -0
tpu_inference/layers/__init__.py +0 -0
tpu_inference/layers/common/__init__.py +0 -0
tpu_inference/layers/common/attention_interface.py +396 -0
tpu_inference/layers/common/attention_metadata.py +34 -0
tpu_inference/layers/common/binary_search.py +295 -0
tpu_inference/layers/common/quant_methods.py +8 -0
tpu_inference/layers/common/sharding.py +582 -0
tpu_inference/layers/jax/__init__.py +0 -0
tpu_inference/layers/jax/attention/__init__.py +0 -0
tpu_inference/layers/jax/attention/attention.py +255 -0
tpu_inference/layers/jax/attention/deepseek_v3_attention.py +354 -0
tpu_inference/layers/jax/attention/gpt_oss_attention.py +262 -0
tpu_inference/layers/jax/attention/llama4_attention.py +153 -0
tpu_inference/layers/jax/base.py +151 -0
tpu_inference/layers/jax/constants.py +88 -0
tpu_inference/layers/jax/layers.py +301 -0
tpu_inference/layers/jax/misc.py +16 -0
tpu_inference/layers/jax/moe/__init__.py +0 -0
tpu_inference/layers/jax/moe/deepseek_v3_moe.py +608 -0
tpu_inference/layers/jax/moe/gpt_oss_moe.py +185 -0
tpu_inference/layers/jax/moe/moe.py +209 -0
tpu_inference/layers/jax/rope.py +280 -0
tpu_inference/layers/jax/rope_interface.py +214 -0
tpu_inference/layers/jax/sample/__init__.py +0 -0
tpu_inference/layers/jax/sample/rejection_sampler.py +515 -0
tpu_inference/layers/jax/sample/sampling.py +96 -0
tpu_inference/layers/jax/sample/sampling_metadata.py +76 -0
tpu_inference/layers/jax/transformer_block.py +107 -0
tpu_inference/layers/vllm/__init__.py +0 -0
tpu_inference/layers/vllm/attention.py +221 -0
tpu_inference/layers/vllm/fused_moe.py +469 -0
tpu_inference/layers/vllm/linear_common.py +186 -0
tpu_inference/layers/vllm/quantization/__init__.py +39 -0
tpu_inference/layers/vllm/quantization/awq.py +207 -0
tpu_inference/layers/vllm/quantization/common.py +110 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +0 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +120 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +203 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +0 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +208 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +136 -0
tpu_inference/layers/vllm/quantization/mxfp4.py +331 -0
tpu_inference/layers/vllm/quantization/unquantized.py +368 -0
tpu_inference/layers/vllm/sharding.py +230 -0
tpu_inference/logger.py +10 -0
tpu_inference/lora/__init__.py +0 -0
tpu_inference/lora/torch_lora_ops.py +103 -0
tpu_inference/lora/torch_punica_tpu.py +310 -0
tpu_inference/models/__init__.py +0 -0
tpu_inference/models/common/__init__.py +0 -0
tpu_inference/models/common/model_loader.py +478 -0
tpu_inference/models/jax/__init__.py +0 -0
tpu_inference/models/jax/deepseek_v3.py +868 -0
tpu_inference/models/jax/gpt_oss.py +492 -0
tpu_inference/models/jax/jax_intermediate_tensor.py +79 -0
tpu_inference/models/jax/llama3.py +376 -0
tpu_inference/models/jax/llama4.py +629 -0
tpu_inference/models/jax/llama_eagle3.py +336 -0
tpu_inference/models/jax/llama_guard_4.py +361 -0
tpu_inference/models/jax/qwen2.py +376 -0
tpu_inference/models/jax/qwen2_5_vl.py +1218 -0
tpu_inference/models/jax/qwen3.py +303 -0
tpu_inference/models/jax/utils/__init__.py +0 -0
tpu_inference/models/jax/utils/file_utils.py +96 -0
tpu_inference/models/jax/utils/multi_modal_utils.py +163 -0
tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +5 -0
tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +6 -0
tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +5 -0
tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +6 -0
tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +105 -0
tpu_inference/models/jax/utils/quantization/quantization_utils.py +650 -0
tpu_inference/models/jax/utils/weight_utils.py +584 -0
tpu_inference/models/vllm/__init__.py +0 -0
tpu_inference/models/vllm/vllm_model_wrapper.py +293 -0
tpu_inference/models/vllm/vllm_model_wrapper_context.py +45 -0
tpu_inference/platforms/__init__.py +2 -0
tpu_inference/platforms/tpu_platform.py +275 -0
tpu_inference/runner/__init__.py +0 -0
tpu_inference/runner/block_table.py +122 -0
tpu_inference/runner/compilation_manager.py +865 -0
tpu_inference/runner/input_batch.py +435 -0
tpu_inference/runner/kv_cache.py +132 -0
tpu_inference/runner/kv_cache_manager.py +478 -0
tpu_inference/runner/lora_utils.py +92 -0
tpu_inference/runner/multimodal_manager.py +217 -0
tpu_inference/runner/persistent_batch_manager.py +282 -0
tpu_inference/runner/speculative_decoding_manager.py +248 -0
tpu_inference/runner/structured_decoding_manager.py +87 -0
tpu_inference/runner/tpu_runner.py +1744 -0
tpu_inference/runner/utils.py +426 -0
tpu_inference/spec_decode/__init__.py +0 -0
tpu_inference/spec_decode/jax/__init__.py +0 -0
tpu_inference/spec_decode/jax/eagle3.py +417 -0
tpu_inference/tpu_info.py +78 -0
tpu_inference/utils.py +340 -0
tpu_inference/worker/__init__.py +0 -0
tpu_inference/worker/tpu_worker.py +458 -0
tpu_inference-0.0.1rc1.dist-info/METADATA +108 -0
tpu_inference-0.0.1rc1.dist-info/RECORD +174 -0
tpu_inference-0.0.1rc1.dist-info/WHEEL +5 -0
tpu_inference-0.0.1rc1.dist-info/licenses/LICENSE +201 -0
tpu_inference-0.0.1rc1.dist-info/top_level.txt +2 -0

tpu_inference/distributed/jax_parallel_state.py ADDED Viewed

@@ -0,0 +1,67 @@
+from typing import Any, Optional
+import jax
+from jax.experimental import transfer
+BASE_JAX_PORT = 5000
+class GroupCoordinator:
+    """
+    Jax ProcessGroup wrapper for a group of Pipeline Parallel processes.
+    This is a simplfied version which aligns the APIs with pytorch's
+        GroupdCoordinator in vllm/distributed/parallel_state.py.
+    GroupCoordinator takes charge of the communication operations among
+        the processes in the group. Currently the communication is
+        send/recv intermediate tensor (tensor_dict) between consecutive PP
+        processes.
+    """
+    rank_in_group: int
+    world_size: int
+    transfer_server: Optional[Any]
+    connection: Optional[Any]
+    def __init__(self, rank_in_group: int, world_size: int):
+        self.rank_in_group = rank_in_group
+        self.world_size = world_size
+        self.transfer_server = None
+        self.connection = None
+    def send_tensor_dict(self, uuid: int, tensor_dict: dict[str, jax.Array]):
+        self.transfer_server.await_pull(uuid, tensor_dict)
+    def recv_tensor_dict(self, uuid: int,
+                         tensor_spec: dict[str, jax.ShapeDtypeStruct]):
+        return self.connection.pull(uuid, tensor_spec)
+    @property
+    def is_first_rank(self):
+        return self.rank_in_group == 0
+    @property
+    def is_last_rank(self):
+        return self.rank_in_group == self.world_size - 1
+def init_pp_distributed_environment(ip: str, rank: int, world_size: int,
+                                    device: Any, need_pp: bool):
+    global _PP
+    _PP = GroupCoordinator(rank, world_size)
+    if need_pp:
+        port_number = BASE_JAX_PORT + rank
+        server_address = f"{ip}:{port_number}"
+        transfer_server = transfer.start_transfer_server(
+            device.client, server_address, [f"{ip}:0", f"{ip}:0"])
+        _PP.transfer_server = transfer_server
+def connect(prev_ip: str, prev_rank: int):
+    prev_port_number = BASE_JAX_PORT + prev_rank
+    connection = _PP.transfer_server.connect(f'{prev_ip}:{prev_port_number}')
+    _PP.connection = connection
+def get_pp_group() -> GroupCoordinator:
+    assert _PP is not None, (
+        "pipeline model parallel group is not initialized")
+    return _PP