tpu-inference 0.11.1__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tpu-inference might be problematic.

Files changed (168)
  1. tests/__init__.py +0 -0
  2. tests/core/__init__.py +0 -0
  3. tests/core/test_adapters.py +83 -0
  4. tests/core/test_core_tpu.py +523 -0
  5. tests/core/test_disagg_executor.py +60 -0
  6. tests/core/test_disagg_utils.py +53 -0
  7. tests/core/test_init.py +49 -0
  8. tests/kernels/__init__.py +0 -0
  9. tests/kernels/quantized_matmul_kernel_test.py +191 -0
  10. tests/kernels/ragged_kv_cache_update_v2_test.py +234 -0
  11. tests/kernels/ragged_paged_attention_kernel_v2_test.py +400 -0
  12. tests/kernels/ragged_paged_attention_kernel_v3_test.py +504 -0
  13. tests/lora/__init__.py +0 -0
  14. tests/lora/test_lora.py +123 -0
  15. tests/test_base.py +201 -0
  16. tests/test_quantization.py +836 -0
  17. tests/test_tpu_info.py +120 -0
  18. tests/test_utils.py +218 -0
  19. tests/tpu_backend_test.py +59 -0
  20. tpu_inference/__init__.py +30 -0
  21. tpu_inference/adapters/__init__.py +0 -0
  22. tpu_inference/adapters/vllm_adapters.py +42 -0
  23. tpu_inference/adapters/vllm_config_adapters.py +134 -0
  24. tpu_inference/backend.py +69 -0
  25. tpu_inference/core/__init__.py +0 -0
  26. tpu_inference/core/adapters.py +153 -0
  27. tpu_inference/core/core_tpu.py +776 -0
  28. tpu_inference/core/disagg_executor.py +117 -0
  29. tpu_inference/core/disagg_utils.py +51 -0
  30. tpu_inference/di/__init__.py +0 -0
  31. tpu_inference/di/abstracts.py +28 -0
  32. tpu_inference/di/host.py +76 -0
  33. tpu_inference/di/interfaces.py +51 -0
  34. tpu_inference/distributed/__init__.py +0 -0
  35. tpu_inference/distributed/tpu_connector.py +699 -0
  36. tpu_inference/distributed/utils.py +59 -0
  37. tpu_inference/executors/__init__.py +0 -0
  38. tpu_inference/executors/ray_distributed_executor.py +346 -0
  39. tpu_inference/experimental/__init__.py +0 -0
  40. tpu_inference/experimental/llama3_jax_stashed.py +258 -0
  41. tpu_inference/interfaces/__init__.py +0 -0
  42. tpu_inference/interfaces/cache.py +31 -0
  43. tpu_inference/interfaces/config.py +47 -0
  44. tpu_inference/interfaces/config_parts.py +117 -0
  45. tpu_inference/interfaces/engine.py +51 -0
  46. tpu_inference/interfaces/outputs.py +22 -0
  47. tpu_inference/interfaces/params.py +21 -0
  48. tpu_inference/interfaces/platform.py +74 -0
  49. tpu_inference/interfaces/request.py +39 -0
  50. tpu_inference/interfaces/scheduler.py +31 -0
  51. tpu_inference/kernels/__init__.py +0 -0
  52. tpu_inference/kernels/collectives/__init__.py +0 -0
  53. tpu_inference/kernels/collectives/all_gather_matmul.py +735 -0
  54. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +60 -0
  55. tpu_inference/kernels/collectives/util.py +47 -0
  56. tpu_inference/kernels/flash_attention/__init__.py +0 -0
  57. tpu_inference/kernels/flash_attention/kernel.py +772 -0
  58. tpu_inference/kernels/quantized_matmul/__init__.py +0 -0
  59. tpu_inference/kernels/quantized_matmul/kernel.py +395 -0
  60. tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
  61. tpu_inference/kernels/quantized_matmul/util.py +58 -0
  62. tpu_inference/kernels/ragged_paged_attention/__init__.py +0 -0
  63. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +0 -0
  64. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +875 -0
  65. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +287 -0
  66. tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
  67. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +0 -0
  68. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1447 -0
  69. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3834 -0
  70. tpu_inference/kernels/ragged_paged_attention/v3/util.py +47 -0
  71. tpu_inference/layers/__init__.py +0 -0
  72. tpu_inference/layers/common/__init__.py +0 -0
  73. tpu_inference/layers/common/attention_metadata.py +34 -0
  74. tpu_inference/layers/jax/__init__.py +0 -0
  75. tpu_inference/layers/jax/attention/__init__.py +0 -0
  76. tpu_inference/layers/jax/attention/attention.py +254 -0
  77. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +354 -0
  78. tpu_inference/layers/jax/attention/llama4_attention.py +153 -0
  79. tpu_inference/layers/jax/attention_interface.py +356 -0
  80. tpu_inference/layers/jax/base.py +151 -0
  81. tpu_inference/layers/jax/binary_search.py +295 -0
  82. tpu_inference/layers/jax/constants.py +88 -0
  83. tpu_inference/layers/jax/layers.py +301 -0
  84. tpu_inference/layers/jax/misc.py +16 -0
  85. tpu_inference/layers/jax/moe/__init__.py +0 -0
  86. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +608 -0
  87. tpu_inference/layers/jax/moe/moe.py +209 -0
  88. tpu_inference/layers/jax/rope.py +172 -0
  89. tpu_inference/layers/jax/rope_interface.py +214 -0
  90. tpu_inference/layers/jax/sample/__init__.py +0 -0
  91. tpu_inference/layers/jax/sample/rejection_sampler.py +515 -0
  92. tpu_inference/layers/jax/sample/sampling.py +95 -0
  93. tpu_inference/layers/jax/sample/sampling_metadata.py +69 -0
  94. tpu_inference/layers/jax/sharding.py +406 -0
  95. tpu_inference/layers/jax/transformer_block.py +76 -0
  96. tpu_inference/layers/vllm/__init__.py +0 -0
  97. tpu_inference/layers/vllm/attention.py +184 -0
  98. tpu_inference/layers/vllm/fused_moe.py +399 -0
  99. tpu_inference/layers/vllm/linear_common.py +186 -0
  100. tpu_inference/layers/vllm/quantization/__init__.py +34 -0
  101. tpu_inference/layers/vllm/quantization/awq.py +207 -0
  102. tpu_inference/layers/vllm/quantization/common.py +105 -0
  103. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +0 -0
  104. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +121 -0
  105. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +0 -0
  106. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +208 -0
  107. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +136 -0
  108. tpu_inference/layers/vllm/quantization/unquantized.py +263 -0
  109. tpu_inference/layers/vllm/sharding.py +151 -0
  110. tpu_inference/logger.py +10 -0
  111. tpu_inference/lora/__init__.py +0 -0
  112. tpu_inference/lora/torch_lora_ops.py +103 -0
  113. tpu_inference/lora/torch_punica_tpu.py +308 -0
  114. tpu_inference/mock/__init__.py +0 -0
  115. tpu_inference/mock/vllm_config_utils.py +28 -0
  116. tpu_inference/mock/vllm_envs.py +1233 -0
  117. tpu_inference/mock/vllm_logger.py +212 -0
  118. tpu_inference/mock/vllm_logging_utils.py +15 -0
  119. tpu_inference/models/__init__.py +0 -0
  120. tpu_inference/models/common/__init__.py +0 -0
  121. tpu_inference/models/common/model_loader.py +433 -0
  122. tpu_inference/models/jax/__init__.py +0 -0
  123. tpu_inference/models/jax/deepseek_v3.py +868 -0
  124. tpu_inference/models/jax/llama3.py +366 -0
  125. tpu_inference/models/jax/llama4.py +473 -0
  126. tpu_inference/models/jax/llama_eagle3.py +333 -0
  127. tpu_inference/models/jax/phi3.py +376 -0
  128. tpu_inference/models/jax/qwen2.py +375 -0
  129. tpu_inference/models/jax/qwen2_5_vl.py +976 -0
  130. tpu_inference/models/jax/qwen3.py +302 -0
  131. tpu_inference/models/jax/utils/__init__.py +0 -0
  132. tpu_inference/models/jax/utils/file_utils.py +96 -0
  133. tpu_inference/models/jax/utils/multi_modal_utils.py +164 -0
  134. tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
  135. tpu_inference/models/jax/utils/quantization/quantization_utils.py +588 -0
  136. tpu_inference/models/jax/utils/weight_utils.py +510 -0
  137. tpu_inference/models/vllm/__init__.py +0 -0
  138. tpu_inference/models/vllm/vllm_model_wrapper.py +272 -0
  139. tpu_inference/models/vllm/vllm_model_wrapper_context.py +45 -0
  140. tpu_inference/platforms/__init__.py +2 -0
  141. tpu_inference/platforms/tpu_jax.py +257 -0
  142. tpu_inference/runner/__init__.py +0 -0
  143. tpu_inference/runner/block_table_jax.py +122 -0
  144. tpu_inference/runner/compilation_manager.py +672 -0
  145. tpu_inference/runner/input_batch_jax.py +435 -0
  146. tpu_inference/runner/kv_cache.py +119 -0
  147. tpu_inference/runner/kv_cache_manager.py +460 -0
  148. tpu_inference/runner/lora_utils.py +92 -0
  149. tpu_inference/runner/multimodal_manager.py +208 -0
  150. tpu_inference/runner/persistent_batch_manager.py +244 -0
  151. tpu_inference/runner/speculative_decoding_manager.py +250 -0
  152. tpu_inference/runner/structured_decoding_manager.py +89 -0
  153. tpu_inference/runner/tpu_jax_runner.py +771 -0
  154. tpu_inference/runner/utils.py +426 -0
  155. tpu_inference/spec_decode/__init__.py +0 -0
  156. tpu_inference/spec_decode/jax/__init__.py +0 -0
  157. tpu_inference/spec_decode/jax/eagle3.py +334 -0
  158. tpu_inference/tpu_info.py +77 -0
  159. tpu_inference/utils.py +294 -0
  160. tpu_inference/worker/__init__.py +0 -0
  161. tpu_inference/worker/_temporary_vllm_compat.py +129 -0
  162. tpu_inference/worker/base.py +100 -0
  163. tpu_inference/worker/tpu_worker_jax.py +321 -0
  164. tpu_inference-0.11.1.dist-info/METADATA +101 -0
  165. tpu_inference-0.11.1.dist-info/RECORD +168 -0
  166. tpu_inference-0.11.1.dist-info/WHEEL +5 -0
  167. tpu_inference-0.11.1.dist-info/licenses/LICENSE +201 -0
  168. tpu_inference-0.11.1.dist-info/top_level.txt +2 -0
--- /dev/null
+++ b/tpu_inference/kernels/collectives/all_gather_matmul.py
@@ -0,0 +1,735 @@
+# SPDX-License-Identifier: Apache-2.0
+"""All-gather matmul kernel."""
+
+import functools
+
+import jax
+import jax.numpy as jnp
+from jax import lax
+from jax._src import dtypes
+from jax.experimental import pallas as pl
+from jax.experimental.pallas import tpu as pltpu
+
+from tpu_inference.kernels.collectives import (
+    all_gather_matmul_tuned_block_sizes, util)
+
+P = jax.sharding.PartitionSpec
+
+
+def _cdiv(x, y):
+    return (x + y - 1) // y
+
+
+# TODO(chengjiyao): try unrolling the loop instead of using pallas_call grid
+# TODO(chengjiyao): try m tiling
+# TODO(chengjiyao): try using [bm, bk] and [bk, bn] scratches memory shape for
+# large bm
+# TODO(chengjiyao): try splitting to two parts when n_per_device is large:
+# output_0, gathered_x = ag-matmul(x, y_0)
+# output_1 = matmul(gathered_x, y_1)
+# output = concat(output_0, output_1)
+# TODO(chengjiyao): investigate the register spilling
+def _all_gather_kernel(
+    # Inputs
+    x_hbm_ref,  # [m_per_device, k]
+    y_hbm_ref,  # [k, n_per_device]
+    # Outputs
+    o_hbm_ref,  # [m, n_per_device]
+    x_hbm_scratch_ref,  # [num_devices - 1, m_per_device, k]
+    # Scratches
+    x_local_copy_sem,  # []
+    y_local_copy_sem,  # []
+    o_local_copy_sem,  # []
+    send_sems,  # [2, num_devices - 1] for left and right
+    recv_sems,  # [2, num_devices - 1] for left and right
+    x_vmem_scratch_ref,  # [2, m_per_device, k]
+    y_vmem_scratch_ref,  # [k, n_per_device]
+    o_vmem_scratch_ref,  # [2, m_per_device, bn]
+    acc_vmem_scratch_ref,  # [m_per_device, bn] of jnp.float32
+    axis_name: str,
+    bn: int,
+    bk: int,
+    debug_mode=False,
+    rhs_transpose: bool = False,
+):
+    """Pallas kernel for the fused all-gather matmul.
+
+    Args:
+      x_hbm_ref: LHS of the matmul before all-gather.
+      y_hbm_ref: RHS of the matmul.
+      o_hbm_ref: Output of the matmul.
+      x_hbm_scratch_ref: Scratch memory for LHS of the matmul.
+      x_local_copy_sem: DMA semaphore for a local HBM-VMEM copy.
+      y_local_copy_sem: DMA semaphore for a local HBM-VMEM copy.
+      o_local_copy_sem: DMA semaphore for a local HBM-VMEM copy.
+      send_sems: DMA semaphores for the remote sends (left and right).
+      recv_sems: DMA semaphores for the remote receives (left and right).
+      x_vmem_scratch_ref: Scratch memory for LHS of the matmul.
+      y_vmem_scratch_ref: Scratch memory for RHS of the matmul.
+      o_vmem_scratch_ref: Scratch memory for output of the matmul.
+      acc_vmem_scratch_ref: float32 accumulator scratch used when k is blocked.
+    """
+    num_devices = pl.num_programs(0) - 2
+    grid_n = pl.num_programs(1)
+    grid_k = pl.num_programs(2)
+    outer_step = pl.program_id(0)
+    bn_i = pl.program_id(1)
+    bk_i = pl.program_id(2)
+    global_step_id = outer_step * grid_n * grid_k + bn_i * grid_k + bk_i
+    mxu_total_steps = num_devices * grid_n * grid_k
+    gn_by_gk = grid_n * grid_k
+    my_id = lax.axis_index(axis_name)
+    left_neighbor = lax.rem(my_id + num_devices - 1, jnp.int32(num_devices))
+    right_neighbor = lax.rem(my_id + 1, jnp.int32(num_devices))
+    x_hbm_receiving_slot = outer_step
+    x_hbm_working_slot = outer_step - 1
+    x_vmem_receiving_slot = outer_step % 2
+    x_vmem_working_slot = (global_step_id - 1) // gn_by_gk % 2
+    o_receiving_slot = lax.rem((global_step_id + grid_k - 1) // grid_k, 2)
+    o_working_slot = 1 - o_receiving_slot
+    m_per_device, _ = x_hbm_ref.shape
+    m_per_device_per_direction = m_per_device // 2
+
+    def debug_print(msg, *args):
+        if debug_mode:
+
+            @pl.when(my_id == 0)
+            def _debug_print():
+                pl.debug_print(msg, *args)
+
+    def _start_or_wait_copy(
+        op: jax._src.pallas.mosaic.primitives.AsyncCopyDescriptor,
+        wait: bool = False,
+    ):
+        if wait:
+            op.wait()
+        else:
+            op.start()
+
+    def _do_first_x_local_copy(wait: bool = False):
+        debug_print(
+            "[AGMM debug, wait={}] do first x local copy, x_vmem_receiving_slot={},"
+            " bk_i={}",
+            int(wait),
+            x_vmem_receiving_slot,
+            bk_i,
+        )
+        k_slice = pl.ds(bk_i * bk, bk)
+        x_local_copy_op = pltpu.make_async_copy(
+            src_ref=x_hbm_ref.at[:, k_slice],
+            dst_ref=x_vmem_scratch_ref.at[x_vmem_receiving_slot, :, k_slice],
+            sem=x_local_copy_sem,
+        )
+        _start_or_wait_copy(x_local_copy_op, wait)
+
+    def _do_subsequent_x_left_local_copy(wait: bool = False):
+        debug_print(
+            "[AGMM debug, wait={}] do subsequent x left local copy,"
+            " x_hbm_working_slot={}, x_vmem_receiving_slot={}, bk_i={}",
+            int(wait),
+            x_hbm_working_slot,
+            x_vmem_receiving_slot,
+            bk_i,
+        )
+        k_slice = pl.ds(bk_i * bk, bk)
+        x_local_copy_op = pltpu.make_async_copy(
+            src_ref=x_hbm_scratch_ref.at[
+                x_hbm_working_slot,
+                :m_per_device_per_direction,
+                k_slice,
+            ],
+            dst_ref=x_vmem_scratch_ref.at[
+                x_vmem_receiving_slot,
+                :m_per_device_per_direction,
+                k_slice,
+            ],
+            sem=x_local_copy_sem,
+        )
+        _start_or_wait_copy(x_local_copy_op, wait)
+
+    def _do_subsequent_x_right_local_copy(wait: bool = False):
+        debug_print(
+            "[AGMM debug, wait={}] do subsequent x right local copy,"
+            " x_hbm_working_slot={}, x_vmem_receiving_slot={}, bk_i={}",
+            int(wait),
+            x_hbm_working_slot,
+            x_vmem_receiving_slot,
+            bk_i,
+        )
+        x_local_copy_op = pltpu.make_async_copy(
+            src_ref=x_hbm_scratch_ref.at[
+                x_hbm_working_slot,
+                m_per_device_per_direction:,
+                pl.ds(bk_i * bk, bk),
+            ],
+            dst_ref=x_vmem_scratch_ref.at[
+                x_vmem_receiving_slot,
+                m_per_device_per_direction:,
+                pl.ds(bk_i * bk, bk),
+            ],
+            sem=x_local_copy_sem,
+        )
+        _start_or_wait_copy(x_local_copy_op, wait)
+
+    def _do_y_local_copy(wait: bool = False):
+        debug_print(
+            "[AGMM debug, wait={}] do y local copy, bk_i={}, bn_i={}",
+            int(wait),
+            bk_i,
+            bn_i,
+        )
+        k_slice = pl.ds(bk_i * bk, bk)
+        n_slice = pl.ds(bn_i * bn, bn)
+        if rhs_transpose:
+            y_local_copy_op = pltpu.make_async_copy(
+                src_ref=y_hbm_ref.at[n_slice, k_slice],
+                dst_ref=y_vmem_scratch_ref.at[n_slice, k_slice],
+                sem=y_local_copy_sem,
+            )
+        else:
+            y_local_copy_op = pltpu.make_async_copy(
+                src_ref=y_hbm_ref.at[k_slice, n_slice],
+                dst_ref=y_vmem_scratch_ref.at[k_slice, n_slice],
+                sem=y_local_copy_sem,
+            )
+        _start_or_wait_copy(y_local_copy_op, wait)
+
+    def _do_first_left_remote_copy(wait: bool = False):
+        debug_print(
+            "[AGMM debug, wait={}] do first left remote copy,"
+            " x_hbm_receiving_slot={}, x_hbm_working_slot={}",
+            int(wait),
+            x_hbm_receiving_slot,
+            x_hbm_working_slot,
+        )
+        left_remote_copy_op = pltpu.make_async_remote_copy(
+            src_ref=x_hbm_ref.at[0:m_per_device_per_direction],
+            dst_ref=x_hbm_scratch_ref.at[x_hbm_receiving_slot,
+                                         0:m_per_device_per_direction],
+            send_sem=send_sems.at[0, outer_step],
+            recv_sem=recv_sems.at[0, outer_step],
+            device_id=(left_neighbor, ),
+            device_id_type=pltpu.DeviceIdType.MESH,
+        )
+        _start_or_wait_copy(left_remote_copy_op, wait)
+
+    def _do_first_right_remote_copy(wait: bool = False):
+        debug_print(
+            "[AGMM debug, wait={}] do first right remote copy,"
+            " x_hbm_receiving_slot={}, x_hbm_working_slot={}",
+            int(wait),
+            x_hbm_receiving_slot,
+            x_hbm_working_slot,
+        )
+        right_remote_copy_op = pltpu.make_async_remote_copy(
+            src_ref=x_hbm_ref.at[m_per_device_per_direction:m_per_device],
+            dst_ref=x_hbm_scratch_ref.at[
+                x_hbm_receiving_slot, m_per_device_per_direction:m_per_device],
+            send_sem=send_sems.at[1, outer_step],
+            recv_sem=recv_sems.at[1, outer_step],
+            device_id=(right_neighbor, ),
+            device_id_type=pltpu.DeviceIdType.MESH,
+        )
+        _start_or_wait_copy(right_remote_copy_op, wait)
+
+    def _do_subsequent_left_remote_copy(wait: bool = False):
+        debug_print(
+            "[AGMM debug, wait={}] do subsequent left remote copy,"
+            " x_hbm_receiving_slot={}, x_hbm_working_slot={}",
+            int(wait),
+            x_hbm_receiving_slot,
+            x_hbm_working_slot,
+        )
+        left_remote_copy_op = pltpu.make_async_remote_copy(
+            src_ref=x_hbm_scratch_ref.at[x_hbm_working_slot,
+                                         0:m_per_device_per_direction],
+            dst_ref=x_hbm_scratch_ref.at[x_hbm_receiving_slot,
+                                         0:m_per_device_per_direction],
+            send_sem=send_sems.at[0, outer_step],
+            recv_sem=recv_sems.at[0, outer_step],
+            device_id=(left_neighbor, ),
+            device_id_type=pltpu.DeviceIdType.MESH,
+        )
+        _start_or_wait_copy(left_remote_copy_op, wait)
+
+    def _do_subsequent_right_remote_copy(wait: bool = False):
+        debug_print(
+            "[AGMM debug, wait={}] do subsequent right remote copy,"
+            " x_hbm_receiving_slot={}, x_hbm_working_slot={}",
+            int(wait),
+            x_hbm_receiving_slot,
+            x_hbm_working_slot,
+        )
+        right_remote_copy_op = pltpu.make_async_remote_copy(
+            src_ref=x_hbm_scratch_ref.at[
+                x_hbm_working_slot, m_per_device_per_direction:m_per_device],
+            dst_ref=x_hbm_scratch_ref.at[
+                x_hbm_receiving_slot, m_per_device_per_direction:m_per_device],
+            send_sem=send_sems.at[1, outer_step],
+            recv_sem=recv_sems.at[1, outer_step],
+            device_id=(right_neighbor, ),
+            device_id_type=pltpu.DeviceIdType.MESH,
+        )
+        _start_or_wait_copy(right_remote_copy_op, wait)
+
+    def _do_mxu():
+        working_global_step_id = global_step_id - 1
+        working_bk_i = working_global_step_id % grid_k
+        working_bn_i = working_global_step_id % gn_by_gk // grid_k
+        debug_print(
+            "[AGMM debug] do mxu, x_vmem_working_slot={}, o_receiving_slot={},"
+            " working_bk_i={}, working_bn_i={}",
+            x_vmem_working_slot,
+            o_receiving_slot,
+            working_bk_i,
+            working_bn_i,
+        )
+        k_slice = pl.ds(working_bk_i * bk, bk)
+        n_slice = pl.ds(working_bn_i * bn, bn)
+
+        if grid_k == 1:
+            if rhs_transpose:
+                lhs = x_vmem_scratch_ref.at[x_vmem_working_slot][...]
+                rhs = y_vmem_scratch_ref.at[n_slice, :][...]
+                o_vmem_scratch_ref.at[o_receiving_slot][...] = lax.dot_general(
+                    lhs,
+                    rhs,
+                    dimension_numbers=(((1, ), (1, )), ((), ())),
+                    preferred_element_type=jnp.float32,
+                ).astype(x_vmem_scratch_ref.dtype)
+            else:
+                o_vmem_scratch_ref.at[o_receiving_slot][...] = jnp.dot(
+                    x_vmem_scratch_ref.at[x_vmem_working_slot][...],
+                    y_vmem_scratch_ref.at[:, n_slice][...],
+                    preferred_element_type=jnp.float32,
+                ).astype(x_vmem_scratch_ref.dtype)
+        else:
+            # TODO(chengjiyao): optimize the vstore
+            if rhs_transpose:
+                lhs = x_vmem_scratch_ref.at[x_vmem_working_slot, :,
+                                            k_slice][...]
+                rhs = y_vmem_scratch_ref.at[n_slice, k_slice][...]
+                acc_vmem_scratch_ref[...] += lax.dot_general(
+                    lhs,
+                    rhs,
+                    dimension_numbers=(((1, ), (1, )), ((), ())),
+                    preferred_element_type=jnp.float32,
+                )
+            else:
+                acc_vmem_scratch_ref[...] += jnp.dot(
+                    x_vmem_scratch_ref.at[x_vmem_working_slot, :,
+                                          k_slice][...],
+                    y_vmem_scratch_ref.at[k_slice, n_slice][...],
+                    preferred_element_type=jnp.float32,
+                )
+
+            @pl.when(working_bk_i == grid_k - 1)
+            def _update():
+                debug_print(
+                    "[AGMM debug] update, o_receiving_slot={}",
+                    o_receiving_slot,
+                )
+                o_vmem_scratch_ref.at[o_receiving_slot][
+                    ...] = acc_vmem_scratch_ref[...].astype(
+                        x_vmem_scratch_ref.dtype)
+                # TODO(chengjiyao): based on kyuyeunk's suggestion, this logic
+                # can be optimized further. Right now it does the following:
+                #   1. the dot result is added into acc_vmem_scratch_ref
+                #      (load, add, store).
+                #   2. on the last k block, acc_vmem_scratch_ref is loaded
+                #      again and stored into o_vmem_scratch_ref.
+                #   3. acc_vmem_scratch_ref is then zero-initialized and
+                #      stored once more.
+                # A better way would be:
+                #   1. perform the dot.
+                #   2. if working_bk_i != 0, load acc_vmem_scratch_ref and add
+                #      the result from the previous step; otherwise skip this.
+                #   3. if working_bk_i == grid_k - 1, store the result of step
+                #      2 into o_vmem_scratch_ref; otherwise store it into
+                #      acc_vmem_scratch_ref.
+                acc_vmem_scratch_ref[...] = jnp.zeros_like(
+                    acc_vmem_scratch_ref)
+
+    def _do_o_local_copy(wait: bool = False):
+        working_global_step_id = global_step_id - grid_k - 1
+        working_bn_i = (working_global_step_id % gn_by_gk) // grid_k
+        n_slice = pl.ds(working_bn_i * bn, bn)
+        offset = (global_step_id - 2) // gn_by_gk
+        left_o_idx = (my_id + offset) % num_devices
+        left_o_idx = left_o_idx * 2
+        right_o_idx = (my_id - offset + num_devices) % num_devices
+        right_o_idx = right_o_idx * 2 + 1
+        debug_print(
+            "[AGMM debug, wait={}] do o local copy, o_working_slot={},"
+            " left_o_idx={}, right_o_idx={}, working_bn_i={}",
+            int(wait),
+            o_working_slot,
+            left_o_idx,
+            right_o_idx,
+            working_bn_i,
+        )
+        o_left_local_copy_op = pltpu.make_async_copy(
+            src_ref=o_vmem_scratch_ref.at[
+                o_working_slot, :m_per_device_per_direction],
+            dst_ref=o_hbm_ref.at[
+                pl.ds(
+                    m_per_device_per_direction * left_o_idx,
+                    m_per_device_per_direction,
+                ),
+                n_slice,
+            ],
+            sem=o_local_copy_sem,
+        )
+        o_right_local_copy_op = pltpu.make_async_copy(
+            src_ref=o_vmem_scratch_ref.at[o_working_slot,
+                                          m_per_device_per_direction:],
+            dst_ref=o_hbm_ref.at[
+                pl.ds(
+                    m_per_device_per_direction * right_o_idx,
+                    m_per_device_per_direction,
+                ),
+                n_slice,
+            ],
+            sem=o_local_copy_sem,
+        )
+        _start_or_wait_copy(o_left_local_copy_op, wait)
+        _start_or_wait_copy(o_right_local_copy_op, wait)
+
+    ### ------- Kernel start ------- ###
+    # TODO(chengjiyao): explore a fine-grained way to do the waits and signal
+
+    debug_print(
+        "===== starting a grid, outer_step={}, bn_i={}, bk_i={} =====",
+        outer_step,
+        bn_i,
+        bk_i,
+    )
+
+    @pl.when(global_step_id == 0)
+    @jax.named_scope("_start_first_remote_copy")
+    def _start_first_remote_copy():
+        if grid_k > 1:
+            acc_vmem_scratch_ref[...] = jnp.zeros_like(acc_vmem_scratch_ref)
+        # Barrier with both neighbors at the start, since we will be
+        # communicating with both.
+        util.local_barrier(left_neighbor, right_neighbor)
+        _do_first_left_remote_copy(wait=False)
+        _do_first_right_remote_copy(wait=False)
+
+    cond_start_subsequent_remote_copy = jnp.logical_and(
+        jnp.logical_and(outer_step > 0, outer_step < num_devices - 1),
+        global_step_id % gn_by_gk == 0,
+    )
+
+    @pl.when(cond_start_subsequent_remote_copy)
+    @jax.named_scope("_start_subsequent_remote_copy")
+    def _start_subsequent_remote_copy():
+        _do_subsequent_left_remote_copy(wait=False)
+        _do_subsequent_right_remote_copy(wait=False)
+
+    @pl.when(jnp.logical_and(outer_step == 0, bn_i == 0))
+    @jax.named_scope("_start_first_local_x_copy")
+    def _start_first_x_local_copy():
+        _do_first_x_local_copy(wait=False)
+
+    cond_subsequent_x_local_copy = jnp.logical_and(
+        jnp.logical_and(outer_step > 0, outer_step < num_devices), bn_i == 0)
+
+    @pl.when(cond_subsequent_x_local_copy)
+    @jax.named_scope("_start_subsequent_x_local_copy")
+    def _start_subsequent_x_local_copy():
+        _do_subsequent_x_left_local_copy(wait=False)
+        _do_subsequent_x_right_local_copy(wait=False)
+
+    @pl.when(outer_step == 0)
+    @jax.named_scope("_start_y_local_copy")
+    def _start_y_local_copy():
+        _do_y_local_copy(wait=False)
+
+    def _get_start_o_local_copy_cond():
+        if grid_k == 1:
+            return jnp.logical_and(global_step_id >= 2, global_step_id
+                                   < mxu_total_steps + 2)
+        else:
+            return jnp.logical_and(
+                jnp.logical_and(
+                    global_step_id >= grid_k + 1,
+                    global_step_id < mxu_total_steps + grid_k + 1,
+                ),
+                global_step_id % grid_k == 1,
+            )
+
+    @pl.when(_get_start_o_local_copy_cond())
+    @jax.named_scope("_start_o_local_copy")
+    def _start_o_local_copy():
+        _do_o_local_copy(wait=False)
+
+    @pl.when(
+        jnp.logical_and(global_step_id >= 1, global_step_id
+                        < 1 + mxu_total_steps))
+    @jax.named_scope("_mxu")
+    def _mxu():
+        _do_mxu()
+
+    def _get_wait_o_local_copy_cond():
+        if grid_k == 1:
+            return jnp.logical_and(global_step_id >= 2, global_step_id
+                                   < mxu_total_steps + 2)
+        else:
+            return jnp.logical_and(
+                jnp.logical_and(
+                    global_step_id >= grid_k + 1,
+                    global_step_id < mxu_total_steps + grid_k + 1,
+                ),
+                global_step_id % grid_k == 0,
+            )
+
+    @pl.when(_get_wait_o_local_copy_cond())
+    @jax.named_scope("_wait_o_local_copy")
+    def _wait_o_local_copy():
+        _do_o_local_copy(wait=True)
+
+    @pl.when(outer_step == 0)
+    @jax.named_scope("_wait_y_local_copy")
+    def _wait_y_local_copy():
+        _do_y_local_copy(wait=True)
+
+    @pl.when(jnp.logical_and(outer_step == 0, bn_i == 0))
+    @jax.named_scope("_wait_first_x_local_copy")
+    def _wait_first_x_local_copy():
+        _do_first_x_local_copy(wait=True)
+
+    @pl.when(cond_subsequent_x_local_copy)
+    @jax.named_scope("_wait_subsequent_x_local_copy")
+    def _wait_subsequent_x_local_copy():
+        _do_subsequent_x_left_local_copy(wait=True)
+        _do_subsequent_x_right_local_copy(wait=True)
+
+    @pl.when(global_step_id == gn_by_gk - 1)
+    @jax.named_scope("_wait_first_remote_copy")
+    def _wait_first_remote_copy():
+        _do_first_left_remote_copy(wait=True)
+        _do_first_right_remote_copy(wait=True)
+
+    cond_wait_subsequent_remote_copy = jnp.logical_and(
+        jnp.logical_and(outer_step > 0, outer_step < num_devices - 1),
+        global_step_id % gn_by_gk == gn_by_gk - 1,
+    )
+
+    @pl.when(cond_wait_subsequent_remote_copy)
+    @jax.named_scope("_wait_subsequent_remote_copy")
+    def _wait_subsequent_remote_copy():
+        _do_subsequent_left_remote_copy(wait=True)
+        _do_subsequent_right_remote_copy(wait=True)
+
+    ### ------- Kernel end ------- ###
+
+
+# FIXME(chengjiyao): make it accurate for the cases of quantization
+def get_vmem_estimate_bytes(
+    m,
+    n,
+    k,
+    bn,
+    acc_bytes,
+    tp_size,
+    x_dtype,
+    y_dtype,
+    out_dtype,
+):
+    """Returns the total vmem bytes used by the kernel."""
+    m_per_device = m // tp_size
+    n_per_device = n // tp_size
+    y_vmem_bytes = n_per_device * k * dtypes.bit_width(y_dtype) // 8
+    total_bytes = (
+        2 * m_per_device * k * dtypes.bit_width(x_dtype) //
+        8  # x_vmem_scratch_ref
+        + y_vmem_bytes  # y_vmem_scratch_ref
+        + 2 * m * bn * dtypes.bit_width(out_dtype) // 8  # o_vmem_scratch_ref
+        + acc_bytes  # acc_vmem_scratch_ref, jnp.float32
+    )
+    return total_bytes
+
+
+def validate_inputs(x, y, tp_size, rhs_transpose=False):
+    """Validates the inputs to the all_gather_matmul kernel."""
+    if x.ndim != 2 or y.ndim != 2:
+        raise ValueError(
+            f"Inputs must be 2D, got shapes {x.shape} and {y.shape}.")
+    if x.dtype != y.dtype:
+        raise ValueError(
+            f"Input dtypes must match, got {x.dtype} and {y.dtype}.")
+    m, k = x.shape
+    if rhs_transpose:
+        n, k_from_y = y.shape
+    else:
+        k_from_y, n = y.shape
+    if k != k_from_y:
+        raise ValueError(
+            "Incompatible shapes for matmul: contracting dimension mismatch:"
+            f" {x.shape} and {y.shape}.")
+
+    if k % 128 != 0:
+        raise ValueError(f"k ({k}) must be divisible by 128.")
+
+    if n % 128 != 0:
+        raise ValueError(f"n ({n}) must be divisible by 128.")
+
+    m_per_device_per_direction = m // tp_size // 2
+    if m_per_device_per_direction % 8 != 0:
+        raise ValueError(f"m ({m}) must be divisible by {tp_size * 2 * 8}.")
+
+    if m % (tp_size * 2) != 0:
+        raise ValueError(
+            f"x.shape[0] ({m}) must be divisible by tp_size * 2 ({tp_size * 2})."
+        )
+    if n % tp_size != 0:
+        raise ValueError(
+            f"y.shape[{0 if rhs_transpose else 1}] ({n}) must be divisible by"
+            f" tp_size ({tp_size}).")
+
+
+def all_gather_matmul(
+    x: jax.Array,
+    y: jax.Array,
+    mesh: jax.sharding.AbstractMesh,
+    axis_name: str,
+    collective_id: int | None = 0,
+    bn: int | None = None,
+    bk: int | None = None,
+    rhs_transpose: bool = False,
+):
+    """Performs all-gather on the input tensor and then a matmul.
+
+    Args:
+      x: LHS of the matmul before all-gather.
+      y: RHS of the matmul.
+      mesh: JAX mesh.
+      axis_name: Name of the axis to all-gather over.
+      collective_id: An integer used for barrier semaphore allocation.
+      bn: Block size in the n dimension. If None, a tuned value is used.
+      bk: Block size in the k dimension. If None, a tuned value is used.
+      rhs_transpose: If True, y is passed transposed, i.e. with shape [n, k].
+
+    Returns:
+      all-gather(x, axis=0) @ y
+    """
+    tp_size = mesh.shape[axis_name]
+    validate_inputs(x, y, tp_size, rhs_transpose)
+    m, k = x.shape
+    if rhs_transpose:
+        n, _ = y.shape
+        y_in_spec = P(axis_name, None)
+    else:
+        _, n = y.shape
+        y_in_spec = P(None, axis_name)
+    m_per_device = m // tp_size
+    n_per_device = n // tp_size
+    tuned_bn, tuned_bk = (
+        all_gather_matmul_tuned_block_sizes.get_tuned_block_sizes(
+            m, n, k,
+            jnp.dtype(x.dtype).name, tp_size))
+    if bn is None:
+        bn = tuned_bn if tuned_bn is not None else n
+    if bk is None:
+        bk = tuned_bk if tuned_bk is not None else k
+    grid_n = _cdiv(n_per_device, bn)
+    grid_k = _cdiv(k, bk)
+    acc_shape = (m_per_device, bn)
+    # NOTE(chengjiyao): acc buffer is not used in the grid_k == 1 case.
+    if grid_k == 1:
+        acc_shape = (8, 128)
+    acc_bytes = acc_shape[0] * acc_shape[1] * dtypes.bit_width(
+        jnp.float32) // 8
+    y_vmem_shape = (n_per_device, k) if rhs_transpose else (k, n_per_device)
+    estimated_vmem_bytes = get_vmem_estimate_bytes(
+        m,
+        n,
+        k,
+        bn,
+        acc_bytes,
+        tp_size,
+        x.dtype,
+        y.dtype,
+        x.dtype,
+    )
+    out_shape = [
+        jax.ShapeDtypeStruct((m, n_per_device), x.dtype),  # output
+        jax.ShapeDtypeStruct((tp_size - 1, m_per_device, k),
+                             x.dtype),  # x HBM scratch
+    ]
+    grid_spec = pltpu.PrefetchScalarGridSpec(
+        num_scalar_prefetch=0,
+        in_specs=[
+            pl.BlockSpec(memory_space=pltpu.MemorySpace.HBM),
+            pl.BlockSpec(memory_space=pltpu.MemorySpace.HBM),
+        ],
+        out_specs=[
+            pl.BlockSpec(memory_space=pltpu.MemorySpace.HBM),
+            pl.BlockSpec(memory_space=pltpu.MemorySpace.HBM),
+        ],
+        scratch_shapes=(
+            pltpu.SemaphoreType.DMA,  # x_local_copy_sem
+            pltpu.SemaphoreType.DMA,  # y_local_copy_sem
+            pltpu.SemaphoreType.DMA,  # o_local_copy_sem
+            pltpu.SemaphoreType.DMA(
+                (2, tp_size - 1)),  # left and right send semaphores
+            pltpu.SemaphoreType.DMA((
+                2,
+                tp_size - 1,
+            )),  # left and right recv semaphores
+            pltpu.VMEM((2, m_per_device, k), x.dtype),  # x vmem scratch
+            pltpu.VMEM(y_vmem_shape, y.dtype),  # y vmem scratch
+            pltpu.VMEM((2, m_per_device, bn), x.dtype),  # output vmem scratch
+            pltpu.VMEM(acc_shape, jnp.float32),  # acc vmem scratch
+        ),
+        grid=(tp_size + 2, grid_n, grid_k),
+    )
+    flops = 2 * m * k * n_per_device
+    bytes_accessed = x.dtype.itemsize * (m * k + k * n_per_device +
+                                         m * n_per_device)
+    cost_estimate = pl.CostEstimate(flops=flops,
+                                    bytes_accessed=bytes_accessed,
+                                    transcendentals=0)
+
+    @functools.partial(jax.jit, static_argnames=["bn", "bk", "rhs_transpose"])
+    def _all_gather_matmul_call(x, y, bn, bk, rhs_transpose):
+        return pl.pallas_call(
+            functools.partial(
+                _all_gather_kernel,
+                bn=bn,
+                bk=bk,
+                axis_name=axis_name,
+                rhs_transpose=rhs_transpose,
+            ),
+            out_shape=out_shape,
+            grid_spec=grid_spec,
+            compiler_params=pltpu.CompilerParams(
+                collective_id=collective_id,
+                vmem_limit_bytes=estimated_vmem_bytes + 8 * 1024 * 1024,
+            ),
+            cost_estimate=cost_estimate,
+            name=get_kernel_name(bn, bk, rhs_transpose),
+        )(x, y)[0]
+
+    shard_map_kernel = jax.jit(
+        jax.shard_map(
+            functools.partial(
+                _all_gather_matmul_call,
+                bn=bn,
+                bk=bk,
+                rhs_transpose=rhs_transpose,
+            ),
+            mesh=mesh,
+            in_specs=(P(axis_name, None), y_in_spec),
+            out_specs=P(None, axis_name),
+            check_vma=False,
+        ), )
+
+    return shard_map_kernel(x, y)
+
+
+def get_kernel_name(bn: int, bk: int, rhs_transpose: bool):
+    return (
+        f"all_gather_matmul_kernel_bn_{bn}_bk_{bk}_rhs_transpose_{rhs_transpose}"
+    )