tpu-inference 0.12.0.dev20251213__py3-none-any.whl → 0.13.2.dev20251230__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/__init__.py +13 -0
- tests/core/__init__.py +13 -0
- tests/core/test_disagg_utils.py +14 -0
- tests/core/test_dp_scheduler.py +650 -768
- tests/core/test_init.py +14 -0
- tests/distributed/__init__.py +13 -0
- tests/distributed/test_distributed_utils.py +120 -0
- tests/distributed/test_tpu_connector.py +478 -0
- tests/e2e/__init__.py +13 -0
- tests/e2e/test_async_scheduler.py +211 -0
- tests/e2e/test_data_parallel.py +289 -0
- tests/e2e/test_hybrid_kvcache.py +219 -0
- tests/e2e/test_local_disagg.py +257 -0
- tests/e2e/test_model_loader.py +268 -0
- tests/e2e/test_multi_modal_inference.py +111 -0
- tests/e2e/test_pipeline_parallel.py +265 -0
- tests/e2e/test_runai_model_streamer_loader.py +104 -0
- tests/e2e/test_sampling_params.py +269 -0
- tests/e2e/test_speculative_decoding.py +311 -0
- tests/e2e/test_structured_decoding.py +46 -0
- tests/executors/__init__.py +13 -0
- tests/executors/test_ray_distributed_executor.py +199 -0
- tests/experimental/__init__.py +13 -0
- tests/experimental/test_llama3_jax_stashed.py +208 -0
- tests/kernels/__init__.py +13 -0
- tests/kernels/collectives/__init__.py +13 -0
- tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
- tests/kernels/fused_moe_v1_test.py +14 -0
- tests/kernels/gmm_test.py +205 -0
- tests/kernels/mla_v1_test.py +14 -0
- tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v3_test.py +14 -0
- tests/layers/__init__.py +13 -0
- tests/layers/common/__init__.py +13 -0
- tests/layers/common/test_attention_interface.py +156 -0
- tests/layers/common/test_quantization.py +149 -0
- tests/layers/jax/__init__.py +13 -0
- tests/layers/jax/attention/__init__.py +13 -0
- tests/layers/jax/attention/test_common_attention.py +103 -0
- tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
- tests/layers/jax/attention/test_llama4_attention.py +135 -0
- tests/layers/jax/moe/__init__.py +13 -0
- tests/layers/jax/moe/test_deepseek_moe.py +235 -0
- tests/layers/jax/sample/__init__.py +13 -0
- tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
- tests/layers/jax/sample/test_sampling.py +115 -0
- tests/layers/jax/sample/test_sampling_metadata.py +254 -0
- tests/layers/jax/test_layers.py +155 -0
- tests/{test_quantization.py → layers/jax/test_qwix.py} +180 -50
- tests/layers/jax/test_rope.py +93 -0
- tests/layers/jax/test_sharding.py +159 -0
- tests/layers/jax/test_transformer_block.py +152 -0
- tests/layers/vllm/__init__.py +13 -0
- tests/layers/vllm/test_attention.py +363 -0
- tests/layers/vllm/test_awq.py +406 -0
- tests/layers/vllm/test_compressed_tensors_moe.py +199 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +441 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +443 -0
- tests/layers/vllm/test_fp8.py +17 -0
- tests/layers/vllm/test_mxfp4.py +320 -0
- tests/layers/vllm/test_unquantized.py +662 -0
- tests/layers/vllm/utils.py +87 -0
- tests/lora/__init__.py +13 -0
- tests/lora/conftest.py +14 -0
- tests/lora/test_bgmv.py +14 -0
- tests/lora/test_layers.py +25 -8
- tests/lora/test_lora.py +15 -1
- tests/lora/test_lora_perf.py +14 -0
- tests/models/__init__.py +13 -0
- tests/models/common/__init__.py +13 -0
- tests/models/common/test_model_loader.py +455 -0
- tests/models/jax/__init__.py +13 -0
- tests/models/jax/test_deepseek_v3.py +401 -0
- tests/models/jax/test_llama3.py +184 -0
- tests/models/jax/test_llama4.py +298 -0
- tests/models/jax/test_llama_eagle3.py +197 -0
- tests/models/jax/test_llama_guard_4.py +242 -0
- tests/models/jax/test_qwen2.py +172 -0
- tests/models/jax/test_qwen2_5_vl.py +605 -0
- tests/models/jax/test_qwen3.py +169 -0
- tests/models/jax/test_weight_loading.py +180 -0
- tests/models/jax/utils/__init__.py +13 -0
- tests/models/jax/utils/test_multi_modal_utils.py +212 -0
- tests/platforms/__init__.py +13 -0
- tests/platforms/test_tpu_platform.py +54 -0
- tests/runner/__init__.py +13 -0
- tests/runner/test_block_table.py +395 -0
- tests/runner/test_input_batch.py +226 -0
- tests/runner/test_kv_cache.py +220 -0
- tests/runner/test_kv_cache_manager.py +498 -0
- tests/runner/test_multimodal_manager.py +429 -0
- tests/runner/test_persistent_batch_manager.py +84 -0
- tests/runner/test_speculative_decoding_manager.py +368 -0
- tests/runner/test_structured_decoding_manager.py +220 -0
- tests/runner/test_tpu_runner.py +261 -0
- tests/runner/test_tpu_runner_dp.py +1099 -0
- tests/runner/test_tpu_runner_mesh.py +200 -0
- tests/runner/test_utils.py +411 -0
- tests/spec_decode/__init__.py +13 -0
- tests/spec_decode/test_eagle3.py +311 -0
- tests/test_base.py +14 -0
- tests/test_tpu_info.py +14 -0
- tests/test_utils.py +1 -43
- tests/worker/__init__.py +13 -0
- tests/worker/tpu_worker_test.py +414 -0
- tpu_inference/__init__.py +14 -0
- tpu_inference/core/__init__.py +13 -0
- tpu_inference/core/sched/__init__.py +13 -0
- tpu_inference/core/sched/dp_scheduler.py +372 -56
- tpu_inference/distributed/__init__.py +13 -0
- tpu_inference/distributed/jax_parallel_state.py +14 -0
- tpu_inference/distributed/tpu_connector.py +14 -9
- tpu_inference/distributed/utils.py +56 -4
- tpu_inference/executors/__init__.py +13 -0
- tpu_inference/executors/ray_distributed_executor.py +20 -3
- tpu_inference/experimental/__init__.py +13 -0
- tpu_inference/experimental/llama3_jax_stashed.py +14 -0
- tpu_inference/kernels/__init__.py +13 -0
- tpu_inference/kernels/collectives/__init__.py +13 -0
- tpu_inference/kernels/flash_attention/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/kernel.py +171 -163
- tpu_inference/kernels/megablox/__init__.py +13 -0
- tpu_inference/kernels/megablox/common.py +54 -0
- tpu_inference/kernels/megablox/gmm.py +646 -0
- tpu_inference/kernels/mla/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/kernel.py +20 -26
- tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +112 -69
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +85 -65
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +374 -194
- tpu_inference/kernels/ragged_paged_attention/v3/util.py +13 -0
- tpu_inference/layers/__init__.py +13 -0
- tpu_inference/layers/common/__init__.py +13 -0
- tpu_inference/layers/common/attention_interface.py +26 -19
- tpu_inference/layers/common/attention_metadata.py +14 -0
- tpu_inference/layers/common/fused_moe_gmm.py +506 -0
- tpu_inference/layers/common/quant_methods.py +15 -0
- tpu_inference/layers/common/quantization.py +282 -0
- tpu_inference/layers/common/sharding.py +22 -3
- tpu_inference/layers/common/utils.py +94 -0
- tpu_inference/layers/jax/__init__.py +13 -0
- tpu_inference/layers/jax/attention/__init__.py +13 -0
- tpu_inference/layers/jax/attention/attention.py +19 -6
- tpu_inference/layers/jax/attention/deepseek_v3_attention.py +52 -27
- tpu_inference/layers/jax/attention/gpt_oss_attention.py +19 -6
- tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
- tpu_inference/layers/jax/base.py +14 -0
- tpu_inference/layers/jax/constants.py +13 -0
- tpu_inference/layers/jax/layers.py +14 -0
- tpu_inference/layers/jax/misc.py +14 -0
- tpu_inference/layers/jax/moe/__init__.py +13 -0
- tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
- tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
- tpu_inference/layers/jax/moe/moe.py +43 -3
- tpu_inference/layers/jax/pp_utils.py +53 -0
- tpu_inference/layers/jax/rope.py +14 -0
- tpu_inference/layers/jax/rope_interface.py +14 -0
- tpu_inference/layers/jax/sample/__init__.py +13 -0
- tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
- tpu_inference/layers/jax/sample/sampling.py +15 -1
- tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
- tpu_inference/layers/jax/transformer_block.py +14 -0
- tpu_inference/layers/vllm/__init__.py +13 -0
- tpu_inference/layers/vllm/attention.py +4 -4
- tpu_inference/layers/vllm/fused_moe.py +100 -455
- tpu_inference/layers/vllm/linear.py +64 -0
- tpu_inference/layers/vllm/process_weights/__init__.py +13 -0
- tpu_inference/layers/vllm/{sharding.py → process_weights/cleanup_sharding.py} +24 -15
- tpu_inference/layers/vllm/process_weights/fused_moe_weights.py +369 -0
- tpu_inference/layers/vllm/process_weights/linear_weights.py +174 -0
- tpu_inference/layers/vllm/quantization/__init__.py +19 -3
- tpu_inference/layers/vllm/quantization/awq.py +96 -82
- tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +19 -5
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +119 -132
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +111 -91
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +79 -43
- tpu_inference/layers/vllm/quantization/{common.py → configs.py} +38 -26
- tpu_inference/layers/vllm/quantization/fp8.py +119 -0
- tpu_inference/layers/vllm/quantization/mxfp4.py +133 -220
- tpu_inference/layers/vllm/quantization/unquantized.py +154 -253
- tpu_inference/lora/__init__.py +13 -0
- tpu_inference/lora/torch_lora_ops.py +8 -13
- tpu_inference/models/__init__.py +13 -0
- tpu_inference/models/common/__init__.py +13 -0
- tpu_inference/models/common/model_loader.py +37 -16
- tpu_inference/models/jax/__init__.py +13 -0
- tpu_inference/models/jax/deepseek_v3.py +113 -124
- tpu_inference/models/jax/gpt_oss.py +23 -7
- tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
- tpu_inference/models/jax/llama3.py +99 -36
- tpu_inference/models/jax/llama4.py +14 -0
- tpu_inference/models/jax/llama_eagle3.py +14 -0
- tpu_inference/models/jax/llama_guard_4.py +15 -1
- tpu_inference/models/jax/qwen2.py +17 -2
- tpu_inference/models/jax/qwen2_5_vl.py +18 -4
- tpu_inference/models/jax/qwen3.py +17 -2
- tpu_inference/models/jax/utils/__init__.py +13 -0
- tpu_inference/models/jax/utils/file_utils.py +14 -0
- tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
- tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
- tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +85 -24
- tpu_inference/models/jax/utils/weight_utils.py +32 -1
- tpu_inference/models/vllm/__init__.py +13 -0
- tpu_inference/models/vllm/vllm_model_wrapper.py +22 -4
- tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
- tpu_inference/platforms/__init__.py +14 -0
- tpu_inference/platforms/tpu_platform.py +27 -29
- tpu_inference/runner/__init__.py +13 -0
- tpu_inference/runner/compilation_manager.py +69 -35
- tpu_inference/runner/kv_cache.py +14 -0
- tpu_inference/runner/kv_cache_manager.py +15 -2
- tpu_inference/runner/lora_utils.py +16 -1
- tpu_inference/runner/multimodal_manager.py +16 -2
- tpu_inference/runner/persistent_batch_manager.py +14 -0
- tpu_inference/runner/speculative_decoding_manager.py +14 -0
- tpu_inference/runner/structured_decoding_manager.py +14 -0
- tpu_inference/runner/tpu_runner.py +30 -10
- tpu_inference/spec_decode/__init__.py +13 -0
- tpu_inference/spec_decode/jax/__init__.py +13 -0
- tpu_inference/spec_decode/jax/eagle3.py +13 -0
- tpu_inference/tpu_info.py +14 -0
- tpu_inference/utils.py +31 -30
- tpu_inference/worker/__init__.py +13 -0
- tpu_inference/worker/tpu_worker.py +23 -7
- {tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/METADATA +1 -1
- tpu_inference-0.13.2.dev20251230.dist-info/RECORD +266 -0
- tpu_inference/layers/vllm/linear_common.py +0 -208
- tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
- tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
- tpu_inference-0.12.0.dev20251213.dist-info/RECORD +0 -175
- {tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/WHEEL +0 -0
- {tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.12.0.dev20251213.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/top_level.txt +0 -0
tpu_inference/distributed/utils.py
@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 
 from vllm.utils.network_utils import get_ip
@@ -54,7 +68,45 @@ def get_side_channel_port() -> str:
     return port
 
 
-def …  (old lines 57-60 elided by the diff viewer)
+def get_device_topology_order_id(local_devices, global_devices) -> int:
+    """
+    Calculates the topology order ID for the local device set within the global topology.
+
+    This function determines the rank of the current host/process based on the
+    coordinate of its TPU devices relative to all devices in the topology.
+
+    Args:
+        local_devices: A list of TpuDevice objects available to the current process.
+        global_devices: A list of all TpuDevice objects in the global topology.
+
+    Returns:
+        The topology order ID (rank) of the local devices.
+    """
+    if not local_devices:
+        raise ValueError("local_devices cannot be empty")
+    if not global_devices:
+        raise ValueError("global_devices cannot be empty")
+
+    # 1. Find the 'anchor' (minimum coordinate) for the local devices.
+    # This represents the physical top-left corner of the local machine.
+    local_anchor = min(d.coords for d in local_devices)
+
+    # 2. Group global devices by process to find the anchor for EVERY process.
+    process_anchors = {}
+    for d in global_devices:
+        pid = d.process_index
+        # Update the minimum coordinate found for this process so far
+        if pid not in process_anchors or d.coords < process_anchors[pid]:
+            process_anchors[pid] = d.coords
+
+    # 3. Sort the unique anchors to establish the canonical topology order.
+    # Tuples (x, y, z) sort lexicographically (x first, then y, then z).
+    sorted_anchors = sorted(process_anchors.values())
+
+    # 4. Return the index (rank) of the local anchor in the sorted list.
+    try:
+        return sorted_anchors.index(local_anchor)
+    except ValueError:
+        raise ValueError(
+            f"Local devices: {local_devices} do not exist in the global device: {global_devices} list."
+        )
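Illustration (not part of the diff): a minimal sketch of how the new helper ranks hosts on a toy topology. FakeDevice is a hypothetical stand-in for a TpuDevice that models only the two attributes the function reads, coords and process_index.

    from collections import namedtuple

    # Hypothetical stand-in for a TpuDevice; real devices come from jax.devices().
    FakeDevice = namedtuple("FakeDevice", ["coords", "process_index"])

    # A toy 2-host slice: host 0 is anchored at coordinate (0, 0, 0) and
    # host 1 at (2, 0, 0), so sorting the anchors ranks host 0 before host 1.
    host0 = [FakeDevice((0, 0, 0), 0), FakeDevice((1, 0, 0), 0)]
    host1 = [FakeDevice((2, 0, 0), 1), FakeDevice((3, 0, 0), 1)]
    global_devices = host0 + host1

    print(get_device_topology_order_id(host0, global_devices))  # 0
    print(get_device_topology_order_id(host1, global_devices))  # 1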
@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

(This identical 13-line Apache-2.0 header hunk also appears for each of the other newly added license-only files in the list above; the verbatim repeats are omitted.)
tpu_inference/executors/ray_distributed_executor.py
@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 from array import array
 from typing import Any, Dict, List, Optional
@@ -6,7 +20,7 @@ import ray
 import vllm.envs as envs
 from ray.util.placement_group import PlacementGroup
 from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
-from vllm.multimodal.inputs import …  (rest of line elided by the diff viewer)
+from vllm.multimodal.inputs import MultiModalKwargsItem
 from vllm.platforms import current_platform
 from vllm.ray.ray_env import get_env_vars_to_copy
 from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE
@@ -39,7 +53,7 @@ logger = init_logger(__name__)
 
 
 def _encode_hook(obj: Any) -> Any:
-    """Custom msgspec enc hook that supports array types and …  (rest elided)
+    """Custom msgspec enc hook that supports array types and MultiModalKwargsItem.
 
     See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder
     """
@@ -48,7 +62,7 @@ def _encode_hook(obj: Any) -> Any:
             f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. "
             f"Given array has a type code of {obj.typecode}.")
         return obj.tobytes()
-    if isinstance(obj, …  (rest of line elided by the diff viewer)
+    if isinstance(obj, MultiModalKwargsItem):
         return dict(obj)
 
 
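For context, a minimal sketch of how an enc hook like _encode_hook plugs into msgspec. The VLLM_TOKEN_ID_ARRAY_TYPE value and the payload shape below are assumptions for illustration; msgspec.msgpack.Encoder(enc_hook=...) is the documented extension point.

    from array import array

    import msgspec

    VLLM_TOKEN_ID_ARRAY_TYPE = "l"  # assumed typecode for vLLM token-id arrays

    def enc_hook(obj):
        # Mirror of the array branch above: ship token ids as raw bytes.
        if isinstance(obj, array):
            return obj.tobytes()
        raise NotImplementedError(f"Type {type(obj)} is not supported")

    encoder = msgspec.msgpack.Encoder(enc_hook=enc_hook)
    payload = encoder.encode({"token_ids": array(VLLM_TOKEN_ID_ARRAY_TYPE, [1, 2, 3])})
    print(len(payload))  # compact msgpack bytes, ready to send between Ray workers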
@@ -145,6 +159,9 @@ class RayDistributedExecutor(RayDistributedExecutorV1):
                 device_str: node['Resources'][device_str]
             } for node in ray_nodes]
         else:
+            assert pp_size == len(
+                ray_nodes
+            ), f"Cannot use PP across hosts, please set --pipeline-parallel-size to 1 or {len(ray_nodes)}"
             num_devices_per_pp_rank = self.vllm_config.sharding_config.total_devices
             placement_group_specs = [{
                 device_str: num_devices_per_pp_rank
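The new assert guards the multi-host pipeline-parallel case: the executor builds one placement-group bundle per PP rank, so the PP size must match the Ray node count. A hedged sketch of that bundle shape (the "TPU" resource name and sizes here are illustrative, not taken from the diff):

    import ray
    from ray.util.placement_group import placement_group
    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

    ray.init()

    pp_size = 2
    num_devices_per_pp_rank = 4  # illustrative
    # One bundle per PP rank; STRICT_SPREAD places each bundle on its own host,
    # which is why pp_size must equal the host count in this branch.
    pg = placement_group([{"TPU": num_devices_per_pp_rank}] * pp_size,
                         strategy="STRICT_SPREAD")
    ray.get(pg.ready())

    strategy = PlacementGroupSchedulingStrategy(placement_group=pg,
                                                placement_group_bundle_index=0)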
@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # TODO: Update documentation
 
 from typing import List, Optional, Tuple
tpu_inference/kernels/fused_moe/v1/kernel.py
@@ -1,3 +1,16 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """TPU-Friendly Fused Mixture of Experts (MoE) kernel."""
 
 import functools
@@ -1376,171 +1389,166 @@ def fused_ep_moe(
     hbm_block_spec = pl.BlockSpec(memory_space=pltpu.MemorySpace.HBM)
     renorm_str = "-renorm_k" if renormalize_topk_logits else ""
     scope_name = f"fused-moe-k_{top_k}{renorm_str}-bt_{bt}_{btc}-bf_{bf}_{bfc}-bd1_{bd1}_{bd1c}-bd2_{bd2}_{bd2c}"
-    fused_moe = …  (old call body, lines 1379-1437, mostly elided by the diff viewer)
+    fused_moe = pl.pallas_call(
+        functools.partial(
+            _fused_ep_moe_kernel,
+            top_k=top_k,
+            renormalize_topk_logits=renormalize_topk_logits,
+            ep_axis_name=ep_axis_name,
+            act_fn=act_fn,
+            subc_quant_wsz=subc_quant_wsz,
+            bt=bt,
+            bf=bf,
+            bd1=bd1,
+            bd2=bd2,
+            btc=btc,
+            bfc=bfc,
+            bd1c=bd1c,
+            bd2c=bd2c,
+        ),
+        out_shape=jax.ShapeDtypeStruct((local_num_tokens, hidden_size),
+                                       t_dtype),
+        grid_spec=pltpu.PrefetchScalarGridSpec(
+            num_scalar_prefetch=0,
+            in_specs=[
+                hbm_block_spec,  # tokens_hbm
+                hbm_block_spec,  # w1_hbm
+                hbm_block_spec,  # w2_hbm
+                None if w1_scale is None else hbm_block_spec,  # w1_scale_hbm
+                None if w2_scale is None else hbm_block_spec,  # w2_scale_hbm
+                None if b1 is None else hbm_block_spec,  # b1_hbm
+                None if b2 is None else hbm_block_spec,  # b2_hbm
+                hbm_block_spec,  # gating_output_hbm
+                hbm_block_spec,  # a2a_g_hbm
+            ],
+            out_specs=pl.BlockSpec(memory_space=pltpu.MemorySpace.HBM),
+            scratch_shapes=([
+                # t2e_routing_x2_smem
+                pltpu.SMEM((2, bt, padded_top_k), jnp.int32),
+                # d2e_count_x2_smem
+                pltpu.SMEM((2, num_devices, 1, padded_num_experts), jnp.int32),
+                # expert_offsets_x2_smem
+                pltpu.SMEM((2, 2, padded_num_experts), jnp.int32),
+                # expert_starts_x2_smem
+                pltpu.SMEM((2, 1, padded_num_experts), jnp.int32),
+                # expert_sizes_x2_smem
+                pltpu.SMEM((2, 1, padded_num_experts), jnp.int32),
+                # a2a_s_sends_x2_smem
+                pltpu.SMEM((2, ), jnp.int32),
+                # a2a_s_x2_vmem
+                pltpu.VMEM(
+                    (
+                        2,
+                        bt * num_devices,
+                        t_packing,
+                        hidden_size // t_packing,
                     ),
-…  (old lines 1439-1447 elided by the diff viewer)
+                    t_dtype,
+                ),
+                # a2a_s_acc_x2_vmem
+                pltpu.VMEM(
+                    (
+                        2,
+                        bt * num_devices,
+                        t_packing,
+                        hidden_size // t_packing,
                     ),
-…  (old lines 1449-1543 elided by the diff viewer)
+                    t_dtype,
+                ),
+                # a2a_g_acc_vmem
+                pltpu.VMEM((top_k, bt, t_packing, hidden_size // t_packing),
+                           t_dtype),
+                # b_gating_x2_vmem
+                pltpu.VMEM((2, bt, padded_num_experts), t_dtype),
+                # b_output_x2_vmem
+                pltpu.VMEM((2, bt, hidden_size), t_dtype),
+                # b_w1_x2_vmem
+                pltpu.VMEM((2, t_packing, bd1 // t_packing, bf), w1.dtype),
+                # b_w3_x2_vmem
+                pltpu.VMEM((2, t_packing, bd1 // t_packing, bf), w1.dtype),
+                # b_w2_x2_vmem
+                pltpu.VMEM((2, t_packing, bf, bd2 // t_packing), w2.dtype),
+                # b_w1_scale_x2_vmem
+                (None if w1_scale is None else pltpu.VMEM(
+                    (
+                        2,
+                        t_packing,
+                        bd1 // t_packing // subc_quant_wsz,
+                        1,
+                        bf,
+                    ),
+                    jnp.float32,
+                )),
+                # b_w3_scale_x2_vmem
+                (None if w1_scale is None else pltpu.VMEM(
+                    (
+                        2,
+                        t_packing,
+                        bd1 // t_packing // subc_quant_wsz,
+                        1,
+                        bf,
+                    ),
+                    jnp.float32,
+                )),
+                # b_w2_scale_x2_vmem
+                (None if w2_scale is None else pltpu.VMEM(
+                    (
+                        2,
+                        t_packing,
+                        bf // subc_quant_wsz,
+                        1,
+                        bd2 // t_packing,
+                    ),
+                    jnp.float32,
+                )),
+                # b_b1_x2_vmem
+                (None if b1 is None else pltpu.VMEM(
+                    (
+                        2,
+                        1,
+                        bf,
+                    ),
+                    jnp.float32,
+                )),
+                # b_b3_x2_vmem
+                (None if b1 is None else pltpu.VMEM(
+                    (
+                        2,
+                        1,
+                        bf,
+                    ),
+                    jnp.float32,
+                )),
+                # b_b2_x2_vmem
+                (None if b2 is None else pltpu.VMEM(
+                    (
+                        2,
+                        t_packing,
+                        1,
+                        bd2 // t_packing,
+                    ),
+                    jnp.float32,
+                )),
+                # b_acc_vmem
+                pltpu.VMEM((bt * num_devices, 1, bf * 2), jnp.float32),
+                # local_sems
+                pltpu.SemaphoreType.DMA((2, 5)),
+                # send_sems
+                pltpu.SemaphoreType.DMA((2, )),
+                # recv_sems
+                pltpu.SemaphoreType.DMA((2, )),
+                # a2a_gather_sem
+                pltpu.SemaphoreType.DMA,
+                # a2a_acc_sem
+                pltpu.SemaphoreType.DMA,
+            ]),
+        ),
+        compiler_params=pltpu.CompilerParams(
+            collective_id=0,
+            vmem_limit_bytes=100 * 1024 * 1024,
+        ),
+        name=scope_name,
+    )
 
     @jax.jit
     @jax.shard_map(
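For orientation, a minimal runnable sketch of the pallas_call pattern the rewritten call uses: scratch_shapes allocates on-chip buffers that the kernel receives as trailing refs after the input and output refs. The toy kernel below is an illustration under that assumption, not the MoE kernel's actual logic.

    import jax
    import jax.numpy as jnp
    from jax.experimental import pallas as pl
    from jax.experimental.pallas import tpu as pltpu

    def _double_kernel(x_ref, o_ref, scratch_ref):
        # Stage the block in the VMEM scratch buffer, then write the result out.
        scratch_ref[...] = x_ref[...] * 2.0
        o_ref[...] = scratch_ref[...]

    def double(x):
        return pl.pallas_call(
            _double_kernel,
            out_shape=jax.ShapeDtypeStruct(x.shape, x.dtype),
            scratch_shapes=[pltpu.VMEM(x.shape, x.dtype)],  # -> scratch_ref
        )(x)

    print(double(jnp.ones((8, 128), jnp.float32))[0, 0])  # 2.0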