tpu_inference-0.11.1.dev202511150811-py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tpu-inference might be problematic.
- tests/__init__.py +0 -0
- tests/core/__init__.py +0 -0
- tests/core/test_core_tpu.py +513 -0
- tests/core/test_disagg_executor.py +60 -0
- tests/core/test_disagg_utils.py +53 -0
- tests/core/test_dp_scheduler.py +899 -0
- tests/core/test_init.py +49 -0
- tests/kernels/__init__.py +0 -0
- tests/kernels/fused_moe_v1_test.py +105 -0
- tests/kernels/mla_v1_test.py +396 -0
- tests/kernels/quantized_matmul_kernel_test.py +191 -0
- tests/kernels/ragged_kv_cache_update_v2_test.py +234 -0
- tests/kernels/ragged_paged_attention_kernel_v2_test.py +400 -0
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +549 -0
- tests/kernels/ragged_paged_attention_kernel_v3_test.py +504 -0
- tests/lora/__init__.py +0 -0
- tests/lora/conftest.py +32 -0
- tests/lora/test_bgmv.py +43 -0
- tests/lora/test_layers.py +654 -0
- tests/lora/test_lora.py +133 -0
- tests/lora/utils.py +96 -0
- tests/test_base.py +201 -0
- tests/test_envs.py +182 -0
- tests/test_quantization.py +836 -0
- tests/test_tpu_info.py +120 -0
- tests/test_utils.py +236 -0
- tpu_inference/__init__.py +34 -0
- tpu_inference/core/__init__.py +0 -0
- tpu_inference/core/core_tpu.py +786 -0
- tpu_inference/core/disagg_executor.py +118 -0
- tpu_inference/core/disagg_utils.py +51 -0
- tpu_inference/core/sched/__init__.py +0 -0
- tpu_inference/core/sched/dp_scheduler.py +523 -0
- tpu_inference/distributed/__init__.py +0 -0
- tpu_inference/distributed/jax_parallel_state.py +67 -0
- tpu_inference/distributed/tpu_connector.py +728 -0
- tpu_inference/distributed/utils.py +59 -0
- tpu_inference/env_override.py +9 -0
- tpu_inference/envs.py +107 -0
- tpu_inference/executors/__init__.py +0 -0
- tpu_inference/executors/ray_distributed_executor.py +362 -0
- tpu_inference/experimental/__init__.py +0 -0
- tpu_inference/experimental/llama3_jax_stashed.py +258 -0
- tpu_inference/kernels/__init__.py +0 -0
- tpu_inference/kernels/collectives/__init__.py +0 -0
- tpu_inference/kernels/collectives/all_gather_matmul.py +735 -0
- tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +60 -0
- tpu_inference/kernels/collectives/util.py +47 -0
- tpu_inference/kernels/flash_attention/__init__.py +0 -0
- tpu_inference/kernels/flash_attention/kernel.py +772 -0
- tpu_inference/kernels/fused_moe/__init__.py +0 -0
- tpu_inference/kernels/fused_moe/v1/__init__.py +0 -0
- tpu_inference/kernels/fused_moe/v1/kernel.py +1035 -0
- tpu_inference/kernels/mla/__init__.py +0 -0
- tpu_inference/kernels/mla/v1/__init__.py +0 -0
- tpu_inference/kernels/mla/v1/kernel.py +1349 -0
- tpu_inference/kernels/quantized_matmul/__init__.py +0 -0
- tpu_inference/kernels/quantized_matmul/kernel.py +395 -0
- tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
- tpu_inference/kernels/quantized_matmul/util.py +58 -0
- tpu_inference/kernels/ragged_paged_attention/__init__.py +0 -0
- tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +0 -0
- tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +875 -0
- tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +287 -0
- tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
- tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +0 -0
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1478 -0
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +1482 -0
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +4147 -0
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +367 -0
- tpu_inference/kernels/ragged_paged_attention/v3/util.py +51 -0
- tpu_inference/layers/__init__.py +0 -0
- tpu_inference/layers/common/__init__.py +0 -0
- tpu_inference/layers/common/attention_interface.py +390 -0
- tpu_inference/layers/common/attention_metadata.py +34 -0
- tpu_inference/layers/common/binary_search.py +295 -0
- tpu_inference/layers/common/quant_methods.py +8 -0
- tpu_inference/layers/common/sharding.py +582 -0
- tpu_inference/layers/jax/__init__.py +0 -0
- tpu_inference/layers/jax/attention/__init__.py +0 -0
- tpu_inference/layers/jax/attention/attention.py +255 -0
- tpu_inference/layers/jax/attention/deepseek_v3_attention.py +354 -0
- tpu_inference/layers/jax/attention/gpt_oss_attention.py +262 -0
- tpu_inference/layers/jax/attention/llama4_attention.py +153 -0
- tpu_inference/layers/jax/base.py +151 -0
- tpu_inference/layers/jax/constants.py +88 -0
- tpu_inference/layers/jax/layers.py +301 -0
- tpu_inference/layers/jax/misc.py +16 -0
- tpu_inference/layers/jax/moe/__init__.py +0 -0
- tpu_inference/layers/jax/moe/deepseek_v3_moe.py +608 -0
- tpu_inference/layers/jax/moe/gpt_oss_moe.py +185 -0
- tpu_inference/layers/jax/moe/moe.py +209 -0
- tpu_inference/layers/jax/rope.py +280 -0
- tpu_inference/layers/jax/rope_interface.py +214 -0
- tpu_inference/layers/jax/sample/__init__.py +0 -0
- tpu_inference/layers/jax/sample/rejection_sampler.py +515 -0
- tpu_inference/layers/jax/sample/sampling.py +96 -0
- tpu_inference/layers/jax/sample/sampling_metadata.py +76 -0
- tpu_inference/layers/jax/transformer_block.py +107 -0
- tpu_inference/layers/vllm/__init__.py +0 -0
- tpu_inference/layers/vllm/attention.py +221 -0
- tpu_inference/layers/vllm/fused_moe.py +507 -0
- tpu_inference/layers/vllm/linear_common.py +186 -0
- tpu_inference/layers/vllm/quantization/__init__.py +39 -0
- tpu_inference/layers/vllm/quantization/awq.py +207 -0
- tpu_inference/layers/vllm/quantization/common.py +105 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +0 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +120 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +203 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +0 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +208 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +136 -0
- tpu_inference/layers/vllm/quantization/mxfp4.py +266 -0
- tpu_inference/layers/vllm/quantization/unquantized.py +386 -0
- tpu_inference/layers/vllm/sharding.py +230 -0
- tpu_inference/logger.py +10 -0
- tpu_inference/lora/__init__.py +0 -0
- tpu_inference/lora/torch_lora_ops.py +103 -0
- tpu_inference/lora/torch_punica_tpu.py +311 -0
- tpu_inference/mock/__init__.py +0 -0
- tpu_inference/mock/vllm_config_utils.py +28 -0
- tpu_inference/mock/vllm_envs.py +1219 -0
- tpu_inference/mock/vllm_logger.py +212 -0
- tpu_inference/mock/vllm_logging_utils.py +15 -0
- tpu_inference/models/__init__.py +0 -0
- tpu_inference/models/common/__init__.py +0 -0
- tpu_inference/models/common/model_loader.py +444 -0
- tpu_inference/models/jax/__init__.py +0 -0
- tpu_inference/models/jax/deepseek_v3.py +868 -0
- tpu_inference/models/jax/gpt_oss.py +492 -0
- tpu_inference/models/jax/jax_intermediate_tensor.py +79 -0
- tpu_inference/models/jax/llama3.py +375 -0
- tpu_inference/models/jax/llama4.py +629 -0
- tpu_inference/models/jax/llama_eagle3.py +333 -0
- tpu_inference/models/jax/phi3.py +376 -0
- tpu_inference/models/jax/qwen2.py +375 -0
- tpu_inference/models/jax/qwen2_5_vl.py +1103 -0
- tpu_inference/models/jax/qwen3.py +302 -0
- tpu_inference/models/jax/utils/__init__.py +0 -0
- tpu_inference/models/jax/utils/file_utils.py +96 -0
- tpu_inference/models/jax/utils/multi_modal_utils.py +163 -0
- tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
- tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +5 -0
- tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +6 -0
- tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +5 -0
- tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +6 -0
- tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +105 -0
- tpu_inference/models/jax/utils/quantization/quantization_utils.py +653 -0
- tpu_inference/models/jax/utils/weight_utils.py +529 -0
- tpu_inference/models/vllm/__init__.py +0 -0
- tpu_inference/models/vllm/vllm_model_wrapper.py +286 -0
- tpu_inference/models/vllm/vllm_model_wrapper_context.py +45 -0
- tpu_inference/platforms/__init__.py +2 -0
- tpu_inference/platforms/tpu_platform.py +269 -0
- tpu_inference/runner/__init__.py +0 -0
- tpu_inference/runner/block_table.py +122 -0
- tpu_inference/runner/compilation_manager.py +780 -0
- tpu_inference/runner/input_batch.py +435 -0
- tpu_inference/runner/kv_cache.py +132 -0
- tpu_inference/runner/kv_cache_manager.py +479 -0
- tpu_inference/runner/lora_utils.py +92 -0
- tpu_inference/runner/multimodal_manager.py +217 -0
- tpu_inference/runner/persistent_batch_manager.py +244 -0
- tpu_inference/runner/speculative_decoding_manager.py +248 -0
- tpu_inference/runner/structured_decoding_manager.py +88 -0
- tpu_inference/runner/tpu_runner.py +1620 -0
- tpu_inference/runner/utils.py +426 -0
- tpu_inference/spec_decode/__init__.py +0 -0
- tpu_inference/spec_decode/jax/__init__.py +0 -0
- tpu_inference/spec_decode/jax/eagle3.py +367 -0
- tpu_inference/tpu_info.py +77 -0
- tpu_inference/utils.py +317 -0
- tpu_inference/worker/__init__.py +0 -0
- tpu_inference/worker/tpu_worker.py +321 -0
- tpu_inference-0.11.1.dev202511150811.dist-info/METADATA +107 -0
- tpu_inference-0.11.1.dev202511150811.dist-info/RECORD +179 -0
- tpu_inference-0.11.1.dev202511150811.dist-info/WHEEL +5 -0
- tpu_inference-0.11.1.dev202511150811.dist-info/licenses/LICENSE +201 -0
- tpu_inference-0.11.1.dev202511150811.dist-info/top_level.txt +2 -0
@@ -0,0 +1,185 @@
from dataclasses import InitVar, dataclass

import jax
import jax.numpy as jnp
from flax import nnx
from flax.typing import Sharding
from jaxtyping import Float

from tpu_inference.layers.jax.base import create_param
from tpu_inference.layers.jax.layers import FlaxUtils
from tpu_inference.layers.jax.moe.moe import Router

modeling_flax_utils = FlaxUtils()


@dataclass(kw_only=True)
class GptOssRouter(Router):
    """Router module for Mixture-of-Experts (MoE) layers.

    This module determines which experts each token should be routed to.

    """
    e_sharding: Sharding = ()

    def __post_init__(self, rngs: nnx.Rngs):
        """
        Initializes the parent's kernel and adds the new bias parameter.
        """
        super().__post_init__(rngs)

        self.bias_E = create_param(rngs,
                                   shape=(self.num_experts, ),
                                   dtype=self.dtype,
                                   sharding=self.e_sharding,
                                   random_init=self.random_init)

    def __call__(self, x_TD: Float):
        """
        Overrides the parent's forward pass to include the bias.
        """
        x_TD = jnp.asarray(x_TD, self.dtype)
        x_TD = nnx.with_sharding_constraint(x_TD, self.activation_ffw_td)

        router_logits_TE = jnp.einsum('TD,DE -> TE', x_TD,
                                      self.kernel_DE.value)

        router_logits_TE += self.bias_E.value

        weights_TX, selected_experts_TX = jax.lax.top_k(
            router_logits_TE, self.num_experts_per_tok)

        normalized_weights_TX = jax.nn.softmax(weights_TX.astype(self.dtype),
                                               axis=-1)

        return normalized_weights_TX, selected_experts_TX


def _swiglu(x: Float, alpha: Float, limit: Float) -> Float:
    """Implements the specific SwiGLU from the golden implementation."""
    x_glu, x_linear = x[..., ::2], x[..., 1::2]

    x_glu = jnp.clip(x_glu, a_max=limit)
    x_linear = jnp.clip(x_linear, a_min=-limit, a_max=limit)

    gated_activation = x_glu * jax.nn.sigmoid(alpha * x_glu)

    return gated_activation * (x_linear + 1)


@dataclass(kw_only=True)
class CombineExperts(nnx.Module):
    """Module for combining expert outputs with weighted sum."""
    dtype: jnp.dtype

    def __call__(self, down_proj_TED: Float, weights_TX: Float,
                 indices_TX: jax.Array) -> Float:
        """Combines expert outputs using weighted sum.

        Args:
            down_proj_TED: Expert outputs, shape (tokens, experts, hidden_dim)
            weights_TX: Router weights, shape (tokens, experts_per_token)
            indices_TX: Selected expert indices, shape (tokens, experts_per_token)

        Returns:
            Combined output, shape (tokens, hidden_dim)
        """
        with jax.named_scope("combine_experts"):
            indices_for_gather = indices_TX[..., None]
            gathered_down_proj_TED = jnp.take_along_axis(down_proj_TED,
                                                         indices_for_gather,
                                                         axis=1)
            output_TD = jnp.einsum('TXD,TX -> TD', gathered_down_proj_TED,
                                   weights_TX)

            return output_TD.astype(self.dtype)


@dataclass(kw_only=True)
class GptOssMoE(nnx.Module):
    """
    JAX implementation of the GPT-OSS Mixture-of-Experts MLP block.
    """
    dtype: jnp.dtype
    hidden_size: int
    intermediate_size_moe: int
    num_local_experts: int
    router: GptOssRouter
    rngs: InitVar[nnx.Rngs]

    swiglu_limit: float = 7.0
    swiglu_alpha: float = 1.702

    # Sharding specifications
    activation_ffw_td: Sharding
    edf_sharding: Sharding
    efd_sharding: Sharding
    ed_sharding: Sharding

    random_init: bool = False

    def __call__(self, x_TD: Float) -> Float:
        """Performs the forward pass for the GPT-OSS MoE layer."""
        x_TD = jnp.asarray(x_TD, self.dtype)
        x_TD = nnx.with_sharding_constraint(x_TD, self.activation_ffw_td)

        weights_TX, indices_TX = self.router(x_TD)

        # First MLP layer (up-projection)
        with jax.named_scope("MLP #1"):
            up_proj_TEF2 = jnp.einsum('TD,EDF -> TEF', x_TD,
                                      self.mlp1_weight_EDF2.value)
            up_proj_TEF2 += self.mlp1_bias_EF2.value

        fuse_TEF = _swiglu(up_proj_TEF2,
                           alpha=self.swiglu_alpha,
                           limit=self.swiglu_limit)

        # Second MLP layer (down-projection)
        with jax.named_scope("MLP #2"):
            down_proj_TED = jnp.einsum('TEF,EFD -> TED', fuse_TEF,
                                       self.mlp2_weight_EFD.value)
            down_proj_TED += self.mlp2_bias_ED.value

        # Weighted sum of expert outputs
        output_TD = self.combine_experts(down_proj_TED, weights_TX, indices_TX)

        return output_TD

    def __post_init__(self, rngs: nnx.Rngs):
        """Initializes all weights and biases for the MoE block."""
        D, F, E = self.hidden_size, self.intermediate_size_moe, self.num_local_experts

        self.combine_experts = CombineExperts(dtype=self.dtype)

        # MLP #1 Weights (Combined Gate and Up-projection) and Bias
        self.mlp1_weight_EDF2 = create_param(
            rngs,
            shape=(E, D, F * 2),
            dtype=self.dtype,
            sharding=self.edf_sharding,
            random_init=self.random_init,
        )
        self.mlp1_bias_EF2 = create_param(
            rngs,
            shape=(E, F * 2),
            dtype=self.dtype,
            sharding=self.ed_sharding,
            random_init=self.random_init,
        )

        # MLP #2 Weights (Down-projection) and Bias
        self.mlp2_weight_EFD = create_param(
            rngs,
            shape=(E, F, D),
            dtype=self.dtype,
            sharding=self.efd_sharding,
            random_init=self.random_init,
        )
        self.mlp2_bias_ED = create_param(
            rngs,
            shape=(E, D),
            dtype=self.dtype,
            sharding=self.ed_sharding,
            random_init=self.random_init,
        )
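The hunk above appears, from its +185 line count and its GptOssRouter/GptOssMoE classes, to correspond to tpu_inference/layers/jax/moe/gpt_oss_moe.py. Its distinctive piece is the clipped SwiGLU applied between the two expert projections. The snippet below is a minimal, self-contained sketch of that activation, assuming only jax and toy tensor sizes; it re-implements the helper standalone rather than importing the package, so treat it as a reading aid, not the shipped code.

# Hedged sketch (toy sizes, standalone helper): the clipped SwiGLU from the
# hunk above, re-implemented with plain jax so it runs in isolation.
import jax
import jax.numpy as jnp

def clipped_swiglu(x, alpha=1.702, limit=7.0):
    # Even channels act as the gate, odd channels as the linear branch,
    # matching the x[..., ::2] / x[..., 1::2] split in _swiglu above.
    x_glu, x_linear = x[..., ::2], x[..., 1::2]
    x_glu = jnp.clip(x_glu, None, limit)          # clip the gate from above only
    x_linear = jnp.clip(x_linear, -limit, limit)  # clip the linear branch both ways
    gated = x_glu * jax.nn.sigmoid(alpha * x_glu) # SiLU-style gate scaled by alpha
    return gated * (x_linear + 1)

x_TEF2 = jax.random.normal(jax.random.PRNGKey(0), (4, 2, 8))  # (tokens, experts, 2F)
print(clipped_swiglu(x_TEF2).shape)                           # (4, 2, 4)

The defaults mirror the swiglu_alpha=1.702 and swiglu_limit=7.0 fields on GptOssMoE above; halving of the last dimension comes from splitting the fused gate/linear projection.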
@@ -0,0 +1,209 @@
from dataclasses import InitVar, dataclass

import jax
import jax.numpy as jnp
from flax import nnx
from flax.typing import Sharding
from jaxtyping import Float

from tpu_inference.layers.jax.base import create_param
from tpu_inference.layers.jax.layers import FlaxUtils

modeling_flax_utils = FlaxUtils()


@dataclass(kw_only=True)
class Router(nnx.Module):
    """Router module for Mixture-of-Experts (MoE) layers.

    This module determines which experts each token should be routed to based on the input.

    Attributes:
    """
    dtype: jnp.dtype
    hidden_size: int
    num_experts: int
    num_experts_per_tok: int
    router_act: str
    rngs: InitVar[nnx.Rngs]
    activation_ffw_td: Sharding
    ed_sharding: Sharding
    random_init: bool = False

    def __call__(self, x_TD: Float):
        """Routes tokens to experts.

        Args:
            x_TD: Input array of shape (sequence_length, d_model).

        Returns:
            A tuple containing:
            - normalized_weights_TX: Normalized weights for selected experts, shape (sequence_length, num_experts_per_tok).
            - selected_experts_TX: Indices of selected experts, shape (sequence_length, num_experts_per_tok).
        """
        x_TD = jnp.asarray(x_TD, self.dtype)
        x_TD = nnx.with_sharding_constraint(x_TD, self.activation_ffw_td)
        router_act = modeling_flax_utils.ACT2FN[self.router_act]
        router_logits_TE = jnp.einsum('TD,DE -> TE', x_TD,
                                      self.kernel_DE.value)
        weights_TX, selected_experts_TX = jax.lax.top_k(
            router_logits_TE, self.num_experts_per_tok)
        if self.router_act != "sigmoid":  # sigmoid does not accept axis argument.
            normalized_weights_TX = router_act(weights_TX.astype(self.dtype),
                                               axis=-1)
        else:
            normalized_weights_TX = router_act(weights_TX.astype(self.dtype))
        return normalized_weights_TX, selected_experts_TX

    def __post_init__(self, rngs: nnx.Rngs):
        """Generates the router kernel (weights) for routing."""
        shape = (self.hidden_size, self.num_experts)
        self.kernel_DE = create_param(rngs,
                                      shape=shape,
                                      dtype=self.dtype,
                                      sharding=self.ed_sharding,
                                      random_init=self.random_init)


@dataclass(kw_only=True)
class MoE(nnx.Module):
    """Mixture-of-Experts (MoE) Routed MLP Layer.

    This module implements a MoE layer with a router and multiple expert MLPs.

    Attributes:
        router: The Router module.
    """
    dtype: jnp.dtype
    num_local_experts: int
    apply_expert_weight_before_computation: bool
    hidden_size: int
    intermediate_size_moe: int
    hidden_act: str
    rngs: InitVar[nnx.Rngs]
    router: nnx.Module
    activation_ffw_td: Sharding
    activation_ffw_ted: Sharding
    edf_sharding: Sharding
    efd_sharding: Sharding
    random_init: bool = False

    def __call__(self, x_TD: Float):
        """Performs the forward pass of the MoE layer.

        Args:
            x_TD: Input array of shape (sequence_length, d_model).

        Returns:
            Output array of shape (sequence_length, d_model) after passing through MoE.
        """
        x_TD = jnp.asarray(x_TD, self.dtype)
        x_TD = nnx.with_sharding_constraint(x_TD, self.activation_ffw_td)
        weights_TX, indices_TX = self.router(x_TD)
        one_hot_indices_TXE = jax.nn.one_hot(
            indices_TX, num_classes=self.num_local_experts, dtype=self.dtype)
        full_weights_TE = jnp.sum(one_hot_indices_TXE * weights_TX[..., None],
                                  axis=1)

        # Some models use the routing scores to weight the data instead of
        # weighting the expert outputs.
        if self.apply_expert_weight_before_computation:
            with jax.named_scope("pre_computing_weight"):
                return self._moe_fwd_preapply_router_weights(
                    x_TD, full_weights_TE)
        else:
            return self._moe_fwd(x_TD, full_weights_TE)

    def __post_init__(self, rngs: nnx.Rngs):
        """Generates the kernels (weights) for the router and experts (gating, up-projection, and down-projection layers)."""

        D = self.hidden_size
        F = self.intermediate_size_moe
        shape_gating = (self.num_local_experts, D, F)
        shape_up = (self.num_local_experts, D, F)
        shape_down = (self.num_local_experts, F, D)

        self.kernel_gating_EDF = create_param(rngs,
                                              shape=shape_gating,
                                              dtype=self.dtype,
                                              sharding=self.edf_sharding,
                                              random_init=self.random_init)
        self.kernel_up_proj_EDF = create_param(rngs,
                                               shape=shape_up,
                                               dtype=self.dtype,
                                               sharding=self.edf_sharding,
                                               random_init=self.random_init)
        self.kernel_down_proj_EFD = create_param(rngs,
                                                 shape=shape_down,
                                                 dtype=self.dtype,
                                                 sharding=self.efd_sharding,
                                                 random_init=self.random_init)

    def _moe_fwd_preapply_router_weights(self, x_TD: jax.Array, weights_TE):
        """Performs the forward pass of the MoE experts with router weights pre-applied to the inputs.

        Args:
            x_TD: Input array for the experts, shape (sequence_length, hidden_size).
            weights_TE: Router weights, shape (sequence_length, num_experts).

        Returns:
            Output array of shape (sequence_length, d_model).
        """
        # Data needs to be replicated since it will be weighted by the router
        # scores before being passed to each expert.
        num_experts = weights_TE.shape[-1]
        x_TED = jnp.repeat(x_TD[:, None, :], num_experts, 1)
        weights_TED = weights_TE[..., None]
        x_TED = jnp.asarray(x_TED, self.dtype)

        with jax.named_scope("activation_expert_weighting"):
            x_TED = x_TED * weights_TED

        x_TED = nnx.with_sharding_constraint(x_TED, self.activation_ffw_ted)
        with jax.named_scope("gating"):
            gating_TEF = jnp.einsum('TED,EDF -> TEF', x_TED,
                                    self.kernel_gating_EDF.value)
            activated_gating_TEF = modeling_flax_utils.ACT2FN[self.hidden_act](
                gating_TEF)
        with jax.named_scope("up_projection"):
            up_proj_TEF = jnp.einsum('TED,EDF -> TEF', x_TED,
                                     self.kernel_up_proj_EDF.value)

        fuse_TEF = activated_gating_TEF * up_proj_TEF

        with jax.named_scope("down_projection"):
            down_proj_TED = jnp.einsum('TEF,EFD -> TED', fuse_TEF,
                                       self.kernel_down_proj_EFD.value)
        with jax.named_scope("sum"):
            output_TD = down_proj_TED.sum(axis=1)
        return output_TD.astype(self.dtype)

    def _moe_fwd(self, x_TD: Float, weights):
        """Performs the basic forward pass of the MoE experts without dropping or megablocks.

        Args:
            x_TD: Input array for the experts, shape (sequence_length, d_model).
            weights: Weights for combining expert outputs, shape (sequence_length, num_experts).

        Returns:
            Output array of shape (sequence_length, d_model).
        """
        x_TD = jnp.asarray(x_TD, self.dtype)
        x_TD = nnx.with_sharding_constraint(x_TD, self.activation_ffw_td)
        with jax.named_scope("gating"):
            gating_TEF = jnp.einsum('TD,EDF -> TEF', x_TD,
                                    self.kernel_gating_EDF.value)
            activated_gating_TEF = modeling_flax_utils.ACT2FN[self.hidden_act](
                gating_TEF)
        with jax.named_scope("up_projection"):
            up_proj_TEF = jnp.einsum('TD,EDF -> TEF', x_TD,
                                     self.kernel_up_proj_EDF.value)

        fuse_TEF = activated_gating_TEF * up_proj_TEF

        with jax.named_scope("down_projection"):
            down_proj_TED = jnp.einsum('TEF,EFD -> TED', fuse_TEF,
                                       self.kernel_down_proj_EFD.value)
        with jax.named_scope("sum"):
            output_TD = jnp.einsum('TED,TE -> TD', down_proj_TED, weights)
        return output_TD.astype(self.dtype)
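The hunk above appears, from its +209 line count and its Router/MoE classes, to correspond to tpu_inference/layers/jax/moe/moe.py. Before dispatching to either forward path, MoE.__call__ turns the router's top-k (weights, indices) pair into a dense (tokens, experts) weight matrix. The sketch below reproduces that densification step with plain jax and illustrative sizes; it is not the package's API, just the same one-hot scatter in isolation.

# Hedged sketch (toy sizes): densify top-k router outputs into a (T, E)
# weight matrix, mirroring the jax.nn.one_hot scatter in MoE.__call__ above.
import jax
import jax.numpy as jnp

num_tokens, num_experts, experts_per_tok = 3, 4, 2

logits_TE = jax.random.normal(jax.random.PRNGKey(0), (num_tokens, num_experts))
weights_TX, indices_TX = jax.lax.top_k(logits_TE, experts_per_tok)
weights_TX = jax.nn.softmax(weights_TX, axis=-1)  # normalize over selected experts

# one_hot is (T, X, E); multiplying by weights_TX[..., None] and summing over X
# places each selected weight in its expert column and leaves zeros elsewhere.
one_hot_TXE = jax.nn.one_hot(indices_TX, num_classes=num_experts)
full_weights_TE = jnp.sum(one_hot_TXE * weights_TX[..., None], axis=1)

print(full_weights_TE.shape)         # (3, 4)
print(full_weights_TE.sum(axis=-1))  # ~1.0 per token, since the top-k weights sum to 1

Each row of full_weights_TE is zero except in the columns of that token's selected experts, which is what lets _moe_fwd contract it against the stacked expert outputs with a single einsum.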
@@ -0,0 +1,280 @@
import math
from dataclasses import dataclass, field
from typing import Optional, Tuple

import jax
from flax import nnx
from jax import numpy as jnp
from jax.experimental.layout import Layout, with_layout_constraint
from jax.sharding import NamedSharding, PartitionSpec


@dataclass(kw_only=True)
class RotaryEmbedding(nnx.Module):
    """
    An implementation of the original rotary positional embedding.
    """
    rotary_dim: int
    rope_theta: float
    original_max_position_embeddings: int
    dtype: jnp.dtype
    sin_cos_cache: Optional[jax.Array] = field(init=False, default=None)

    def initialize_cache(self):
        """Computes and caches the sin/cos embeddings."""
        if self.sin_cos_cache is None:
            self.sin_cos_cache = self._compute_sin_cos()

    def _compute_inv_freq(self):
        fractions_H = jnp.arange(0, self.rotary_dim, 2,
                                 dtype=jnp.float32) / self.rotary_dim
        inv_freq_H = 1.0 / (self.rope_theta**fractions_H)
        return inv_freq_H

    def _compute_sin_cos(self):
        inv_freq_H = self._compute_inv_freq()
        t = jnp.arange(self.original_max_position_embeddings,
                       dtype=jnp.float32)

        freqs = jnp.einsum("...T,k->...Tk",
                           t,
                           inv_freq_H,
                           precision=jax.lax.Precision.HIGHEST)
        sin, cos = jnp.sin(freqs), jnp.cos(freqs)
        cache = jnp.concatenate((cos, sin), axis=-1)
        return cache

    def apply_rope(self, positions: jax.Array, x_TNH: jax.Array):
        assert x_TNH.ndim == 3
        assert self.sin_cos_cache is not None, "RoPE cache not initialized."
        cos_sin_TH = self.sin_cos_cache[positions]
        # cos, sin: (T, H/2)
        cos_TH, sin_TH = jnp.split(cos_sin_TH, 2, axis=-1)
        assert sin_TH.ndim == 2 and cos_TH.ndim == 2
        # cos, sin: (T, 1, H/2)
        cos_T1H, sin_T1H = cos_TH[:, None, :], sin_TH[:, None, :]
        # first_half, second_half: (T, N, H/2)
        first_half_TNH, second_half_TNH = jnp.split(x_TNH, 2, axis=-1)
        combined = jnp.concatenate([
            first_half_TNH * cos_T1H - second_half_TNH * sin_T1H,
            second_half_TNH * cos_T1H + first_half_TNH * sin_T1H
        ],
                                   axis=-1)
        return combined.astype(self.dtype)


@dataclass(kw_only=True)
class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
    """
    Rotary Embedding for deepseek, with scaling and YaRN method.
    """
    scaling_factor: float
    beta_fast: int = 32
    beta_slow: int = 1
    mscale_value: float = 1
    mscale_all_dim: float = 0

    def initialize_cache(self, mesh: jax.sharding.Mesh):
        """Computes and caches the sin/cos embeddings."""
        # The second condition is for the Qwix case, where we need to call `initialize_cache` on
        # the abstract model. Thus, when we go to call `initialize_cache` on the concrete model,
        # this method will have been called already, but we need to recompute the cache so that
        # it's concrete (otherwise, it'll still be a jax.ShapeDtypeStruct).
        if self.sin_cos_cache is not None and not isinstance(
                self.sin_cos_cache, jax.ShapeDtypeStruct):
            return
        mscale_val = _yarn_get_mscale(
            self.scaling_factor, self.mscale_value) / _yarn_get_mscale(
                self.scaling_factor, self.mscale_all_dim)
        replicated_sharding = NamedSharding(mesh, PartitionSpec())
        self.mscale = jax.device_put(mscale_val, replicated_sharding)
        self.sin_cos_cache = self._compute_sin_cos()

    def _compute_inv_freq(self):
        fractions = jnp.arange(0, self.rotary_dim, 2,
                               dtype=jnp.float32) / self.rotary_dim
        inv_freq_extrapolation = 1.0 / (self.rope_theta**fractions)
        inv_freq_interpolation = 1.0 / (self.scaling_factor *
                                        self.rope_theta**fractions)
        low, high = _yarn_find_correction_range(
            self.beta_fast, self.beta_slow, self.rotary_dim, self.rope_theta,
            self.original_max_position_embeddings)

        # Get n-d rotational scaling corrected for extrapolation
        inv_freq_mask = 1 - _yarn_linear_ramp_mask(
            low, high, self.rotary_dim // 2).astype(jnp.float32)
        inv_freq = inv_freq_interpolation * (
            1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask
        return inv_freq

    @jax.jit
    def _compute_sin_cos(self):
        inv_freq_H = self._compute_inv_freq()
        t = jnp.arange(self.original_max_position_embeddings *
                       self.scaling_factor,
                       dtype=jnp.float32)
        freqs = jnp.einsum("...T,k->...Tk", t, inv_freq_H)
        sin, cos = jnp.sin(freqs) * self.mscale, jnp.cos(freqs) * self.mscale
        cache = jnp.concatenate((cos, sin), axis=-1)
        H = cache.shape[1]
        target_dim = ((H - 1) // 128 + 1) * 128
        padding_amount = target_dim - self.rotary_dim
        pad_width = ((0, 0), (0, padding_amount))
        cache_padded = jnp.pad(cache, pad_width, mode='constant')
        desired_layout = Layout(major_to_minor=(1, 0))
        cache_padded = with_layout_constraint(cache_padded, desired_layout)
        return cache_padded

    def apply_rope(self, positions: jax.Array, x_TNH: jax.Array):
        assert x_TNH.ndim == 3
        assert self.sin_cos_cache is not None, "RoPE cache not initialized."
        cos_sin_padded = self.sin_cos_cache[positions]
        cos_sin_TH = cos_sin_padded[:, :self.rotary_dim]
        # cos, sin: (T, H/2)
        cos_TH, sin_TH = jnp.split(cos_sin_TH, 2, axis=-1)
        assert sin_TH.ndim == 2 and cos_TH.ndim == 2
        # cos, sin: (T, 1, H/2)
        cos_T1H, sin_T1H = cos_TH[:, None, :], sin_TH[:, None, :]
        # even, odd: (T, N, H/2)
        even_TNH, odd_TNH = x_TNH[..., ::2], x_TNH[..., 1::2]
        combined_TNH = jnp.stack([
            even_TNH * cos_T1H - odd_TNH * sin_T1H,
            odd_TNH * cos_T1H + even_TNH * sin_T1H
        ],
                                 axis=-1).reshape(x_TNH.shape)
        return combined_TNH.astype(self.dtype)


# Calculates the temperature scaling factor for YaRN to adjust
# RoPE embedding magnitudes.
def _yarn_get_mscale(scale, mscale):
    return jnp.where(scale <= 1, 1.0, 0.1 * mscale * jnp.log(scale) + 1.0)


# Inverses dim formula to find dim based on number of rotations.
def _yarn_find_correction_dim(num_rotations,
                              dim,
                              base=10000,
                              max_position_embeddings=2048):
    return (dim * math.log(max_position_embeddings /
                           (num_rotations * 2 * math.pi))) / (2 *
                                                              math.log(base))


# Finds dim range bounds based on rotations.
def _yarn_find_correction_range(low_rot,
                                high_rot,
                                dim,
                                base=10000,
                                max_position_embeddings=2048):
    low = math.floor(
        _yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings))
    high = math.ceil(
        _yarn_find_correction_dim(high_rot, dim, base,
                                  max_position_embeddings))
    return max(low, 0), min(high, dim - 1)  # Clamp values just in case


# Creates a 1D mask that ramps linearly from 0 to 1 between min and max indices.
def _yarn_linear_ramp_mask(min, max, dim):
    if min == max:
        max += 0.001  # Prevent singularity

    linear_func = (jnp.arange(dim, dtype=jnp.float32) - min) / (max - min)
    ramp_func = jnp.clip(linear_func, 0, 1)
    return ramp_func


@dataclass(kw_only=True)
class GptOssRotaryEmbedding(nnx.Module):
    """
    JAX implementation of the Rotary Positional Embedding with YaRN scaling.
    """
    head_dim: int
    rope_theta: float
    dtype: jnp.dtype
    initial_context_length: int = 4096
    rope_scaling_factor: float = 1.0
    rope_ntk_alpha: float = 1.0
    rope_ntk_beta: float = 32.0

    def _compute_concentration_and_inv_freq(self) -> Tuple[float, jax.Array]:
        """
        Computes the inverse frequencies and concentration factor for YaRN.
        See YaRN paper: https://arxiv.org/abs/2309.00071
        """
        freq = self.rope_theta**(
            jnp.arange(0, self.head_dim, 2, dtype=jnp.float32) / self.head_dim)

        if self.rope_scaling_factor > 1.0:
            concentration = 0.1 * jnp.log(self.rope_scaling_factor) + 1.0

            d_half = self.head_dim / 2
            # NTK by parts
            low = (d_half * jnp.log(self.initial_context_length /
                                    (self.rope_ntk_beta * 2 * jnp.pi)) /
                   jnp.log(self.rope_theta))
            high = (d_half * jnp.log(self.initial_context_length /
                                     (self.rope_ntk_alpha * 2 * jnp.pi)) /
                    jnp.log(self.rope_theta))

            interpolation = 1.0 / (self.rope_scaling_factor * freq)
            extrapolation = 1.0 / freq

            ramp = (jnp.arange(d_half, dtype=jnp.float32) - low) / (high - low)
            mask = 1 - jnp.clip(ramp, 0, 1)

            inv_freq = interpolation * (1 - mask) + extrapolation * mask
        else:
            concentration = 1.0
            inv_freq = 1.0 / freq

        return concentration, inv_freq

    def _compute_cos_sin(self,
                         positions: jax.Array) -> Tuple[jax.Array, jax.Array]:
        """Computes cosine and sine embeddings for given positions."""
        concentration, inv_freq_H = self._compute_concentration_and_inv_freq()

        # freqs: (T, H/2)
        freqs = jnp.einsum("T,H->TH",
                           positions.astype(jnp.float32),
                           inv_freq_H,
                           precision=jax.lax.Precision.HIGHEST)

        cos = jnp.cos(freqs) * concentration
        sin = jnp.sin(freqs) * concentration
        return cos, sin

    def __call__(self, query_TNH: jax.Array, key_TNH: jax.Array,
                 positions: jax.Array) -> Tuple[jax.Array, jax.Array]:
        """
        Applies rotary embeddings to query and key tensors.
        Args:
            query_TNH: Query tensor with shape (num_tokens, num_heads, head_dim)
            key_TNH: Key tensor with shape (num_tokens, num_kv_heads, head_dim)
            positions: A 1D array of token positions.
        """
        # cos, sin: (T, H/2)
        cos_TH, sin_TH = self._compute_cos_sin(positions)

        # Reshape for broadcasting: (T, 1, H/2)
        cos_T1H = cos_TH[:, None, :]
        sin_T1H = sin_TH[:, None, :]

        def _apply_rotation(x_TNH: jax.Array) -> jax.Array:
            # Split the last dimension
            first_half, second_half = jnp.split(x_TNH, 2, axis=-1)

            # Apply rotation
            rotated_x = jnp.concatenate([
                first_half * cos_T1H - second_half * sin_T1H,
                second_half * cos_T1H + first_half * sin_T1H
            ],
                                        axis=-1)
            return rotated_x.astype(self.dtype)

        rotated_query = _apply_rotation(query_TNH)
        rotated_key = _apply_rotation(key_TNH)

        return rotated_query, rotated_key
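The final hunk appears, from its +280 line count and its RotaryEmbedding, DeepseekScalingRotaryEmbedding, and GptOssRotaryEmbedding classes, to correspond to tpu_inference/layers/jax/rope.py. All three variants reduce to the same 2-D rotation of paired channels; the sketch below applies that rotation to a toy (tokens, heads, head_dim) tensor with an unscaled frequency schedule, assuming only jax. Sizes and names are illustrative, not the package's implementation.

# Hedged sketch (toy sizes, unscaled RoPE frequencies): the half-split rotary
# rotation used by the classes above, applied to a (T, N, H) tensor.
import jax
import jax.numpy as jnp

T, N, H = 5, 2, 8          # tokens, heads, head_dim (illustrative)
theta = 10000.0

positions = jnp.arange(T)
inv_freq = 1.0 / theta ** (jnp.arange(0, H, 2, dtype=jnp.float32) / H)   # (H/2,)
freqs = positions[:, None].astype(jnp.float32) * inv_freq[None, :]       # (T, H/2)
cos, sin = jnp.cos(freqs)[:, None, :], jnp.sin(freqs)[:, None, :]        # (T, 1, H/2)

x = jax.random.normal(jax.random.PRNGKey(0), (T, N, H))
first, second = jnp.split(x, 2, axis=-1)
rotated = jnp.concatenate(
    [first * cos - second * sin,      # rotate each (first_i, second_i) pair
     second * cos + first * sin],
    axis=-1)
print(rotated.shape)  # (5, 2, 8)

RotaryEmbedding.apply_rope and GptOssRotaryEmbedding pair the first and second halves of the head dimension as shown here, while DeepseekScalingRotaryEmbedding pairs interleaved even/odd channels and additionally scales its cached sin/cos by the YaRN mscale factor; the rotation arithmetic itself is the same.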