tpu_inference-0.12.0.dev20251222-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (260)
  1. tests/__init__.py +13 -0
  2. tests/core/__init__.py +13 -0
  3. tests/core/test_core_tpu.py +513 -0
  4. tests/core/test_disagg_executor.py +60 -0
  5. tests/core/test_disagg_utils.py +67 -0
  6. tests/core/test_dp_scheduler.py +724 -0
  7. tests/core/test_init.py +63 -0
  8. tests/distributed/__init__.py +13 -0
  9. tests/distributed/test_distributed_utils.py +120 -0
  10. tests/distributed/test_tpu_connector.py +478 -0
  11. tests/e2e/__init__.py +13 -0
  12. tests/e2e/test_async_scheduler.py +211 -0
  13. tests/e2e/test_data_parallel.py +393 -0
  14. tests/e2e/test_local_disagg.py +257 -0
  15. tests/e2e/test_model_loader.py +268 -0
  16. tests/e2e/test_multi_modal_inference.py +111 -0
  17. tests/e2e/test_pipeline_parallel.py +265 -0
  18. tests/e2e/test_runai_model_streamer_loader.py +104 -0
  19. tests/e2e/test_sampling_params.py +269 -0
  20. tests/e2e/test_speculative_decoding.py +291 -0
  21. tests/e2e/test_structured_decoding.py +46 -0
  22. tests/executors/__init__.py +13 -0
  23. tests/executors/test_ray_distributed_executor.py +199 -0
  24. tests/experimental/__init__.py +13 -0
  25. tests/experimental/test_llama3_jax_stashed.py +208 -0
  26. tests/kernels/__init__.py +13 -0
  27. tests/kernels/collectives/__init__.py +13 -0
  28. tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
  29. tests/kernels/fused_moe_v1_test.py +388 -0
  30. tests/kernels/gmm_test.py +205 -0
  31. tests/kernels/mla_v1_test.py +498 -0
  32. tests/kernels/quantized_matmul_kernel_test.py +159 -0
  33. tests/kernels/ragged_kv_cache_update_v2_test.py +248 -0
  34. tests/kernels/ragged_paged_attention_kernel_v2_test.py +414 -0
  35. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +565 -0
  36. tests/kernels/ragged_paged_attention_kernel_v3_test.py +520 -0
  37. tests/layers/__init__.py +13 -0
  38. tests/layers/common/__init__.py +13 -0
  39. tests/layers/common/test_attention_interface.py +156 -0
  40. tests/layers/common/test_quantization.py +149 -0
  41. tests/layers/jax/__init__.py +13 -0
  42. tests/layers/jax/attention/__init__.py +13 -0
  43. tests/layers/jax/attention/test_common_attention.py +103 -0
  44. tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
  45. tests/layers/jax/attention/test_llama4_attention.py +135 -0
  46. tests/layers/jax/moe/__init__.py +13 -0
  47. tests/layers/jax/moe/test_deepseek_moe.py +235 -0
  48. tests/layers/jax/sample/__init__.py +13 -0
  49. tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
  50. tests/layers/jax/sample/test_sampling.py +115 -0
  51. tests/layers/jax/sample/test_sampling_metadata.py +254 -0
  52. tests/layers/jax/test_layers.py +155 -0
  53. tests/layers/jax/test_qwix.py +969 -0
  54. tests/layers/jax/test_rope.py +93 -0
  55. tests/layers/jax/test_sharding.py +159 -0
  56. tests/layers/jax/test_transformer_block.py +152 -0
  57. tests/layers/vllm/__init__.py +13 -0
  58. tests/layers/vllm/test_attention.py +363 -0
  59. tests/layers/vllm/test_awq.py +405 -0
  60. tests/layers/vllm/test_compressed_tensors_moe.py +202 -0
  61. tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +403 -0
  62. tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +426 -0
  63. tests/layers/vllm/test_fp8.py +17 -0
  64. tests/layers/vllm/test_mxfp4.py +297 -0
  65. tests/layers/vllm/test_unquantized.py +621 -0
  66. tests/layers/vllm/utils.py +72 -0
  67. tests/lora/__init__.py +13 -0
  68. tests/lora/conftest.py +46 -0
  69. tests/lora/test_bgmv.py +57 -0
  70. tests/lora/test_layers.py +666 -0
  71. tests/lora/test_lora.py +147 -0
  72. tests/lora/test_lora_perf.py +67 -0
  73. tests/lora/utils.py +88 -0
  74. tests/models/__init__.py +13 -0
  75. tests/models/common/__init__.py +13 -0
  76. tests/models/common/test_model_loader.py +455 -0
  77. tests/models/jax/__init__.py +13 -0
  78. tests/models/jax/test_deepseek_v3.py +401 -0
  79. tests/models/jax/test_llama3.py +184 -0
  80. tests/models/jax/test_llama4.py +298 -0
  81. tests/models/jax/test_llama_eagle3.py +197 -0
  82. tests/models/jax/test_llama_guard_4.py +242 -0
  83. tests/models/jax/test_qwen2.py +172 -0
  84. tests/models/jax/test_qwen2_5_vl.py +606 -0
  85. tests/models/jax/test_qwen3.py +169 -0
  86. tests/models/jax/test_weight_loading.py +180 -0
  87. tests/models/jax/utils/__init__.py +13 -0
  88. tests/models/jax/utils/test_multi_modal_utils.py +212 -0
  89. tests/platforms/__init__.py +13 -0
  90. tests/platforms/test_tpu_platform.py +54 -0
  91. tests/runner/__init__.py +13 -0
  92. tests/runner/test_block_table.py +395 -0
  93. tests/runner/test_input_batch.py +226 -0
  94. tests/runner/test_kv_cache.py +220 -0
  95. tests/runner/test_kv_cache_manager.py +498 -0
  96. tests/runner/test_multimodal_manager.py +429 -0
  97. tests/runner/test_persistent_batch_manager.py +84 -0
  98. tests/runner/test_speculative_decoding_manager.py +368 -0
  99. tests/runner/test_structured_decoding_manager.py +220 -0
  100. tests/runner/test_tpu_runner.py +202 -0
  101. tests/runner/test_tpu_runner_dp.py +1033 -0
  102. tests/runner/test_tpu_runner_mesh.py +200 -0
  103. tests/runner/test_utils.py +411 -0
  104. tests/spec_decode/__init__.py +13 -0
  105. tests/spec_decode/test_eagle3.py +311 -0
  106. tests/test_base.py +215 -0
  107. tests/test_envs.py +280 -0
  108. tests/test_tpu_info.py +134 -0
  109. tests/test_utils.py +193 -0
  110. tests/worker/__init__.py +13 -0
  111. tests/worker/tpu_worker_test.py +414 -0
  112. tpu_inference/__init__.py +67 -0
  113. tpu_inference/core/__init__.py +13 -0
  114. tpu_inference/core/core_tpu.py +786 -0
  115. tpu_inference/core/disagg_executor.py +118 -0
  116. tpu_inference/core/disagg_utils.py +49 -0
  117. tpu_inference/core/sched/__init__.py +13 -0
  118. tpu_inference/core/sched/dp_scheduler.py +814 -0
  119. tpu_inference/distributed/__init__.py +13 -0
  120. tpu_inference/distributed/jax_parallel_state.py +81 -0
  121. tpu_inference/distributed/tpu_connector.py +732 -0
  122. tpu_inference/distributed/utils.py +112 -0
  123. tpu_inference/env_override.py +9 -0
  124. tpu_inference/envs.py +191 -0
  125. tpu_inference/executors/__init__.py +13 -0
  126. tpu_inference/executors/ray_distributed_executor.py +399 -0
  127. tpu_inference/experimental/__init__.py +13 -0
  128. tpu_inference/experimental/llama3_jax_stashed.py +272 -0
  129. tpu_inference/kernels/__init__.py +13 -0
  130. tpu_inference/kernels/collectives/__init__.py +13 -0
  131. tpu_inference/kernels/collectives/all_gather_matmul.py +741 -0
  132. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +65 -0
  133. tpu_inference/kernels/collectives/util.py +47 -0
  134. tpu_inference/kernels/flash_attention/__init__.py +13 -0
  135. tpu_inference/kernels/flash_attention/kernel.py +772 -0
  136. tpu_inference/kernels/fused_moe/__init__.py +13 -0
  137. tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
  138. tpu_inference/kernels/fused_moe/v1/kernel.py +1612 -0
  139. tpu_inference/kernels/megablox/__init__.py +13 -0
  140. tpu_inference/kernels/megablox/common.py +54 -0
  141. tpu_inference/kernels/megablox/gmm.py +646 -0
  142. tpu_inference/kernels/mla/__init__.py +13 -0
  143. tpu_inference/kernels/mla/v1/__init__.py +13 -0
  144. tpu_inference/kernels/mla/v1/kernel.py +1340 -0
  145. tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
  146. tpu_inference/kernels/quantized_matmul/kernel.py +456 -0
  147. tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
  148. tpu_inference/kernels/quantized_matmul/util.py +58 -0
  149. tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
  150. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
  151. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +876 -0
  152. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +288 -0
  153. tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
  154. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
  155. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1594 -0
  156. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +1586 -0
  157. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +4460 -0
  158. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +548 -0
  159. tpu_inference/kernels/ragged_paged_attention/v3/util.py +65 -0
  160. tpu_inference/layers/__init__.py +13 -0
  161. tpu_inference/layers/common/__init__.py +13 -0
  162. tpu_inference/layers/common/attention_interface.py +403 -0
  163. tpu_inference/layers/common/attention_metadata.py +48 -0
  164. tpu_inference/layers/common/binary_search.py +295 -0
  165. tpu_inference/layers/common/quant_methods.py +23 -0
  166. tpu_inference/layers/common/quantization.py +270 -0
  167. tpu_inference/layers/common/sharding.py +600 -0
  168. tpu_inference/layers/jax/__init__.py +13 -0
  169. tpu_inference/layers/jax/attention/__init__.py +13 -0
  170. tpu_inference/layers/jax/attention/attention.py +268 -0
  171. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +547 -0
  172. tpu_inference/layers/jax/attention/gpt_oss_attention.py +275 -0
  173. tpu_inference/layers/jax/attention/llama4_attention.py +167 -0
  174. tpu_inference/layers/jax/base.py +165 -0
  175. tpu_inference/layers/jax/constants.py +101 -0
  176. tpu_inference/layers/jax/layers.py +315 -0
  177. tpu_inference/layers/jax/misc.py +30 -0
  178. tpu_inference/layers/jax/moe/__init__.py +13 -0
  179. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +615 -0
  180. tpu_inference/layers/jax/moe/gpt_oss_moe.py +199 -0
  181. tpu_inference/layers/jax/moe/moe.py +249 -0
  182. tpu_inference/layers/jax/pp_utils.py +53 -0
  183. tpu_inference/layers/jax/rope.py +294 -0
  184. tpu_inference/layers/jax/rope_interface.py +228 -0
  185. tpu_inference/layers/jax/sample/__init__.py +13 -0
  186. tpu_inference/layers/jax/sample/rejection_sampler.py +528 -0
  187. tpu_inference/layers/jax/sample/sampling.py +110 -0
  188. tpu_inference/layers/jax/sample/sampling_metadata.py +90 -0
  189. tpu_inference/layers/jax/transformer_block.py +121 -0
  190. tpu_inference/layers/vllm/__init__.py +13 -0
  191. tpu_inference/layers/vllm/attention.py +221 -0
  192. tpu_inference/layers/vllm/fused_moe.py +502 -0
  193. tpu_inference/layers/vllm/linear_common.py +221 -0
  194. tpu_inference/layers/vllm/quantization/__init__.py +55 -0
  195. tpu_inference/layers/vllm/quantization/awq.py +221 -0
  196. tpu_inference/layers/vllm/quantization/common.py +124 -0
  197. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
  198. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +135 -0
  199. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +266 -0
  200. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
  201. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +222 -0
  202. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +150 -0
  203. tpu_inference/layers/vllm/quantization/fp8.py +118 -0
  204. tpu_inference/layers/vllm/quantization/mxfp4.py +396 -0
  205. tpu_inference/layers/vllm/quantization/unquantized.py +416 -0
  206. tpu_inference/layers/vllm/sharding.py +244 -0
  207. tpu_inference/logger.py +10 -0
  208. tpu_inference/lora/__init__.py +13 -0
  209. tpu_inference/lora/torch_lora_ops.py +98 -0
  210. tpu_inference/lora/torch_punica_tpu.py +310 -0
  211. tpu_inference/models/__init__.py +13 -0
  212. tpu_inference/models/common/__init__.py +13 -0
  213. tpu_inference/models/common/model_loader.py +520 -0
  214. tpu_inference/models/jax/__init__.py +13 -0
  215. tpu_inference/models/jax/deepseek_v3.py +978 -0
  216. tpu_inference/models/jax/gpt_oss.py +508 -0
  217. tpu_inference/models/jax/jax_intermediate_tensor.py +93 -0
  218. tpu_inference/models/jax/llama3.py +436 -0
  219. tpu_inference/models/jax/llama4.py +643 -0
  220. tpu_inference/models/jax/llama_eagle3.py +350 -0
  221. tpu_inference/models/jax/llama_guard_4.py +375 -0
  222. tpu_inference/models/jax/qwen2.py +390 -0
  223. tpu_inference/models/jax/qwen2_5_vl.py +1232 -0
  224. tpu_inference/models/jax/qwen3.py +318 -0
  225. tpu_inference/models/jax/utils/__init__.py +13 -0
  226. tpu_inference/models/jax/utils/file_utils.py +110 -0
  227. tpu_inference/models/jax/utils/multi_modal_utils.py +177 -0
  228. tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
  229. tpu_inference/models/jax/utils/qwix/qwix_utils.py +713 -0
  230. tpu_inference/models/jax/utils/weight_utils.py +621 -0
  231. tpu_inference/models/vllm/__init__.py +13 -0
  232. tpu_inference/models/vllm/vllm_model_wrapper.py +307 -0
  233. tpu_inference/models/vllm/vllm_model_wrapper_context.py +59 -0
  234. tpu_inference/platforms/__init__.py +16 -0
  235. tpu_inference/platforms/tpu_platform.py +258 -0
  236. tpu_inference/runner/__init__.py +13 -0
  237. tpu_inference/runner/block_table.py +122 -0
  238. tpu_inference/runner/compilation_manager.py +890 -0
  239. tpu_inference/runner/input_batch.py +435 -0
  240. tpu_inference/runner/kv_cache.py +166 -0
  241. tpu_inference/runner/kv_cache_manager.py +508 -0
  242. tpu_inference/runner/lora_utils.py +106 -0
  243. tpu_inference/runner/multimodal_manager.py +231 -0
  244. tpu_inference/runner/persistent_batch_manager.py +296 -0
  245. tpu_inference/runner/speculative_decoding_manager.py +262 -0
  246. tpu_inference/runner/structured_decoding_manager.py +101 -0
  247. tpu_inference/runner/tpu_runner.py +1768 -0
  248. tpu_inference/runner/utils.py +426 -0
  249. tpu_inference/spec_decode/__init__.py +13 -0
  250. tpu_inference/spec_decode/jax/__init__.py +13 -0
  251. tpu_inference/spec_decode/jax/eagle3.py +430 -0
  252. tpu_inference/tpu_info.py +92 -0
  253. tpu_inference/utils.py +345 -0
  254. tpu_inference/worker/__init__.py +13 -0
  255. tpu_inference/worker/tpu_worker.py +468 -0
  256. tpu_inference-0.12.0.dev20251222.dist-info/METADATA +106 -0
  257. tpu_inference-0.12.0.dev20251222.dist-info/RECORD +260 -0
  258. tpu_inference-0.12.0.dev20251222.dist-info/WHEEL +5 -0
  259. tpu_inference-0.12.0.dev20251222.dist-info/licenses/LICENSE +201 -0
  260. tpu_inference-0.12.0.dev20251222.dist-info/top_level.txt +2 -0
tpu_inference/layers/jax/rope.py
@@ -0,0 +1,294 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import math
+ from dataclasses import dataclass, field
+ from typing import Optional, Tuple
+
+ import jax
+ from flax import nnx
+ from jax import numpy as jnp
+ from jax.experimental.layout import Layout, with_layout_constraint
+ from jax.sharding import NamedSharding, PartitionSpec
+
+
+ @dataclass(kw_only=True)
+ class RotaryEmbedding(nnx.Module):
+     """
+     An implementation of the original rotary positional embedding.
+     """
+     rotary_dim: int
+     rope_theta: float
+     original_max_position_embeddings: int
+     dtype: jnp.dtype
+     sin_cos_cache: Optional[jax.Array] = field(init=False, default=None)
+
+     def initialize_cache(self):
+         """Computes and caches the sin/cos embeddings."""
+         if self.sin_cos_cache is None:
+             self.sin_cos_cache = self._compute_sin_cos()
+
+     def _compute_inv_freq(self):
+         fractions_H = jnp.arange(0, self.rotary_dim, 2,
+                                  dtype=jnp.float32) / self.rotary_dim
+         inv_freq_H = 1.0 / (self.rope_theta**fractions_H)
+         return inv_freq_H
+
+     def _compute_sin_cos(self):
+         inv_freq_H = self._compute_inv_freq()
+         t = jnp.arange(self.original_max_position_embeddings,
+                        dtype=jnp.float32)
+
+         freqs = jnp.einsum("...T,k->...Tk",
+                            t,
+                            inv_freq_H,
+                            precision=jax.lax.Precision.HIGHEST)
+         sin, cos = jnp.sin(freqs), jnp.cos(freqs)
+         cache = jnp.concatenate((cos, sin), axis=-1)
+         return cache
+
+     def apply_rope(self, positions: jax.Array, x_TNH: jax.Array):
+         assert x_TNH.ndim == 3
+         assert self.sin_cos_cache is not None, "RoPE cache not initialized."
+         cos_sin_TH = self.sin_cos_cache[positions]
+         # cos, sin: (T, H/2)
+         cos_TH, sin_TH = jnp.split(cos_sin_TH, 2, axis=-1)
+         assert sin_TH.ndim == 2 and cos_TH.ndim == 2
+         # cos, sin: (T, 1, H/2)
+         cos_T1H, sin_T1H = cos_TH[:, None, :], sin_TH[:, None, :]
+         # first_half, second_half: (T, N, H/2)
+         first_half_TNH, second_half_TNH = jnp.split(x_TNH, 2, axis=-1)
+         combined = jnp.concatenate([
+             first_half_TNH * cos_T1H - second_half_TNH * sin_T1H,
+             second_half_TNH * cos_T1H + first_half_TNH * sin_T1H
+         ],
+                                    axis=-1)
+         return combined.astype(self.dtype)
+
+
+ @dataclass(kw_only=True)
+ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
+     """
+     Rotary embedding for DeepSeek, with scaling via the YaRN method.
+     """
+     scaling_factor: float
+     beta_fast: int = 32
+     beta_slow: int = 1
+     mscale_value: float = 1
+     mscale_all_dim: float = 0
+
+     def initialize_cache(self, mesh: jax.sharding.Mesh):
+         """Computes and caches the sin/cos embeddings."""
+         # The second condition is for the Qwix case, where we need to call
+         # `initialize_cache` on the abstract model. Thus, when we go to call
+         # `initialize_cache` on the concrete model, this method will have been
+         # called already, but we need to recompute the cache so that it's
+         # concrete (otherwise, it'll still be a jax.ShapeDtypeStruct).
+         if self.sin_cos_cache is not None and not isinstance(
+                 self.sin_cos_cache, jax.ShapeDtypeStruct):
+             return
+         mscale_val = _yarn_get_mscale(
+             self.scaling_factor, self.mscale_value) / _yarn_get_mscale(
+                 self.scaling_factor, self.mscale_all_dim)
+         replicated_sharding = NamedSharding(mesh, PartitionSpec())
+         self.mscale = jax.device_put(mscale_val, replicated_sharding)
+         self.sin_cos_cache = self._compute_sin_cos()
+
+     def _compute_inv_freq(self):
+         fractions = jnp.arange(0, self.rotary_dim, 2,
+                                dtype=jnp.float32) / self.rotary_dim
+         inv_freq_extrapolation = 1.0 / (self.rope_theta**fractions)
+         inv_freq_interpolation = 1.0 / (self.scaling_factor *
+                                         self.rope_theta**fractions)
+         low, high = _yarn_find_correction_range(
+             self.beta_fast, self.beta_slow, self.rotary_dim, self.rope_theta,
+             self.original_max_position_embeddings)
+
+         # Get n-d rotational scaling corrected for extrapolation.
+         inv_freq_mask = 1 - _yarn_linear_ramp_mask(
+             low, high, self.rotary_dim // 2).astype(jnp.float32)
+         inv_freq = inv_freq_interpolation * (
+             1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask
+         return inv_freq
+
+     @jax.jit
+     def _compute_sin_cos(self):
+         inv_freq_H = self._compute_inv_freq()
+         t = jnp.arange(self.original_max_position_embeddings *
+                        self.scaling_factor,
+                        dtype=jnp.float32)
+         freqs = jnp.einsum("...T,k->...Tk", t, inv_freq_H)
+         sin, cos = jnp.sin(freqs) * self.mscale, jnp.cos(freqs) * self.mscale
+         cache = jnp.concatenate((cos, sin), axis=-1)
+         H = cache.shape[1]
+         target_dim = ((H - 1) // 128 + 1) * 128
+         padding_amount = target_dim - self.rotary_dim
+         pad_width = ((0, 0), (0, padding_amount))
+         cache_padded = jnp.pad(cache, pad_width, mode='constant')
+         desired_layout = Layout(major_to_minor=(1, 0))
+         cache_padded = with_layout_constraint(cache_padded, desired_layout)
+         return cache_padded
+
+     def apply_rope(self, positions: jax.Array, x_TNH: jax.Array):
+         assert x_TNH.ndim == 3
+         assert self.sin_cos_cache is not None, "RoPE cache not initialized."
+         cos_sin_padded = self.sin_cos_cache[positions]
+         cos_sin_TH = cos_sin_padded[:, :self.rotary_dim]
+         # cos, sin: (T, H/2)
+         cos_TH, sin_TH = jnp.split(cos_sin_TH, 2, axis=-1)
+         assert sin_TH.ndim == 2 and cos_TH.ndim == 2
+         # cos, sin: (T, 1, H/2)
+         cos_T1H, sin_T1H = cos_TH[:, None, :], sin_TH[:, None, :]
+         # even, odd: (T, N, H/2)
+         even_TNH, odd_TNH = x_TNH[..., ::2], x_TNH[..., 1::2]
+         combined_TNH = jnp.stack([
+             even_TNH * cos_T1H - odd_TNH * sin_T1H,
+             odd_TNH * cos_T1H + even_TNH * sin_T1H
+         ],
+                                  axis=-1).reshape(x_TNH.shape)
+         return combined_TNH.astype(self.dtype)
+
+
+ # Calculates the temperature scaling factor for YaRN to adjust
+ # RoPE embedding magnitudes.
+ def _yarn_get_mscale(scale, mscale):
+     return jnp.where(scale <= 1, 1.0, 0.1 * mscale * jnp.log(scale) + 1.0)
+
+
+ # Inverts the dim formula to find dim based on the number of rotations.
+ def _yarn_find_correction_dim(num_rotations,
+                               dim,
+                               base=10000,
+                               max_position_embeddings=2048):
+     return (dim * math.log(max_position_embeddings /
+                            (num_rotations * 2 * math.pi))) / (2 *
+                                                               math.log(base))
+
+
+ # Finds dim range bounds based on rotations.
+ def _yarn_find_correction_range(low_rot,
+                                 high_rot,
+                                 dim,
+                                 base=10000,
+                                 max_position_embeddings=2048):
+     low = math.floor(
+         _yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings))
+     high = math.ceil(
+         _yarn_find_correction_dim(high_rot, dim, base,
+                                   max_position_embeddings))
+     return max(low, 0), min(high, dim - 1)  # Clamp values just in case
+
+
+ # Creates a 1D mask that ramps linearly from 0 to 1 between min and max indices.
+ def _yarn_linear_ramp_mask(min, max, dim):
+     if min == max:
+         max += 0.001  # Prevent singularity
+
+     linear_func = (jnp.arange(dim, dtype=jnp.float32) - min) / (max - min)
+     ramp_func = jnp.clip(linear_func, 0, 1)
+     return ramp_func
+
+
+ @dataclass(kw_only=True)
+ class GptOssRotaryEmbedding(nnx.Module):
+     """
+     JAX implementation of rotary positional embedding with YaRN scaling.
+     """
+     head_dim: int
+     rope_theta: float
+     dtype: jnp.dtype
+     initial_context_length: int = 4096
+     rope_scaling_factor: float = 1.0
+     rope_ntk_alpha: float = 1.0
+     rope_ntk_beta: float = 32.0
+
+     def _compute_concentration_and_inv_freq(self) -> Tuple[float, jax.Array]:
+         """
+         Computes the inverse frequencies and concentration factor for YaRN.
+         See the YaRN paper: https://arxiv.org/abs/2309.00071
+         """
+         freq = self.rope_theta**(
+             jnp.arange(0, self.head_dim, 2, dtype=jnp.float32) / self.head_dim)
+
+         if self.rope_scaling_factor > 1.0:
+             concentration = 0.1 * jnp.log(self.rope_scaling_factor) + 1.0
+
+             d_half = self.head_dim / 2
+             # NTK by parts
+             low = (d_half * jnp.log(self.initial_context_length /
+                                     (self.rope_ntk_beta * 2 * jnp.pi)) /
+                    jnp.log(self.rope_theta))
+             high = (d_half * jnp.log(self.initial_context_length /
+                                      (self.rope_ntk_alpha * 2 * jnp.pi)) /
+                     jnp.log(self.rope_theta))
+
+             interpolation = 1.0 / (self.rope_scaling_factor * freq)
+             extrapolation = 1.0 / freq
+
+             ramp = (jnp.arange(d_half, dtype=jnp.float32) - low) / (high - low)
+             mask = 1 - jnp.clip(ramp, 0, 1)
+
+             inv_freq = interpolation * (1 - mask) + extrapolation * mask
+         else:
+             concentration = 1.0
+             inv_freq = 1.0 / freq
+
+         return concentration, inv_freq
+
+     def _compute_cos_sin(self,
+                          positions: jax.Array) -> Tuple[jax.Array, jax.Array]:
+         """Computes cosine and sine embeddings for the given positions."""
+         concentration, inv_freq_H = self._compute_concentration_and_inv_freq()
+
+         # freqs: (T, H/2)
+         freqs = jnp.einsum("T,H->TH",
+                            positions.astype(jnp.float32),
+                            inv_freq_H,
+                            precision=jax.lax.Precision.HIGHEST)
+
+         cos = jnp.cos(freqs) * concentration
+         sin = jnp.sin(freqs) * concentration
+         return cos, sin
+
+     def __call__(self, query_TNH: jax.Array, key_TNH: jax.Array,
+                  positions: jax.Array) -> Tuple[jax.Array, jax.Array]:
+         """
+         Applies rotary embeddings to query and key tensors.
+
+         Args:
+             query_TNH: Query tensor with shape (num_tokens, num_heads, head_dim)
+             key_TNH: Key tensor with shape (num_tokens, num_kv_heads, head_dim)
+             positions: A 1D array of token positions.
+         """
+         # cos, sin: (T, H/2)
+         cos_TH, sin_TH = self._compute_cos_sin(positions)
+
+         # Reshape for broadcasting: (T, 1, H/2)
+         cos_T1H = cos_TH[:, None, :]
+         sin_T1H = sin_TH[:, None, :]
+
+         def _apply_rotation(x_TNH: jax.Array) -> jax.Array:
+             # Split the last dimension.
+             first_half, second_half = jnp.split(x_TNH, 2, axis=-1)
+
+             # Apply the rotation.
+             rotated_x = jnp.concatenate([
+                 first_half * cos_T1H - second_half * sin_T1H,
+                 second_half * cos_T1H + first_half * sin_T1H
+             ],
+                                         axis=-1)
+             return rotated_x.astype(self.dtype)
+
+         rotated_query = _apply_rotation(query_TNH)
+         rotated_key = _apply_rotation(key_TNH)
+
+         return rotated_query, rotated_key
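For reference, the hunk above adds the RoPE module at tpu_inference/layers/jax/rope.py. The sketch below is not part of the package; it is a minimal, hypothetical usage example for RotaryEmbedding, assuming the module is importable under that path and that head_dim equals rotary_dim (which the half/half split in apply_rope requires). DeepseekScalingRotaryEmbedding is used the same way, except its initialize_cache additionally takes a jax.sharding.Mesh so the YaRN mscale can be replicated across devices.

# Hypothetical usage sketch (illustrative shapes and values, not from the package).
import jax
import jax.numpy as jnp

from tpu_inference.layers.jax.rope import RotaryEmbedding

rope = RotaryEmbedding(
    rotary_dim=128,  # rotate the full head_dim
    rope_theta=10000.0,
    original_max_position_embeddings=4096,
    dtype=jnp.bfloat16,
)
rope.initialize_cache()  # precompute the concatenated (cos | sin) table

T, N, H = 8, 4, 128  # num_tokens, num_heads, head_dim
positions = jnp.arange(T)  # one position per token, shape (T,)
x_TNH = jax.random.normal(jax.random.PRNGKey(0), (T, N, H))

rotated_TNH = rope.apply_rope(positions, x_TNH)  # (T, N, H), cast to bfloat16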
tpu_inference/layers/jax/rope_interface.py
@@ -0,0 +1,228 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import math
+ from typing import Any, Dict, Optional
+
+ import jax
+ import jax.numpy as jnp
+
+
+ def apply_rope(
+         # (seq_len, num_heads, head_dim)
+         inputs: jax.Array,
+         # (3, seq_len) for M-RoPE, otherwise (seq_len,)
+         positions: jax.Array,
+         head_dim: int,
+         rope_theta: float = 10000,
+         rope_scaling: Optional[Dict[str, Any]] = None,
+         rope_input_ordering: str = "split",
+ ) -> jax.Array:
+     """
+     Applies rotary positional embedding using the sine and cosine strategy.
+
+     This implementation assumes the input tensor may include padding on the
+     last dimension (head_dim). RoPE is applied only to the first `head_dim`
+     features, and the result is padded back to the original dimension if
+     necessary.
+
+     If rope_input_ordering is "split", the pairs of inputs for each rotation
+     are taken one from the first and one from the second half of head_dim.
+     If it is "interleaved", adjacent values are paired for rotation.
+     """
+
+     # M-RoPE support for Qwen2.5-VL
+     if positions.ndim == 2 and positions.shape[0] == 3:
+         mrope_section = rope_scaling.get("mrope_section",
+                                          None) if rope_scaling else None
+         # NOTE: We assume mrope_section is always available,
+         # as Qwen2.5-VL is the only model using M-RoPE.
+         assert mrope_section is not None
+
+         split_indices = [mrope_section[0], mrope_section[0] + mrope_section[1]]
+
+         # Indices for the features to be rotated (first half of head_dim).
+         all_freq_indices = jnp.arange(head_dim // 2)
+
+         # Split the indices according to mrope_section. This is valid because
+         # split_indices are static.
+         freq_indices_split = jnp.split(all_freq_indices, split_indices)
+         # freq_indices_split is a list of 3 JAX arrays.
+
+         cos_list = []
+         sin_list = []
+
+         for i in range(3):  # For each of the 3 position dimensions.
+             current_indices = freq_indices_split[i]
+
+             if current_indices.size == 0:
+                 # This section is empty, skip it.
+                 continue
+
+             # inv_freq shape: (mrope_section[i],)
+             inv_freq = 1.0 / (rope_theta**(current_indices * 2.0 / head_dim))
+
+             # positions[i]: (seq_len,)
+             # freqs shape: (seq_len, mrope_section[i])
+             freqs = jnp.outer(positions[i], inv_freq)
+
+             cos_list.append(jnp.cos(freqs))
+             sin_list.append(jnp.sin(freqs))
+
+         # Concatenate along the feature dimension.
+         # cos, sin shape: (seq_len, head_dim//2)
+         cos = jnp.concatenate(cos_list, axis=1)
+         sin = jnp.concatenate(sin_list, axis=1)
+
+         # Add a num_heads dimension for broadcasting.
+         cos = cos[:, jnp.newaxis, :]  # Shape: (seq_len, 1, head_dim//2)
+         sin = sin[:, jnp.newaxis, :]  # Shape: (seq_len, 1, head_dim//2)
+
+         # Apply the rotation.
+         inputs_real = inputs[..., :head_dim // 2]
+         inputs_imag = inputs[..., head_dim // 2:head_dim]
+
+         outputs_real = inputs_real * cos - inputs_imag * sin
+         outputs_imag = inputs_real * sin + inputs_imag * cos
+
+         out = jnp.concatenate([outputs_real, outputs_imag], axis=-1)
+
+     # Standard RoPE
+     else:
+         # Calculate inverse frequencies (timescale).
+         fraction = 2 * jnp.arange(0, head_dim // 2) / head_dim
+         timescale = 1.0 / (rope_theta**fraction)
+
+         # Apply scaling if provided.
+         if rope_scaling:
+             timescale = apply_rope_scaling(timescale, rope_scaling)
+
+         # Prepare for rotation by calculating sin and cos values.
+         # `sinusoid_inp` gets shape (batch * seq_len, head_dim/2).
+         sinusoid_inp = positions[..., jnp.newaxis] * timescale[jnp.newaxis, :]
+
+         # Broadcast over the 'heads' dimension, assuming shape
+         # (batch * seq, heads, head_dim).
+         sinusoid_inp = sinusoid_inp[:, jnp.newaxis, ...]
+         sin = jnp.sin(sinusoid_inp)
+         cos = jnp.cos(sinusoid_inp)
+
+         if rope_input_ordering == "interleaved":
+             # Reshape to group adjacent features for rotation, matching
+             # new_apply_rope.
+             rotary_inputs = inputs[
+                 ..., :head_dim]  # Take just the non-padded amount.
+             reshaped_inputs = rotary_inputs.reshape(*rotary_inputs.shape[:-1],
+                                                     -1, 2)
+
+             # Apply the rotation.
+             first_half = reshaped_inputs[..., 0]
+             second_half = reshaped_inputs[..., 1]
+         else:
+             first_half = inputs[..., :head_dim // 2]
+             second_half = inputs[..., head_dim // 2:head_dim]
+
+         first_part = first_half * cos - second_half * sin
+         second_part = second_half * cos + first_half * sin
+
+         # Combine the rotated parts and reshape back.
+         if rope_input_ordering == "interleaved":
+             out_stacked = jnp.stack([first_part, second_part], axis=-1)
+             out = out_stacked.reshape(rotary_inputs.shape)
+         else:
+             out = jnp.concatenate([first_part, second_part], axis=-1)
+
+     # If the original input was padded, pad the output with zeros to match.
+     padded_head_dim = inputs.shape[-1]
+     if padded_head_dim > head_dim:
+         pad_width = padded_head_dim - head_dim
+         pad_config = [(0, 0)] * (out.ndim - 1) + [(0, pad_width)]
+         out = jnp.pad(out, pad_config)
+
+     return out.astype(inputs.dtype)
+
+
+ def apply_longrope(
+         inputs: jax.Array,
+         positions: jax.Array,
+         head_dim: int,
+         rope_scaling: Dict[str, Any],
+         original_max_position_embeddings: int,
+         max_position_embeddings: int,
+         rope_theta: float = 10000,
+ ) -> jax.Array:
+     # LongRoPE implementation specific to Phi-3, based on
+     # https://github.com/huggingface/transformers/blob/main/src/transformers/models/phi3/modeling_phi3.py#L197-L235
+
+     scale = max_position_embeddings / original_max_position_embeddings
+     if scale <= 1.0:
+         mscale = 1.0
+     else:
+         mscale = jnp.sqrt(1 + (jnp.log(scale) /
+                                jnp.log(original_max_position_embeddings)))
+
+     seq_len = inputs.shape[0]
+     if seq_len > original_max_position_embeddings:
+         long_factor = jnp.array(rope_scaling.get("long_factor"))
+         timescale = 1.0 / (long_factor * (rope_theta**(
+             (2 * jnp.arange(0, head_dim // 2)) / head_dim)))
+     else:
+         short_factor = jnp.array(rope_scaling.get("short_factor"))
+         timescale = 1.0 / (short_factor * (rope_theta**(
+             (2 * jnp.arange(0, head_dim // 2)) / head_dim)))
+
+     # Calculate the RoPE angles.
+     sinusoid_inp = positions[..., jnp.newaxis] * timescale[jnp.newaxis, :]
+     sinusoid_inp = sinusoid_inp[:, jnp.newaxis, ...]
+     sin = jnp.sin(sinusoid_inp) * mscale
+     cos = jnp.cos(sinusoid_inp) * mscale
+
+     # Padding logic.
+     padded_head_dim = inputs.shape[-1]
+
+     # Apply the RoPE mechanism.
+     first_half = inputs[..., :head_dim // 2]
+     second_half = inputs[..., head_dim // 2:head_dim]
+
+     first_part = first_half * cos - second_half * sin
+     second_part = second_half * cos + first_half * sin
+     out = jnp.concatenate([first_part, second_part], axis=-1)
+
+     if padded_head_dim > head_dim:
+         out = jnp.pad(out, ((0, 0), (0, 0), (0, padded_head_dim - head_dim)))
+
+     return out.astype(inputs.dtype)
+
+
+ def apply_rope_scaling(freqs: jax.Array,
+                        rope_scaling: Dict[str, Any]) -> jax.Array:
+     # Values obtained from grid search.
+     scale_factor = rope_scaling.get("scale_factor", 8.0)
+     low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+     high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+     old_context_len = rope_scaling.get("original_max_position_embeddings",
+                                        8192)
+
+     low_freq_wavelen = old_context_len / low_freq_factor
+     high_freq_wavelen = old_context_len / high_freq_factor
+
+     wavelen = 2 * math.pi / freqs
+     smooth = (old_context_len / wavelen -
+               low_freq_factor) / (high_freq_factor - low_freq_factor)
+
+     high_freqs = jnp.where(wavelen < high_freq_wavelen, freqs, 0)
+     low_freqs = jnp.where(wavelen > low_freq_wavelen, freqs / scale_factor, 0)
+     mid_freqs = jnp.where(
+         (wavelen >= high_freq_wavelen) & (wavelen <= low_freq_wavelen),
+         (1 - smooth) * freqs / scale_factor + smooth * freqs,
+         0,
+     )
+     new_freqs = high_freqs + low_freqs + mid_freqs
+     return new_freqs
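The second hunk adds the functional interface at tpu_inference/layers/jax/rope_interface.py. Again as a hedged sketch (illustrative shapes and scaling values, not taken from the package): apply_rope dispatches on positions.ndim, so a 1D positions array takes the standard RoPE path, and a rope_scaling dict with the keys read by apply_rope_scaling enables Llama-3-style frequency scaling.

# Hypothetical usage sketch (illustrative values, not from the package).
import jax
import jax.numpy as jnp

from tpu_inference.layers.jax.rope_interface import apply_rope

T, N, H = 8, 4, 128  # num_tokens, num_heads, head_dim
positions = jnp.arange(T)  # (T,): 1D positions select the standard RoPE path
inputs = jax.random.normal(jax.random.PRNGKey(0), (T, N, H))

# Plain RoPE over the full head_dim with the default "split" ordering.
out = apply_rope(inputs, positions, head_dim=H, rope_theta=10000.0)

# Llama-3-style frequency scaling: these keys are the ones apply_rope_scaling
# reads, and the values shown here are its documented fallbacks.
rope_scaling = {
    "scale_factor": 8.0,
    "low_freq_factor": 1.0,
    "high_freq_factor": 4.0,
    "original_max_position_embeddings": 8192,
}
out_scaled = apply_rope(inputs, positions, head_dim=H, rope_theta=500000.0,
                        rope_scaling=rope_scaling)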
tpu_inference/layers/jax/sample/__init__.py
@@ -0,0 +1,13 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.