tpu-inference 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tpu-inference might be problematic. Click here for more details.

Files changed (168) hide show
  1. tests/__init__.py +0 -0
  2. tests/core/__init__.py +0 -0
  3. tests/core/test_adapters.py +83 -0
  4. tests/core/test_core_tpu.py +523 -0
  5. tests/core/test_disagg_executor.py +60 -0
  6. tests/core/test_disagg_utils.py +53 -0
  7. tests/core/test_init.py +49 -0
  8. tests/kernels/__init__.py +0 -0
  9. tests/kernels/quantized_matmul_kernel_test.py +191 -0
  10. tests/kernels/ragged_kv_cache_update_v2_test.py +234 -0
  11. tests/kernels/ragged_paged_attention_kernel_v2_test.py +400 -0
  12. tests/kernels/ragged_paged_attention_kernel_v3_test.py +504 -0
  13. tests/lora/__init__.py +0 -0
  14. tests/lora/test_lora.py +123 -0
  15. tests/test_base.py +201 -0
  16. tests/test_quantization.py +836 -0
  17. tests/test_tpu_info.py +120 -0
  18. tests/test_utils.py +218 -0
  19. tests/tpu_backend_test.py +59 -0
  20. tpu_inference/__init__.py +30 -0
  21. tpu_inference/adapters/__init__.py +0 -0
  22. tpu_inference/adapters/vllm_adapters.py +42 -0
  23. tpu_inference/adapters/vllm_config_adapters.py +134 -0
  24. tpu_inference/backend.py +69 -0
  25. tpu_inference/core/__init__.py +0 -0
  26. tpu_inference/core/adapters.py +153 -0
  27. tpu_inference/core/core_tpu.py +776 -0
  28. tpu_inference/core/disagg_executor.py +117 -0
  29. tpu_inference/core/disagg_utils.py +51 -0
  30. tpu_inference/di/__init__.py +0 -0
  31. tpu_inference/di/abstracts.py +28 -0
  32. tpu_inference/di/host.py +76 -0
  33. tpu_inference/di/interfaces.py +51 -0
  34. tpu_inference/distributed/__init__.py +0 -0
  35. tpu_inference/distributed/tpu_connector.py +699 -0
  36. tpu_inference/distributed/utils.py +59 -0
  37. tpu_inference/executors/__init__.py +0 -0
  38. tpu_inference/executors/ray_distributed_executor.py +346 -0
  39. tpu_inference/experimental/__init__.py +0 -0
  40. tpu_inference/experimental/llama3_jax_stashed.py +258 -0
  41. tpu_inference/interfaces/__init__.py +0 -0
  42. tpu_inference/interfaces/cache.py +31 -0
  43. tpu_inference/interfaces/config.py +47 -0
  44. tpu_inference/interfaces/config_parts.py +117 -0
  45. tpu_inference/interfaces/engine.py +51 -0
  46. tpu_inference/interfaces/outputs.py +22 -0
  47. tpu_inference/interfaces/params.py +21 -0
  48. tpu_inference/interfaces/platform.py +74 -0
  49. tpu_inference/interfaces/request.py +39 -0
  50. tpu_inference/interfaces/scheduler.py +31 -0
  51. tpu_inference/kernels/__init__.py +0 -0
  52. tpu_inference/kernels/collectives/__init__.py +0 -0
  53. tpu_inference/kernels/collectives/all_gather_matmul.py +735 -0
  54. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +60 -0
  55. tpu_inference/kernels/collectives/util.py +47 -0
  56. tpu_inference/kernels/flash_attention/__init__.py +0 -0
  57. tpu_inference/kernels/flash_attention/kernel.py +772 -0
  58. tpu_inference/kernels/quantized_matmul/__init__.py +0 -0
  59. tpu_inference/kernels/quantized_matmul/kernel.py +395 -0
  60. tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
  61. tpu_inference/kernels/quantized_matmul/util.py +58 -0
  62. tpu_inference/kernels/ragged_paged_attention/__init__.py +0 -0
  63. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +0 -0
  64. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +875 -0
  65. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +287 -0
  66. tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
  67. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +0 -0
  68. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1447 -0
  69. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3834 -0
  70. tpu_inference/kernels/ragged_paged_attention/v3/util.py +47 -0
  71. tpu_inference/layers/__init__.py +0 -0
  72. tpu_inference/layers/common/__init__.py +0 -0
  73. tpu_inference/layers/common/attention_metadata.py +34 -0
  74. tpu_inference/layers/jax/__init__.py +0 -0
  75. tpu_inference/layers/jax/attention/__init__.py +0 -0
  76. tpu_inference/layers/jax/attention/attention.py +254 -0
  77. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +354 -0
  78. tpu_inference/layers/jax/attention/llama4_attention.py +153 -0
  79. tpu_inference/layers/jax/attention_interface.py +356 -0
  80. tpu_inference/layers/jax/base.py +151 -0
  81. tpu_inference/layers/jax/binary_search.py +295 -0
  82. tpu_inference/layers/jax/constants.py +88 -0
  83. tpu_inference/layers/jax/layers.py +301 -0
  84. tpu_inference/layers/jax/misc.py +16 -0
  85. tpu_inference/layers/jax/moe/__init__.py +0 -0
  86. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +608 -0
  87. tpu_inference/layers/jax/moe/moe.py +209 -0
  88. tpu_inference/layers/jax/rope.py +172 -0
  89. tpu_inference/layers/jax/rope_interface.py +214 -0
  90. tpu_inference/layers/jax/sample/__init__.py +0 -0
  91. tpu_inference/layers/jax/sample/rejection_sampler.py +515 -0
  92. tpu_inference/layers/jax/sample/sampling.py +95 -0
  93. tpu_inference/layers/jax/sample/sampling_metadata.py +69 -0
  94. tpu_inference/layers/jax/sharding.py +406 -0
  95. tpu_inference/layers/jax/transformer_block.py +76 -0
  96. tpu_inference/layers/vllm/__init__.py +0 -0
  97. tpu_inference/layers/vllm/attention.py +184 -0
  98. tpu_inference/layers/vllm/fused_moe.py +399 -0
  99. tpu_inference/layers/vllm/linear_common.py +186 -0
  100. tpu_inference/layers/vllm/quantization/__init__.py +34 -0
  101. tpu_inference/layers/vllm/quantization/awq.py +207 -0
  102. tpu_inference/layers/vllm/quantization/common.py +105 -0
  103. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +0 -0
  104. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +121 -0
  105. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +0 -0
  106. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +208 -0
  107. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +136 -0
  108. tpu_inference/layers/vllm/quantization/unquantized.py +263 -0
  109. tpu_inference/layers/vllm/sharding.py +151 -0
  110. tpu_inference/logger.py +10 -0
  111. tpu_inference/lora/__init__.py +0 -0
  112. tpu_inference/lora/torch_lora_ops.py +103 -0
  113. tpu_inference/lora/torch_punica_tpu.py +308 -0
  114. tpu_inference/mock/__init__.py +0 -0
  115. tpu_inference/mock/vllm_config_utils.py +28 -0
  116. tpu_inference/mock/vllm_envs.py +1233 -0
  117. tpu_inference/mock/vllm_logger.py +212 -0
  118. tpu_inference/mock/vllm_logging_utils.py +15 -0
  119. tpu_inference/models/__init__.py +0 -0
  120. tpu_inference/models/common/__init__.py +0 -0
  121. tpu_inference/models/common/model_loader.py +433 -0
  122. tpu_inference/models/jax/__init__.py +0 -0
  123. tpu_inference/models/jax/deepseek_v3.py +868 -0
  124. tpu_inference/models/jax/llama3.py +366 -0
  125. tpu_inference/models/jax/llama4.py +473 -0
  126. tpu_inference/models/jax/llama_eagle3.py +333 -0
  127. tpu_inference/models/jax/phi3.py +376 -0
  128. tpu_inference/models/jax/qwen2.py +375 -0
  129. tpu_inference/models/jax/qwen2_5_vl.py +976 -0
  130. tpu_inference/models/jax/qwen3.py +302 -0
  131. tpu_inference/models/jax/utils/__init__.py +0 -0
  132. tpu_inference/models/jax/utils/file_utils.py +96 -0
  133. tpu_inference/models/jax/utils/multi_modal_utils.py +164 -0
  134. tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
  135. tpu_inference/models/jax/utils/quantization/quantization_utils.py +588 -0
  136. tpu_inference/models/jax/utils/weight_utils.py +510 -0
  137. tpu_inference/models/vllm/__init__.py +0 -0
  138. tpu_inference/models/vllm/vllm_model_wrapper.py +272 -0
  139. tpu_inference/models/vllm/vllm_model_wrapper_context.py +45 -0
  140. tpu_inference/platforms/__init__.py +2 -0
  141. tpu_inference/platforms/tpu_jax.py +257 -0
  142. tpu_inference/runner/__init__.py +0 -0
  143. tpu_inference/runner/block_table_jax.py +122 -0
  144. tpu_inference/runner/compilation_manager.py +672 -0
  145. tpu_inference/runner/input_batch_jax.py +435 -0
  146. tpu_inference/runner/kv_cache.py +119 -0
  147. tpu_inference/runner/kv_cache_manager.py +460 -0
  148. tpu_inference/runner/lora_utils.py +92 -0
  149. tpu_inference/runner/multimodal_manager.py +208 -0
  150. tpu_inference/runner/persistent_batch_manager.py +244 -0
  151. tpu_inference/runner/speculative_decoding_manager.py +250 -0
  152. tpu_inference/runner/structured_decoding_manager.py +89 -0
  153. tpu_inference/runner/tpu_jax_runner.py +771 -0
  154. tpu_inference/runner/utils.py +426 -0
  155. tpu_inference/spec_decode/__init__.py +0 -0
  156. tpu_inference/spec_decode/jax/__init__.py +0 -0
  157. tpu_inference/spec_decode/jax/eagle3.py +334 -0
  158. tpu_inference/tpu_info.py +77 -0
  159. tpu_inference/utils.py +294 -0
  160. tpu_inference/worker/__init__.py +0 -0
  161. tpu_inference/worker/_temporary_vllm_compat.py +129 -0
  162. tpu_inference/worker/base.py +100 -0
  163. tpu_inference/worker/tpu_worker_jax.py +321 -0
  164. tpu_inference-0.11.1.dist-info/METADATA +101 -0
  165. tpu_inference-0.11.1.dist-info/RECORD +168 -0
  166. tpu_inference-0.11.1.dist-info/WHEEL +5 -0
  167. tpu_inference-0.11.1.dist-info/licenses/LICENSE +201 -0
  168. tpu_inference-0.11.1.dist-info/top_level.txt +2 -0
@@ -0,0 +1,1447 @@
1
+ """TPU-Friendly Ragged Paged Attention kernel.
2
+
3
+ This kernel offers a highly optimized implementation of ragged paged attention,
4
+ specifically designed for TPU and compatible with a wide range of model
5
+ specifications. It supports mixed prefill and decoding, enhancing throughput
6
+ during inference.
7
+ """
8
+ import functools
9
+
10
+ import jax
11
+ import jax.numpy as jnp
12
+ from jax import lax
13
+ from jax.experimental import pallas as pl
14
+ from jax.experimental.pallas import tpu as pltpu
15
+
16
+ from tpu_inference.kernels.ragged_paged_attention.v3.tuned_block_sizes import \
17
+ get_tuned_block_sizes
18
+ from tpu_inference.kernels.ragged_paged_attention.v3.util import (
19
+ align_to, cdiv, get_dtype_packing)
20
+
21
# Bias added to attention logits at masked positions. It is a large negative
# but still finite value: -0.7 times the largest finite float32.
DEFAULT_MASK_VALUE = -0.7 * float(jnp.finfo(jnp.float32).max)

# 100 MiB, expressed in bytes.
DEFAULT_VMEM_LIMIT_BYTES = 100 * 1024 * 1024
24
+
25
+
26
def ref_ragged_paged_attention(
    queries: jax.Array,  # [max_num_tokens, actual_num_q_heads, actual_head_dim]
    keys: jax.Array,  # [max_num_tokens, actual_num_kv_heads, actual_head_dim]
    values: jax.Array,  # [max_num_tokens, actual_num_kv_heads, actual_head_dim]
    kv_cache: jax.Array,  # [total_num_pages, page_size, num_kv_heads_x2 // kv_packing, kv_packing, head_dim]
    kv_lens: jax.Array,  # i32[max_num_seqs]
    page_indices: jax.Array,  # i32[max_num_seqs * pages_per_seq]
    cu_q_lens: jax.Array,  # i32[max_num_seqs + 1]
    distribution: jax.Array,  # i32[3]
    *,
    sm_scale: float = 1.0,
    sliding_window: int | None = None,
    soft_cap: float | None = None,
    mask_value: float | None = DEFAULT_MASK_VALUE,
    q_scale: float | None = None,
    k_scale: float | None = None,
    v_scale: float | None = None,
):
    """Reference ragged paged attention written with plain ``jax.numpy``.

    Sequences ``0 .. distribution[-1] - 1`` are processed one at a time in a
    Python loop. For each sequence the new key/value rows are written into the
    pages of ``kv_cache`` that belong to it, and attention of the sequence's
    queries over its first ``kv_len`` cached tokens is computed.

    Args:
        queries: Query tokens of all sequences, concatenated along axis 0.
        keys: New key tokens, laid out like ``queries``.
        values: New value tokens, laid out like ``queries``.
        kv_cache: Paged KV cache; an updated copy is returned.
        kv_lens: Per-sequence KV length (including the new tokens).
        page_indices: Flattened page table, ``pages_per_seq`` entries per
            sequence.
        cu_q_lens: Cumulative query lengths; sequence ``i`` owns rows
            ``cu_q_lens[i]:cu_q_lens[i + 1]`` of ``queries``.
        distribution: Only the last entry is read here; it is the number of
            sequences to process.
        sm_scale: Factor multiplied into the attention logits.
        sliding_window: When set, keys at positions ``<= q_pos -
            sliding_window`` are masked in addition to the causal mask.
        soft_cap: When set, logits become ``soft_cap * tanh(logits /
            soft_cap)`` before masking.
        mask_value: Value added to masked logits; ``None`` selects
            ``DEFAULT_MASK_VALUE``.
        q_scale: When set, queries are divided by it, clipped to the finite
            range of the key dtype (floating key dtypes only), cast to the key
            dtype, and the logits are multiplied by it again.
        k_scale: When set, logits are multiplied by it.
        v_scale: When set, the per-sequence output is multiplied by it.

    Returns:
        ``(result, kv_cache)`` where ``result`` is the per-sequence outputs
        concatenated along axis 0 (shape ``[0, actual_num_q_heads,
        actual_head_dim]`` when no sequence is processed) and ``kv_cache`` is
        the cache with the new key/value rows written in.

    Raises:
        AssertionError: If one of the shape/dtype consistency checks below
            fails. ``dynamic_validate_inputs`` may raise as well.
    """
    if mask_value is None:
        mask_value = DEFAULT_MASK_VALUE

    dynamic_validate_inputs(
        queries,
        keys,
        values,
        kv_cache,
        kv_lens,
        page_indices,
        cu_q_lens,
        distribution,
        sm_scale=sm_scale,
        sliding_window=sliding_window,
        soft_cap=soft_cap,
        mask_value=mask_value,
        q_scale=q_scale,
        k_scale=k_scale,
        v_scale=v_scale,
    )
    actual_head_dim = queries.shape[2]
    actual_num_q_heads = queries.shape[1]
    actual_num_kv_heads = keys.shape[1]
    merged_kv = merge_kv(keys, values)
    assert merged_kv.shape[-3:] == kv_cache.shape[-3:]

    _, page_size, num_kv_heads_x2_per_kv_packing, kv_packing, head_dim = (
        kv_cache.shape)
    num_kv_heads_x2 = num_kv_heads_x2_per_kv_packing * kv_packing
    assert num_kv_heads_x2 % 2 == 0
    assert actual_num_q_heads % actual_num_kv_heads == 0
    assert head_dim % 128 == 0
    assert get_dtype_packing(kv_cache.dtype) == kv_packing
    assert num_kv_heads_x2 == align_to(actual_num_kv_heads * 2, kv_packing)
    actual_num_q_heads_per_kv_head = actual_num_q_heads // actual_num_kv_heads
    max_num_seqs = kv_lens.shape[0]
    num_page_indices = page_indices.shape[0]
    assert num_page_indices % max_num_seqs == 0
    pages_per_seq = num_page_indices // max_num_seqs
    outputs = []

    for i in range(distribution[-1]):
        q_start = cu_q_lens[i]
        q_end = cu_q_lens[i + 1]
        q_len = q_end - q_start

        kv_len = kv_lens[i]
        # Only the pages needed to hold kv_len tokens are gathered.
        indices_start = i * pages_per_seq
        indices_end = indices_start + cdiv(kv_len, page_size)
        indices = page_indices[indices_start:indices_end]
        q = queries[q_start:q_end, :, :actual_head_dim]

        # Update the kv cache: the last q_len positions of this sequence are
        # overwritten with the new merged key/value rows.
        assert kv_len - q_len >= 0
        gathered_kv = kv_cache[indices]
        gathered_shape = gathered_kv.shape
        # Flatten (page, slot) into a single token axis.
        gathered_kv = gathered_kv.reshape(-1, *gathered_shape[-3:])
        gathered_kv = gathered_kv.at[kv_len - q_len:kv_len].set(
            merged_kv[q_start:q_end])
        kv_cache = kv_cache.at[indices].set(
            gathered_kv.reshape(gathered_shape))

        # Drop the padded heads, then fold each consecutive pair of rows on
        # the head axis into one row of width 2 * head_dim; the first half is
        # read as K and the second half as V.
        kv = gathered_kv.reshape(
            -1, num_kv_heads_x2,
            head_dim)[:, :actual_num_kv_heads * 2, :].reshape(
                -1, actual_num_kv_heads, head_dim * 2)
        k = kv[:kv_len, :, :head_dim][:, :, :actual_head_dim]
        v = kv[:kv_len, :, head_dim:][:, :, :actual_head_dim]
        # Expand KV heads so every query head has a matching K/V head.
        k = jnp.repeat(k, actual_num_q_heads_per_kv_head, axis=1)
        v = jnp.repeat(v, actual_num_q_heads_per_kv_head, axis=1)

        if q_scale is not None:
            q = (q / q_scale)
            if jnp.issubdtype(k.dtype, jnp.floating):
                # Keep the scaled queries inside the finite range of the key
                # dtype before casting.
                dtype_info = jnp.finfo(k.dtype)
                minval = float(dtype_info.min)
                maxval = float(dtype_info.max)
                q = jnp.clip(q, min=minval, max=maxval)
            q = q.astype(k.dtype)

        attn = jnp.einsum("qhd,khd->hqk",
                          q,
                          k,
                          preferred_element_type=jnp.float32)
        attn *= sm_scale
        if k_scale is not None:
            attn *= k_scale
        if q_scale is not None:
            attn *= q_scale

        # Absolute position of each query row / key column in the sequence.
        q_span = (kv_len - q_len) + jax.lax.broadcasted_iota(
            jnp.int32, attn.shape, 1)
        kv_span = jax.lax.broadcasted_iota(jnp.int32, attn.shape, 2)
        # Causal mask: a query may not attend to later keys.
        mask = q_span < kv_span
        if sliding_window is not None:
            mask = jnp.logical_or(mask, q_span - sliding_window >= kv_span)
        if soft_cap is not None:
            attn = soft_cap * jnp.tanh(attn / soft_cap)
        attn += jnp.where(mask, mask_value, 0.0)
        attn = jax.nn.softmax(attn, axis=-1).astype(v.dtype)

        out = jnp.einsum("hqk,khd->qhd", attn, v).astype(queries.dtype)
        if v_scale is not None:
            out *= v_scale

        outputs.append(out)

    if not outputs:
        # jnp.concatenate rejects an empty sequence; with no sequence to
        # process, return an empty result with the per-sequence output layout.
        empty = jnp.zeros((0, actual_num_q_heads, actual_head_dim),
                          dtype=queries.dtype)
        return empty, kv_cache

    result = jnp.concatenate(outputs, axis=0)
    return result, kv_cache
156
+
157
+
158
def get_smem_estimate_bytes(max_num_seqs, pages_per_seq):
    """Estimate, in bytes, the space taken by the int32 buffers listed below.

    Every buffer is counted with its element count passed through
    ``align_to(..., 128)``; the small fixed-size buffers are each counted as
    128 elements.

    Args:
        max_num_seqs: Length of ``kv_lens_ref``.
        pages_per_seq: Number of page-table entries per sequence.

    Returns:
        ``cdiv(total_bits, 8)`` for the summed buffer sizes.
    """
    bits_per_i32 = 32
    padded_element_counts = (
        align_to(max_num_seqs, 128),  # kv_lens_ref: i32[max_num_seqs]
        # page_indices_ref: i32[max_num_seqs * pages_per_seq]
        align_to(max_num_seqs * pages_per_seq, 128),
        align_to(max_num_seqs + 1, 128),  # cu_q_lens_ref: i32[max_num_seqs + 1]
        128,  # distribution_ref: i32[3]
        128,  # sem_ids_ref: i32[3]
        128,  # bo_ids_ref: i32[4]
        128,  # bkv_update_ids_ref: i32[6]
    )
    return cdiv(sum(padded_element_counts) * bits_per_i32, 8)
175
+
176
+
177
def get_vmem_estimate_bytes(
    actual_num_kv_heads,
    actual_num_q_heads_per_kv_head,
    actual_head_dim,
    bq_sz,
    bkv_sz,
    q_dtype,
    kv_dtype,
):
    """Estimate, in bytes, the size of the kernel's block buffers.

    The estimate sums the double-buffered KV block, the double-buffered query
    and output blocks, the ``l``/``m`` statistics and the accumulator. Element
    widths are ``32 // packing`` bits for the query and KV dtypes and 32 bits
    for the statistics and accumulator.

    Args:
        actual_num_kv_heads: Number of KV heads before alignment.
        actual_num_q_heads_per_kv_head: Query heads per KV head before
            alignment.
        actual_head_dim: Head dimension before alignment to 128.
        bq_sz: Query block size.
        bkv_sz: KV block size.
        q_dtype: Query dtype; its packing comes from ``get_dtype_packing``.
        kv_dtype: KV dtype; its packing comes from ``get_dtype_packing``.

    Returns:
        ``cdiv(total_bits, 8)`` for the summed buffer sizes.
    """
    q_packing = get_dtype_packing(q_dtype)
    kv_packing = get_dtype_packing(kv_dtype)
    padded_q_heads_per_kv_head = align_to(actual_num_q_heads_per_kv_head,
                                          q_packing)
    padded_kv_heads_x2 = align_to(actual_num_kv_heads * 2, kv_packing)
    padded_head_dim = align_to(actual_head_dim, 128)

    q_elem_bits = 32 // q_packing
    kv_elem_bits = 32 // kv_packing
    # Row count shared by the q/o blocks, l/m statistics and the accumulator.
    q_rows = actual_num_kv_heads * bq_sz * padded_q_heads_per_kv_head

    # bkv_x2_ref
    bkv_bits = (2 * bkv_sz * padded_kv_heads_x2 *
                padded_head_dim) * kv_elem_bits
    # bq_x2_ref + bo_x2_ref
    bq_bo_bits = 2 * (2 * q_rows * padded_head_dim) * q_elem_bits
    # l_ref + m_ref
    l_m_bits = 2 * (q_rows * 128) * 32
    # acc_ref
    acc_bits = (q_rows * padded_head_dim) * 32

    return cdiv(bkv_bits + bq_bo_bits + l_m_bits + acc_bits, 8)
206
+
207
+
208
def get_kv_cache_shape(
    total_num_pages,
    page_size,
    actual_num_kv_heads,
    actual_head_dim,
    kv_dtype,
):
    """Return the 5-tuple shape of the paged KV cache.

    The layout is ``(total_num_pages, page_size, num_kv_heads_x2 //
    kv_packing, kv_packing, head_dim)``, where ``kv_packing`` is
    ``get_dtype_packing(kv_dtype)``, ``num_kv_heads_x2`` is
    ``align_to(actual_num_kv_heads * 2, kv_packing)`` and ``head_dim`` is
    ``align_to(actual_head_dim, 128)``.
    """
    packing = get_dtype_packing(kv_dtype)
    padded_kv_heads_x2 = align_to(actual_num_kv_heads * 2, packing)
    padded_head_dim = align_to(actual_head_dim, 128)
    return (
        total_num_pages,
        page_size,
        padded_kv_heads_x2 // packing,
        packing,
        padded_head_dim,
    )
223
+
224
+
225
+ def _ragged_paged_attention_kernel(
226
+ # Prefetch
227
+ kv_lens_ref, # [max_num_seqs]
228
+ page_indices_ref, # [max_num_seqs * pages_per_seq]
229
+ cu_q_lens_ref, # [max_num_seqs + 1]
230
+ # TODO(jevinjiang): merge these into one so we can save SMEM.
231
+ distribution_ref, # [3] (decode_end, prefill_end, mixed_end)
232
+ sem_ids_ref, # [3] (bq_sem_idx, bkv_sem_idx, bo_sem_idx)
233
+ bo_ids_ref, # [4] (bo_sem_0_seq_idx, bo_sem_1_seq_idx, bo_sem_0_bo_idx, bo_sem_1_bo_idx)
234
+ bkv_update_ids_ref, # [6] (bkv_sem_0_seq_idx, bkv_sem_1_seq_idx, bkv_sem_0_offset, bkv_sem_1_offset, bkv_sem_0_sz, bkv_sem_1_sz)
235
+ # Input
236
+ q_hbm_ref, # [actual_num_kv_heads, max_num_tokens, num_q_heads_per_kv_head // q_packing, q_packing, head_dim]
237
+ kv_hbm_ref, # [max_num_tokens, num_kv_heads_x2 // kv_packing, kv_packing, head_dim]
238
+ kv_cache_hbm_ref, # [total_num_pages, page_size, num_kv_heads_x2 // kv_packing, kv_packing, head_dim]
239
+ # Output
240
+ o_hbm_ref, # [actual_num_kv_heads, max_num_tokens, num_q_heads_per_kv_head // q_packing, q_packing, head_dim]
241
+ updated_kv_cache_hbm_ref, # [total_num_pages, page_size, num_kv_heads_x2 // kv_packing, kv_packing, head_dim]
242
+ # Scratch
243
+ bkv_x2_ref, # [2, bkv_sz, num_kv_heads_x2 // kv_packing, kv_packing, head_dim]
244
+ bq_x2_ref, # [2, actual_num_kv_heads, bq_sz, num_q_heads_per_kv_head // q_packing, q_packing, head_dim]
245
+ bo_x2_ref, # [2, actual_num_kv_heads, bq_sz, num_q_heads_per_kv_head // q_packing, q_packing, head_dim]
246
+ sems, # [4, 2]
247
+ l_ref, # [actual_num_kv_heads, bq_sz * num_q_heads_per_kv_head, 128],
248
+ m_ref, # [actual_num_kv_heads, bq_sz * num_q_heads_per_kv_head, 128],
249
+ acc_ref, # [actual_num_kv_heads, bq_sz * num_q_heads_per_kv_head, head_dim],
250
+ *,
251
+ sm_scale: float,
252
+ sliding_window: int | None = None,
253
+ soft_cap: float | None = None,
254
+ mask_value: float = DEFAULT_MASK_VALUE,
255
+ q_scale: float | None = None,
256
+ k_scale: float | None = None,
257
+ v_scale: float | None = None,
258
+ chunk_prefill_size: int | None = None,
259
+ bkv_p,
260
+ bq_sz,
261
+ debug_mode: bool = False,
262
+ ):
263
+ assert q_hbm_ref.shape == o_hbm_ref.shape
264
+ assert q_hbm_ref.shape[-1] == kv_cache_hbm_ref.shape[-1]
265
+ (
266
+ actual_num_kv_heads,
267
+ max_num_tokens,
268
+ num_q_heads_per_kv_head_per_packing,
269
+ q_packing,
270
+ head_dim,
271
+ ) = q_hbm_ref.shape
272
+ (
273
+ total_num_pages,
274
+ page_size,
275
+ num_kv_heads_x2_per_kv_packing,
276
+ kv_packing,
277
+ _,
278
+ ) = kv_cache_hbm_ref.shape
279
+ max_num_seqs = kv_lens_ref.shape[0]
280
+ num_page_indices = page_indices_ref.shape[0]
281
+ assert num_page_indices % max_num_seqs == 0
282
+ pages_per_seq = num_page_indices // max_num_seqs
283
+ num_kv_heads_x2 = num_kv_heads_x2_per_kv_packing * kv_packing
284
+ num_q_heads_per_kv_head = num_q_heads_per_kv_head_per_packing * q_packing
285
+ q_dtype = q_hbm_ref.dtype
286
+ kv_dtype = kv_cache_hbm_ref.dtype
287
+ assert o_hbm_ref.dtype == q_dtype
288
+ assert get_dtype_packing(q_dtype) == q_packing
289
+ assert get_dtype_packing(kv_dtype) == kv_packing
290
+ assert head_dim % 128 == 0
291
+ bkv_sz = bkv_p * page_size
292
+ seq_idx = pl.program_id(0)
293
+ num_seqs = pl.num_programs(0)
294
+ decode_end = distribution_ref[0]
295
+ prefill_end = distribution_ref[1]
296
+ mixed_end = distribution_ref[2]
297
+
298
+ q_start = cu_q_lens_ref[seq_idx]
299
+ q_end = cu_q_lens_ref[seq_idx + 1]
300
+ q_len = q_end - q_start
301
+ kv_len = kv_lens_ref[seq_idx]
302
+
303
+ def debug_print(msg, *args):
304
+ if debug_mode:
305
+ pl.debug_print(msg, *args)
306
+
307
+ debug_print("[RPA debug] ======= In loop seq_idx={}", seq_idx)
308
+ debug_print("[RPA debug] num_seqs={}", num_seqs)
309
+ debug_print("[RPA debug] decode_end={}", decode_end)
310
+ debug_print("[RPA debug] prefill_end={}", prefill_end)
311
+ debug_print("[RPA debug] mixed_end={}", mixed_end)
312
+ debug_print("[RPA debug] bkv_p={}", bkv_p)
313
+ debug_print("[RPA debug] page_size={}", page_size)
314
+ debug_print("[RPA debug] pages_per_seq={}", pages_per_seq)
315
+ debug_print("[RPA debug] bkv_sz={}", bkv_sz)
316
+ debug_print("[RPA debug] bq_sz={}", bq_sz)
317
+ debug_print("[RPA debug] q_start={}", q_start)
318
+ debug_print("[RPA debug] q_end={}", q_end)
319
+ debug_print("[RPA debug] q_len={}", q_len)
320
+ debug_print("[RPA debug] kv_len={}", kv_len)
321
+
322
+ def flash_attention(
323
+ q, # [actual_bq_sz * num_q_heads_per_kv_head, head_dim]
324
+ k, # [bkv_sz, head_dim]
325
+ v, # [bkv_sz, head_dim]
326
+ *,
327
+ bq_idx,
328
+ bkv_idx,
329
+ kv_head_idx,
330
+ ):
331
+ assert len(q.shape) == 2
332
+ assert q.shape[0] % num_q_heads_per_kv_head == 0
333
+ assert q.shape[1] == head_dim
334
+ assert k.shape == v.shape == (bkv_sz, head_dim)
335
+ assert k.dtype == v.dtype
336
+ head_l_ref = l_ref.at[kv_head_idx, :q.shape[0]]
337
+ head_m_ref = m_ref.at[kv_head_idx, :q.shape[0]]
338
+ head_acc_ref = acc_ref.at[kv_head_idx, :q.shape[0]]
339
+
340
+ def load_with_init(ref, init_val):
341
+ return jnp.where(bkv_idx == 0, jnp.full_like(ref, init_val),
342
+ ref[...])
343
+
344
+ # Follow FlashAttention-2 forward pass.
345
+ if q_scale is not None:
346
+ q = (q / q_scale)
347
+ if jnp.issubdtype(k.dtype, jnp.floating):
348
+ dtype_info = jnp.finfo(k.dtype)
349
+ minval = float(dtype_info.min)
350
+ maxval = float(dtype_info.max)
351
+ q = jnp.clip(q, min=minval, max=maxval)
352
+ q = q.astype(k.dtype)
353
+
354
+ s = jnp.einsum("nd,md->nm", q, k, preferred_element_type=jnp.float32)
355
+ s *= sm_scale
356
+ if k_scale is not None:
357
+ s *= k_scale
358
+ if q_scale is not None:
359
+ s *= q_scale
360
+
361
+ q_span = (kv_len - q_len + bq_idx * bq_sz +
362
+ lax.broadcasted_iota(jnp.int32, s.shape, 0) //
363
+ num_q_heads_per_kv_head)
364
+ k_span = bkv_idx * bkv_sz + lax.broadcasted_iota(jnp.int32, s.shape, 1)
365
+ mask = q_span < k_span
366
+ # TODO(jevinjiang, xiowei): reduce pages_per_seq based on sliding_window.
367
+ if sliding_window is not None:
368
+ mask = jnp.logical_or(mask, q_span - sliding_window >= k_span)
369
+
370
+ if soft_cap is not None:
371
+ s = soft_cap * jnp.tanh(s / soft_cap)
372
+ s += jnp.where(mask, mask_value, 0.0)
373
+ s_rowmax = jnp.max(s, axis=1, keepdims=True)
374
+ m_prev = load_with_init(head_m_ref, -jnp.inf)
375
+ m_curr = jnp.maximum(m_prev, s_rowmax)
376
+ head_m_ref[...] = m_curr
377
+ p = jnp.exp(s - broadcast_minor(m_curr, s.shape))
378
+
379
+ pv = jnp.einsum("nm,md->nd", p, v, preferred_element_type=jnp.float32)
380
+ if v_scale is not None:
381
+ pv *= v_scale
382
+
383
+ p_rowsum = jnp.sum(p, axis=1, keepdims=True)
384
+ exp_m_diff = jnp.exp(m_prev - m_curr)
385
+ l_prev = load_with_init(head_l_ref, 0.0)
386
+ l_curr = exp_m_diff * l_prev + p_rowsum
387
+ head_l_ref[...] = l_curr
388
+ o_prev = load_with_init(head_acc_ref, 0.0)
389
+ o_curr = broadcast_minor(exp_m_diff, o_prev.shape) * o_prev + pv
390
+ head_acc_ref[...] = o_curr
391
+
392
+ def _async_copy(src, dst, sem, wait):
393
+ if debug_mode:
394
+ # Skip DMA if debug mode is enabled.
395
+ return
396
+ cp = pltpu.make_async_copy(src, dst, sem)
397
+ if wait:
398
+ cp.wait()
399
+ else:
400
+ cp.start()
401
+
402
+ def _fetch_bkv(seq_idx, bkv_idx, bkv_sem_idx, *, wait=False):
403
+ sem = sems.at[0, bkv_sem_idx]
404
+ vmem_ref = bkv_x2_ref.at[bkv_sem_idx]
405
+
406
+ cache_hbm_shape = kv_cache_hbm_ref.shape
407
+ cache_hbm_ref = kv_cache_hbm_ref.reshape(
408
+ cache_hbm_shape[0] * cache_hbm_shape[1], *cache_hbm_shape[2:])
409
+ kv_len = kv_lens_ref[seq_idx]
410
+ kv_len_start = bkv_idx * bkv_sz
411
+ kv_p_start = bkv_idx * bkv_p
412
+ q_start = cu_q_lens_ref[seq_idx]
413
+ q_end = cu_q_lens_ref[seq_idx + 1]
414
+ q_len = q_end - q_start
415
+
416
+ kv_left = kv_len - kv_len_start
417
+ kv_left_frm_cache = jnp.maximum(kv_left - q_len, 0)
418
+ kv_left_frm_new = kv_left - kv_left_frm_cache
419
+ bkv_p_frm_cache = jnp.minimum(cdiv(kv_left_frm_cache, page_size),
420
+ bkv_p)
421
+ bkv_sz_frm_new = jnp.minimum(
422
+ jnp.maximum(bkv_sz - kv_left_frm_cache, 0), kv_left_frm_new)
423
+ page_indices_offset = seq_idx * pages_per_seq + kv_p_start
424
+
425
+ # Make sure the current bkv buffer is safe to overwrite.
426
+ wait_update_kv_cache(bkv_sem_idx)
427
+
428
+ debug_print(
429
+ "[RPA debug]"
430
+ f" -----------{'wait' if wait else 'start'}_fetch_bkv-----------")
431
+ debug_print("[RPA debug] seq_idx={}", seq_idx)
432
+ debug_print("[RPA debug] bkv_idx={}", bkv_idx)
433
+ debug_print("[RPA debug] bkv_sem_idx={}", bkv_sem_idx)
434
+ debug_print("[RPA debug] kv_len_start={}", kv_len_start)
435
+ debug_print("[RPA debug] kv_p_start={}", kv_p_start)
436
+ debug_print("[RPA debug] kv_left={}", kv_left)
437
+ debug_print("[RPA debug] kv_left_frm_cache={}", kv_left_frm_cache)
438
+ debug_print("[RPA debug] kv_left_frm_new={}", kv_left_frm_new)
439
+ debug_print("[RPA debug] bkv_p_frm_cache={}", bkv_p_frm_cache)
440
+ debug_print("[RPA debug] bkv_sz_frm_new={}", bkv_sz_frm_new)
441
+ debug_print("[RPA debug] page_indices_offset={}", page_indices_offset)
442
+
443
+ # Fetch effective kv from kv cache.
444
+ def loop_body(i, offset):
445
+ sz = jnp.minimum(page_size, kv_left_frm_cache - i * page_size)
446
+ _async_copy(
447
+ cache_hbm_ref.at[pl.ds(
448
+ page_indices_ref[page_indices_offset + i] * page_size,
449
+ sz)],
450
+ vmem_ref.at[pl.ds(i * page_size, sz)],
451
+ sem,
452
+ wait,
453
+ )
454
+ debug_print("[RPA debug] loop_body i={}, sz={}", i, sz)
455
+ return offset + sz
456
+
457
+ offset = lax.fori_loop(
458
+ 0,
459
+ bkv_p_frm_cache,
460
+ loop_body,
461
+ 0, # offset
462
+ unroll=False,
463
+ )
464
+
465
+ # Fetch kv directly from new kv.
466
+ @pl.when(bkv_sz_frm_new > 0)
467
+ def _fetch_bkv_from_new_kv():
468
+ new_kv_len_start = q_end - kv_left_frm_new
469
+ debug_print("[RPA debug] new_kv_len_start={}", new_kv_len_start)
470
+ debug_print("[RPA debug] offset_in_bkv={}", offset)
471
+ _async_copy(
472
+ kv_hbm_ref.at[pl.ds(new_kv_len_start, bkv_sz_frm_new)],
473
+ vmem_ref.at[pl.ds(offset, bkv_sz_frm_new)],
474
+ sem,
475
+ wait,
476
+ )
477
+
478
+ return kv_len_start + offset, bkv_sz_frm_new
479
+
480
+ def _update_kv_cache(seq_idx,
481
+ bkv_sem_idx,
482
+ offset,
483
+ update_sz,
484
+ *,
485
+ wait=False):
486
+ sem = sems.at[3, bkv_sem_idx]
487
+ vmem_ref = bkv_x2_ref.at[bkv_sem_idx]
488
+ bkv_id = offset // bkv_sz
489
+ kv_p_start = offset // page_size
490
+ kv_p_end = cdiv(offset + update_sz, page_size)
491
+ ignore = offset % page_size
492
+ p_ignore = kv_p_start - bkv_id * bkv_p
493
+ page_indices_offset = seq_idx * pages_per_seq + kv_p_start
494
+
495
+ cache_hbm_shape = updated_kv_cache_hbm_ref.shape
496
+ cache_hbm_ref = updated_kv_cache_hbm_ref.reshape(
497
+ cache_hbm_shape[0] * cache_hbm_shape[1], *cache_hbm_shape[2:])
498
+
499
+ debug_print(
500
+ "[RPA debug]"
501
+ f" -----------{'wait' if wait else 'start'}_update_kv_cache-----------"
502
+ )
503
+ debug_print("[RPA debug] seq_idx={}", seq_idx)
504
+ debug_print("[RPA debug] bkv_sem_idx={}", bkv_sem_idx)
505
+ debug_print("[RPA debug] offset={}", offset)
506
+ debug_print("[RPA debug] update_sz={}", update_sz)
507
+ debug_print("[RPA debug] bkv_id={}", bkv_id)
508
+ debug_print("[RPA debug] kv_p_start={}", kv_p_start)
509
+ debug_print("[RPA debug] kv_p_end={}", kv_p_end)
510
+ debug_print("[RPA debug] ignore={}", ignore)
511
+ debug_print("[RPA debug] p_ignore={}", p_ignore)
512
+ debug_print("[RPA debug] page_indices_offset={}", page_indices_offset)
513
+
514
+ def loop_body(i, states):
515
+ update_sz, ignore = states
516
+ sz = jnp.minimum(page_size - ignore, update_sz)
517
+
518
+ _async_copy(
519
+ vmem_ref.at[pl.ds((p_ignore + i) * page_size + ignore, sz)],
520
+ cache_hbm_ref.at[pl.ds(
521
+ page_indices_ref[page_indices_offset + i] * page_size +
522
+ ignore,
523
+ sz,
524
+ )],
525
+ sem,
526
+ wait,
527
+ )
528
+ debug_print("[RPA debug] loop_body i={}, sz={}", i, sz)
529
+ return update_sz - sz, 0
530
+
531
+ lax.fori_loop(
532
+ 0,
533
+ kv_p_end - kv_p_start,
534
+ loop_body,
535
+ (update_sz, ignore), # total transfer size
536
+ unroll=False,
537
+ )
538
+
539
+ def _fetch_bq(seq_idx, bq_idx, bq_sem_idx, *, wait=False):
540
+ sem = sems.at[1, bq_sem_idx]
541
+ vmem_ref = bq_x2_ref.at[bq_sem_idx]
542
+ q_len_start = cu_q_lens_ref[seq_idx] + bq_idx * bq_sz
543
+ q_end = cu_q_lens_ref[seq_idx + 1]
544
+ sz = jnp.minimum(bq_sz, q_end - q_len_start)
545
+
546
+ debug_print(
547
+ "[RPA debug]"
548
+ f" -----------{'wait' if wait else 'start'}_fetch_bq-----------")
549
+ debug_print("[RPA debug] seq_idx={}", seq_idx)
550
+ debug_print("[RPA debug] bq_idx={}", bq_idx)
551
+ debug_print("[RPA debug] bq_sem_idx={}", bq_sem_idx)
552
+ debug_print("[RPA debug] q_len_start={}", q_len_start)
553
+ debug_print("[RPA debug] q_end={}", q_end)
554
+ debug_print("[RPA debug] sz={}", sz)
555
+
556
+ _async_copy(
557
+ q_hbm_ref.at[:, pl.ds(q_len_start, sz)],
558
+ vmem_ref.at[:, pl.ds(0, sz)],
559
+ sem,
560
+ wait,
561
+ )
562
+
563
+ def _send_bo(seq_idx, bo_idx, bo_sem_idx, *, wait=False):
564
+ sem = sems.at[2, bo_sem_idx]
565
+ vmem_ref = bo_x2_ref.at[bo_sem_idx]
566
+ q_len_start = cu_q_lens_ref[seq_idx] + bo_idx * bq_sz
567
+ q_end = cu_q_lens_ref[seq_idx + 1]
568
+ sz = jnp.minimum(bq_sz, q_end - q_len_start)
569
+
570
+ debug_print(
571
+ "[RPA debug]"
572
+ f" -----------{'wait' if wait else 'start'}_send_bo-----------")
573
+ debug_print("[RPA debug] seq_idx={}", seq_idx)
574
+ debug_print("[RPA debug] bo_idx={}", bo_idx)
575
+ debug_print("[RPA debug] bo_sem_idx={}", bo_sem_idx)
576
+ debug_print("[RPA debug] q_len_start={}", q_len_start)
577
+ debug_print("[RPA debug] q_end={}", q_end)
578
+ debug_print("[RPA debug] sz={}", sz)
579
+
580
+ _async_copy(
581
+ vmem_ref.at[:, pl.ds(0, sz)],
582
+ o_hbm_ref.at[:, pl.ds(q_len_start, sz)],
583
+ sem,
584
+ wait,
585
+ )
586
+
587
+ def start_fetch_bkv(seq_idx, bkv_idx, bkv_sem_idx):
588
+ return _fetch_bkv(seq_idx, bkv_idx, bkv_sem_idx)
589
+
590
+ def wait_fetch_bkv(seq_idx, bkv_idx, bkv_sem_idx):
591
+ return _fetch_bkv(seq_idx, bkv_idx, bkv_sem_idx, wait=True)
592
+
593
+ def start_fetch_bq(seq_idx, bq_idx, bq_sem_idx):
594
+ return _fetch_bq(seq_idx, bq_idx, bq_sem_idx)
595
+
596
+ def wait_fetch_bq(seq_idx, bq_idx, bq_sem_idx):
597
+ return _fetch_bq(seq_idx, bq_idx, bq_sem_idx, wait=True)
598
+
599
+ def start_send_bo(seq_idx, bo_idx, bo_sem_idx):
600
+ bo_ids_ref[bo_sem_idx] = seq_idx
601
+ bo_ids_ref[bo_sem_idx + 2] = bo_idx
602
+ _send_bo(seq_idx, bo_idx, bo_sem_idx)
603
+
604
def wait_send_bo(bo_sem_idx):
    """Wait for the output copy last recorded for slot `bo_sem_idx`, if any."""
    # Read back the ids stored by `start_send_bo` for this slot.
    old_seq_idx = bo_ids_ref[bo_sem_idx]
    old_bo_idx = bo_ids_ref[bo_sem_idx + 2]

    # Only wait when the recorded sequence index is non-negative and not
    # beyond the enclosing kernel's current `seq_idx`.
    # NOTE(review): presumably a negative value marks a slot that was never
    # used (the ids buffer appears to be initialised to -1) -- confirm.
    @pl.when(jnp.logical_and(0 <= old_seq_idx, old_seq_idx <= seq_idx))
    def _():
        _send_bo(old_seq_idx, old_bo_idx, bo_sem_idx, wait=True)
611
+
612
def start_update_kv_cache(seq_idx, bkv_sem_idx, offset, update_sz):
    """Record the pending cache update for slot `bkv_sem_idx`, then start it.

    The arguments are saved in `bkv_update_ids_ref` so that
    `wait_update_kv_cache` can wait on the same update later.
    """
    # Layout: [0:2] sequence index, [2:4] offset, [4:6] update size, each
    # indexed by slot.
    bkv_update_ids_ref[bkv_sem_idx] = seq_idx
    bkv_update_ids_ref[bkv_sem_idx + 2] = offset
    bkv_update_ids_ref[bkv_sem_idx + 4] = update_sz
    _update_kv_cache(seq_idx, bkv_sem_idx, offset, update_sz)
617
+
618
def wait_update_kv_cache(bkv_sem_idx):
    """Wait for the cache update recorded for slot `bkv_sem_idx`, if any."""
    update_sz = bkv_update_ids_ref[bkv_sem_idx + 4]

    # A positive recorded size means an update is still outstanding.
    @pl.when(update_sz > 0)
    def _():
        seq_idx = bkv_update_ids_ref[bkv_sem_idx]
        offset = bkv_update_ids_ref[bkv_sem_idx + 2]
        # Clear the recorded size so the same update is not waited on twice.
        bkv_update_ids_ref[bkv_sem_idx + 4] = 0
        _update_kv_cache(seq_idx,
                         bkv_sem_idx,
                         offset,
                         update_sz,
                         wait=True)
631
+
632
def load_bq(bq_sem_idx, kv_head_idx, *, actual_bq_sz=bq_sz):
    """Load the query block of one kv head from slot `bq_sem_idx`.

    Args:
      bq_sem_idx: slot of `bq_x2_ref` to read.
      kv_head_idx: kv head whose queries are read.
      actual_bq_sz: number of query tokens to load; defaults to the full
        block size `bq_sz`.

    Returns:
      The loaded rows bitcast to `q_dtype`.
    """
    # View the slot as 32-bit words and flatten it to 2D with `head_dim` as
    # the minor dimension.
    q_ref = (bq_x2_ref.bitcast(
        jnp.uint32).at[bq_sem_idx, kv_head_idx].reshape(
            bq_sz * num_q_heads_per_kv_head_per_packing, head_dim))
    # Only the leading rows that belong to `actual_bq_sz` tokens are read.
    return pltpu.bitcast(
        q_ref[:actual_bq_sz * num_q_heads_per_kv_head_per_packing],
        q_dtype)
639
+
640
def strided_load(ref, start, step, *, dtype=None):
    """Read every `step`-th row of a 2D ref, beginning at row `start`.

    The ref must hold unpacked elements and its minor dimension must be a
    multiple of 128. Each row is folded into rows of width 128, one strided
    slice is read per fold, and the slices are joined along the minor axis.

    Args:
      ref: 2D ref to read from.
      start: first row to read.
      step: distance between consecutive rows.
      dtype: when given, the result is bitcast to this dtype.

    Returns:
      The gathered rows, bitcast to `dtype` if one was requested.
    """
    assert get_dtype_packing(ref.dtype) == 1
    assert len(ref.shape) == 2
    num_rows, num_lanes = ref.shape
    assert num_lanes % 128 == 0
    folds = num_lanes // 128
    folded_ref = ref.reshape(num_rows * folds, 128)
    # After folding, original row `i` occupies rows [i * folds, (i + 1) * folds).
    first_row = start * folds
    row_stride = step * folds
    pieces = []
    for fold in range(folds):
        pieces.append(folded_ref[first_row + fold::row_stride])
    vec = jnp.concat(pieces, axis=1)
    if dtype is None:
        return vec
    return pltpu.bitcast(vec, dtype)
653
+
654
def strided_load_bkv(bkv_sem_idx, start, step, *, bkv_bitmask):
    """Load masked (k, v) pairs from slot `bkv_sem_idx` of `bkv_x2_ref`.

    Args:
      bkv_sem_idx: slot of `bkv_x2_ref` to read.
      start: first row to read, in unpacked rows; must be a multiple of
        `kv_packing`.
      step: row stride, in unpacked rows; must be a multiple of `kv_packing`.
      bkv_bitmask: uint32 mask AND-ed onto every loaded k and v.

    Returns:
      A list of `(k, v)` tuples in `kv_dtype`: one tuple when
      `kv_packing == 1`, otherwise one per pair of packed sub-words.
    """
    assert start % kv_packing == 0
    assert step % kv_packing == 0
    # Convert from unpacked rows to rows of 32-bit words.
    start //= kv_packing
    step //= kv_packing
    kv_ref = (bkv_x2_ref.bitcast(jnp.uint32).at[bkv_sem_idx].reshape(
        bkv_sz * step, head_dim))

    def _mask_kv(k, v):
        # Apply the mask on the raw bits, then reinterpret as kv_dtype.
        k = pltpu.bitcast(k, jnp.uint32)
        v = pltpu.bitcast(v, jnp.uint32)
        k = k & bkv_bitmask
        v = v & bkv_bitmask
        k = pltpu.bitcast(k, kv_dtype)
        v = pltpu.bitcast(v, kv_dtype)
        return (k, v)

    if kv_packing == 1:
        # Unpacked case: k and v sit in adjacent rows.
        k = strided_load(kv_ref, start, step, dtype=kv_dtype)
        v = strided_load(kv_ref, start + 1, step, dtype=kv_dtype)
        return [_mask_kv(k, v)]

    # Packed case: load whole 32-bit words once, then shift out each
    # sub-word. Even sub-words are taken as k and the following odd ones as v.
    kv = strided_load(kv_ref, start, step)
    bitwidth = 32 // kv_packing
    repack_ty = jnp.dtype(f"uint{bitwidth}")
    lst = []
    for i in range(0, kv_packing, 2):
        k = (kv >> (i * bitwidth)).astype(repack_ty)
        v = (kv >> ((i + 1) * bitwidth)).astype(repack_ty)
        lst.append(_mask_kv(k, v))
    return lst
685
+
686
def broadcast_minor(src, shape):
    """Tile `src` along its last axis until it has the minor size of `shape`.

    `src` is returned untouched when it already has the requested shape.
    Otherwise all leading dimensions must already match and the minor
    dimension of `src` must be a multiple of 128.
    """
    if src.shape == shape:
        return src
    assert src.shape[:-1] == shape[:-1]
    src_minor = src.shape[-1]
    assert src_minor % 128 == 0
    dst_minor = shape[-1]
    # Round the requested width up to a whole number of copies of `src`.
    num_copies = align_to(dst_minor, src_minor) // src_minor
    # no-op concatenation.
    tiled = jnp.concatenate([src] * num_copies, axis=-1)
    return tiled[..., :dst_minor]
696
+
697
def process(static_q_len=None):
    """Run attention for the current sequence, block by block.

    Args:
      static_q_len: when given, the query length is treated as this value
        instead of the enclosing scope's `q_len`. It is combined with `bq_sz`
        through Python's `min`, so it has to be a plain Python number.
    """
    num_bkv = cdiv(kv_len, bkv_sz)
    if static_q_len is None:
        actual_bq_sz = bq_sz
        num_bq = cdiv(q_len, actual_bq_sz)
    else:
        # A short static query length shrinks the rows loaded per block.
        actual_bq_sz = min(bq_sz, static_q_len)
        num_bq = cdiv(static_q_len, actual_bq_sz)

    def get_next_bq_ids(seq_idx, bq_idx, bq_sem_idx):
        """Return (seq, bq, slot) of the query block that follows this one."""
        next_bq_idx = bq_idx + 1
        is_last_bq = next_bq_idx == num_bq
        # After the last block of a sequence, move on to block 0 of the next.
        next_bq_idx = lax.select(is_last_bq, 0, next_bq_idx)
        next_seq_idx = lax.select(is_last_bq, seq_idx + 1, seq_idx)
        # Alternate between the two buffer slots.
        next_bq_sem_idx = lax.select(bq_sem_idx == 0, 1, 0)
        return next_seq_idx, next_bq_idx, next_bq_sem_idx

    def get_next_bkv_ids(seq_idx, bq_idx, bkv_idx, bkv_sem_idx):
        """Return (seq, bq, bkv, slot) of the kv block that follows this one."""
        next_bkv_idx = bkv_idx + 1
        is_last_bkv = next_bkv_idx == num_bkv
        # Finishing the kv blocks advances the query block; finishing the
        # query blocks advances the sequence.
        next_bkv_idx = lax.select(is_last_bkv, 0, next_bkv_idx)
        next_bq_idx = lax.select(is_last_bkv, bq_idx + 1, bq_idx)
        is_last_bq = next_bq_idx == num_bq
        next_bq_idx = lax.select(is_last_bq, 0, next_bq_idx)
        next_seq_idx = lax.select(is_last_bq, seq_idx + 1, seq_idx)
        next_bkv_sem_idx = lax.select(bkv_sem_idx == 0, 1, 0)
        return next_seq_idx, next_bq_idx, next_bkv_idx, next_bkv_sem_idx

    def compute_with_bq(bq_idx, _):
        """Loop body for one query block; the carry is unused."""
        bq_sem_idx = sem_ids_ref[0]
        next_seq_idx, next_bq_idx, next_bq_sem_idx = get_next_bq_ids(
            seq_idx, bq_idx, bq_sem_idx)

        # Prefetch next bq
        @pl.when(next_seq_idx < num_seqs)
        def prefetch_next_bq():
            sem_ids_ref[0] = next_bq_sem_idx
            start_fetch_bq(next_seq_idx, next_bq_idx, next_bq_sem_idx)

        def compute_with_bkv(bkv_idx, _):
            """Loop body for one kv block; the carry is unused."""
            # Create bitmask for KV.
            assert bkv_sz % kv_packing == 0
            # The last kv block of a sequence may be only partially valid.
            actual_bkv_sz = jnp.minimum(bkv_sz, kv_len - bkv_idx * bkv_sz)
            bkv_shape = (bkv_sz, head_dim)
            bkv_mask = lax.broadcasted_iota(jnp.int32, bkv_shape,
                                            0) < actual_bkv_sz
            # All-ones for rows below actual_bkv_sz, zero elsewhere; narrowed
            # to the packed element width and then bitcast back to uint32.
            bkv_bitmask = pltpu.bitcast(
                lax.select(
                    bkv_mask,
                    jnp.full(bkv_shape, 0xFFFFFFFF, dtype=jnp.uint32),
                    jnp.full(bkv_shape, 0, dtype=jnp.uint32),
                ).astype(jnp.dtype(f"uint{32 // kv_packing}")),
                jnp.uint32,
            )

            # Get next bkv ids.
            bkv_sem_idx = sem_ids_ref[1]
            next_seq_idx, _, next_bkv_idx, next_bkv_sem_idx = get_next_bkv_ids(
                seq_idx, bq_idx, bkv_idx, bkv_sem_idx)

            # Prefetch next bkv
            @pl.when(next_seq_idx < num_seqs)
            def prefetch_next_bkv():
                sem_ids_ref[1] = next_bkv_sem_idx
                start_fetch_bkv(next_seq_idx, next_bkv_idx,
                                next_bkv_sem_idx)

            # Wait for cur bq if not ready yet
            @pl.when(bkv_idx == 0)
            def wait_cur_bq():
                wait_fetch_bq(seq_idx, bq_idx, bq_sem_idx)

            # Wait for cur bkv
            offset, update_sz = wait_fetch_bkv(seq_idx, bkv_idx,
                                               bkv_sem_idx)

            # Start updating bkv to kv cache if applicable.
            # Only needed in first bq loop.
            @pl.when(jnp.logical_and(update_sz > 0, bq_idx == 0))
            def update_cur_bkv_to_cache():
                start_update_kv_cache(seq_idx, bkv_sem_idx, offset,
                                      update_sz)

            debug_print(
                "[RPA debug] -----------flash attention-----------")
            debug_print("[RPA debug] seq_idx={}", seq_idx)
            debug_print("[RPA debug] bq_idx={}", bq_idx)
            debug_print("[RPA debug] bkv_idx={}", bkv_idx)
            if debug_mode:
                # Skip flash attention if debug mode is enabled.
                return

            # Flash attention with cur bkv and bq
            # NOTE: kv_packing is divided by 2 because k and v are packed together.
            heads_per_load = max(1, kv_packing // 2)
            for kv_head_start in range(0, actual_num_kv_heads,
                                       heads_per_load):
                # Rows are indexed in k/v-interleaved head units, hence the
                # factor of two on the start row.
                bkv_lst = strided_load_bkv(
                    bkv_sem_idx,
                    kv_head_start * 2,
                    num_kv_heads_x2,
                    bkv_bitmask=bkv_bitmask,
                )
                assert len(bkv_lst) == heads_per_load
                for i in range(heads_per_load):
                    kv_head_idx = kv_head_start + i
                    # The last load may cover heads beyond the real count.
                    if kv_head_idx >= actual_num_kv_heads:
                        break
                    bq = load_bq(bq_sem_idx,
                                 kv_head_idx,
                                 actual_bq_sz=actual_bq_sz)
                    bk, bv = bkv_lst[i]
                    flash_attention(
                        bq,
                        bk,
                        bv,
                        bq_idx=bq_idx,
                        bkv_idx=bkv_idx,
                        kv_head_idx=kv_head_idx,
                    )

        lax.fori_loop(0, num_bkv, compute_with_bkv, None, unroll=False)

        # Load acc and calculate final output.
        acc = acc_ref[...]
        l = broadcast_minor(l_ref[...], acc.shape)  # noqa
        # Exact division for float32 queries; approximate reciprocal plus a
        # cast to q_dtype otherwise.
        out = (lax.div(acc, l) if q_dtype == jnp.float32 else
               (acc * pl.reciprocal(l, approx=True)).astype(q_dtype))

        # Wait for previous bo to be fully sent before storing new bo.
        bo_sem_idx = sem_ids_ref[2]
        sem_ids_ref[2] = lax.select(bo_sem_idx == 0, 1, 0)
        wait_send_bo(bo_sem_idx)

        # Store output from acc to bo.
        bo_x2_ref.at[bo_sem_idx].bitcast(jnp.int32).reshape(
            actual_num_kv_heads,
            bq_sz * num_q_heads_per_kv_head_per_packing,
            head_dim,
        )[...] = pltpu.bitcast(out, jnp.int32)

        # Send cur bo
        start_send_bo(seq_idx, bq_idx, bo_sem_idx)

    lax.fori_loop(0, num_bq, compute_with_bq, None, unroll=False)
842
+
843
+ ### ------- Kernel start ------- ###
844
+
845
+ @pl.when(seq_idx == 0)
846
+ def prologue():
847
+ start_fetch_bq(0, 0, 0)
848
+ start_fetch_bkv(0, 0, 0)
849
+
850
+ @pl.when(seq_idx < decode_end)
851
+ def process_decode():
852
+ process(static_q_len=1)
853
+
854
+ @pl.when(jnp.logical_and(decode_end <= seq_idx, seq_idx < prefill_end))
855
+ def process_prefill():
856
+ process(static_q_len=chunk_prefill_size)
857
+
858
+ @pl.when(jnp.logical_and(prefill_end <= seq_idx, seq_idx < mixed_end))
859
+ def process_mixed():
860
+ process()
861
+
862
+ @pl.when(seq_idx == num_seqs - 1)
863
+ def epilogue():
864
+ for i in range(2):
865
+ wait_send_bo(i)
866
+ wait_update_kv_cache(i)
867
+
868
+ ### ------- Kernel end ------- ###
869
+
870
+
871
def merge_kv(
        k: jax.Array,  # [max_num_tokens, actual_num_kv_heads, actual_head_dim],
        v: jax.Array,  # [max_num_tokens, actual_num_kv_heads, actual_head_dim],
):
    """Interleave `k` and `v` per head, then pad and pack the result.

    Args:
      k: keys; must have the same shape and dtype as `v`.
      v: values.

    Returns:
      An array of shape
      `[max_num_tokens, num_kv_heads_x2 // kv_packing, kv_packing, head_dim]`
      where `num_kv_heads_x2` is twice the head count rounded up to a
      multiple of the dtype packing, and `head_dim` is the head size rounded
      up to a multiple of 128. Padding is filled with zeros.
    """
    assert k.shape == v.shape
    assert k.dtype == v.dtype
    num_tokens, num_heads, dim = k.shape
    packing = get_dtype_packing(k.dtype)
    heads_x2 = num_heads * 2
    padded_heads_x2 = align_to(heads_x2, packing)
    padded_dim = align_to(dim, 128)

    # Joining on the last axis and splitting it again places each head's k
    # row directly before its v row.
    interleaved = jnp.concat([k, v], axis=-1)
    interleaved = interleaved.reshape(num_tokens, heads_x2, dim)

    pad_widths = (
        (0, 0),
        (0, padded_heads_x2 - heads_x2),
        (0, padded_dim - dim),
    )
    padded = jnp.pad(interleaved, pad_widths, constant_values=0)
    return padded.reshape(
        num_tokens,
        padded_heads_x2 // packing,
        packing,
        padded_dim,
    )
901
+
902
+
903
def prepare_inputs(
        q: jax.Array,  # [max_num_tokens, actual_num_q_heads, actual_head_dim],
        k: jax.Array,  # [max_num_tokens, actual_num_kv_heads, actual_head_dim],
        v: jax.Array,  # [max_num_tokens, actual_num_kv_heads, actual_head_dim],
):
    """Rearrange `q` into the kernel layout and merge `k` and `v`.

    The query heads are grouped by kv head, zero-padded so the group size is
    a multiple of the query dtype packing and the head size a multiple of
    128, split into packed groups, and finally moved kv-head-major.

    Returns:
      `(q, kv)` where `q` has shape `[actual_num_kv_heads, max_num_tokens,
      num_q_heads_per_kv_head // q_packing, q_packing, head_dim]` and `kv`
      is the result of `merge_kv(k, v)`.
    """
    num_tokens, num_q_heads, dim = q.shape
    num_kv_heads = k.shape[1]
    assert num_q_heads % num_kv_heads == 0
    group_size = num_q_heads // num_kv_heads
    packing = get_dtype_packing(q.dtype)
    padded_group_size = align_to(group_size, packing)
    padded_dim = align_to(dim, 128)

    grouped = q.reshape(num_tokens, num_kv_heads, group_size, dim)
    pad_widths = (
        (0, 0),
        (0, 0),
        (0, padded_group_size - group_size),
        (0, padded_dim - dim),
    )
    padded = jnp.pad(grouped, pad_widths, constant_values=0)
    packed = padded.reshape(
        num_tokens,
        num_kv_heads,
        padded_group_size // packing,
        packing,
        padded_dim,
    )
    # TODO(jevinjiang): Explore fusing swapping non-tiling axis to DMA.
    q = packed.swapaxes(0, 1)
    # TODO(kyuyeunk, chengjiyao): Add kv quantization here.
    return q, merge_kv(k, v)
945
+
946
+
947
def prepare_outputs(
    out,  # [actual_num_kv_heads, max_num_tokens, num_q_heads_per_kv_head // q_packing, q_packing, head_dim]
    actual_num_q_heads_per_kv_head: int,
    actual_head_dim: int,
):
    """Convert the kernel's packed output back to a token-major 3D array.

    Args:
      out: 5D kernel output, kv-head-major.
      actual_num_q_heads_per_kv_head: number of query heads per kv head to
        keep; any further heads are dropped.
      actual_head_dim: head size to keep; any further columns are dropped.

    Returns:
      An array of shape `[max_num_tokens, actual_num_q_heads,
      actual_head_dim]`, where `actual_num_q_heads` is
      `actual_num_q_heads_per_kv_head * actual_num_kv_heads`.
    """
    num_kv_heads, num_tokens, groups_per_packing, packing, head_dim = out.shape

    # Put tokens first, then merge the two packing axes into one head axis.
    token_major = out.swapaxes(0, 1)
    unpacked = token_major.reshape(
        num_tokens,
        num_kv_heads,
        groups_per_packing * packing,
        head_dim,
    )
    # Drop the heads and columns beyond the requested sizes.
    trimmed = unpacked[:, :, :actual_num_q_heads_per_kv_head, :actual_head_dim]
    total_q_heads = actual_num_q_heads_per_kv_head * num_kv_heads
    return trimmed.reshape(num_tokens, total_q_heads, actual_head_dim)
967
+
968
+
969
+ # Expect to run this validation during runtime.
970
def dynamic_validate_inputs(
    queries: jax.Array,  # [max_num_tokens, actual_num_q_heads, actual_head_dim]
    keys: jax.Array,  # [max_num_tokens, actual_num_kv_heads, actual_head_dim]
    values: jax.Array,  # [max_num_tokens, actual_num_kv_heads, actual_head_dim]
    kv_cache: jax.Array,  # [total_num_pages, page_size, num_kv_heads_x2 // kv_packing, kv_packing, head_dim]
    kv_lens: jax.Array,  # i32[max_num_seqs]
    page_indices: jax.Array,  # i32[max_num_seqs * pages_per_seq]
    cu_q_lens: jax.Array,  # i32[max_num_seqs + 1]
    distribution: jax.Array,  # i32[3]
    *,
    sm_scale: float = 1.0,
    sliding_window: int | None = None,
    soft_cap: float | None = None,
    mask_value: float | None = DEFAULT_MASK_VALUE,
    q_scale: float | None = None,
    k_scale: float | None = None,
    v_scale: float | None = None,
    # Kernel optimization params.
    chunk_prefill_size: int | None = None,
    # Kernel tuning params.
    num_kv_pages_per_block: int | None = None,
    num_queries_per_block: int | None = None,
    vmem_limit_bytes: int | None = None,
    # Debug params.
    debug_mode: bool = False,
):
    """Validate the values held by the RPA inputs, not just shapes and dtypes.

    Runs `static_validate_inputs` first, then inspects `distribution`,
    `cu_q_lens`, `kv_lens` and `page_indices` element by element.

    Raises:
      ValueError: if the distribution is not non-decreasing, describes more
        sequences than fit, the query tokens exceed `max_num_tokens`, a
        sequence violates `0 < q_len <= kv_len`, a sequence needs more pages
        than `pages_per_seq`, or a used page index is out of range.
    """
    static_validate_inputs(
        queries,
        keys,
        values,
        kv_cache,
        kv_lens,
        page_indices,
        cu_q_lens,
        distribution,
        sm_scale=sm_scale,
        sliding_window=sliding_window,
        soft_cap=soft_cap,
        mask_value=mask_value,
        q_scale=q_scale,
        k_scale=k_scale,
        v_scale=v_scale,
        chunk_prefill_size=chunk_prefill_size,
        num_kv_pages_per_block=num_kv_pages_per_block,
        num_queries_per_block=num_queries_per_block,
        vmem_limit_bytes=vmem_limit_bytes,
        debug_mode=debug_mode,
    )
    max_num_tokens = queries.shape[0]
    total_num_pages, page_size = kv_cache.shape[:2]
    max_num_seqs = kv_lens.shape[0]
    pages_per_seq, leftover = divmod(page_indices.shape[0], max_num_seqs)
    assert leftover == 0

    decode_end, prefill_end, num_seqs = distribution
    if decode_end > prefill_end or prefill_end > num_seqs:
        raise ValueError(f"Invalid distribution: {distribution=}")

    if num_seqs > max_num_seqs:
        raise ValueError(f"num_seqs={num_seqs} must be <= {max_num_seqs=}")

    if cu_q_lens[num_seqs] > max_num_tokens:
        raise ValueError(
            f"Total q tokens {cu_q_lens[num_seqs]} must be <= {max_num_tokens=}."
        )

    for seq in range(num_seqs):
        q_len = cu_q_lens[seq + 1] - cu_q_lens[seq]
        kv_len = kv_lens[seq]
        if not (0 < q_len <= kv_len):
            raise ValueError(
                f"Require 0 < {q_len=} <= {kv_len=} at sequence {seq}.")

        page_cnt = cdiv(kv_len, page_size)
        if page_cnt > pages_per_seq:
            raise ValueError(
                f"Require {page_cnt=} <= {pages_per_seq=} at sequence {seq} where"
                f" {kv_len=} and {page_size=}.")

        # Only the pages this sequence actually uses are checked.
        first_slot = seq * pages_per_seq
        for page in range(page_cnt):
            page_idx = page_indices[first_slot + page]
            if not (0 <= page_idx < total_num_pages):
                raise ValueError(
                    f"Require 0 <= {page_idx=} < {total_num_pages=} at sequence"
                    f" {seq} where {kv_len=} and {page_size=}.")
1057
+
1058
+
1059
+ # Expect to run this validation during compile time.
1060
def static_validate_inputs(
    queries: jax.Array,  # [max_num_tokens, actual_num_q_heads, actual_head_dim]
    keys: jax.Array,  # [max_num_tokens, actual_num_kv_heads, actual_head_dim]
    values: jax.Array,  # [max_num_tokens, actual_num_kv_heads, actual_head_dim]
    kv_cache: jax.Array,  # [total_num_pages, page_size, num_kv_heads_x2 // kv_packing, kv_packing, head_dim]
    kv_lens: jax.Array,  # i32[max_num_seqs]
    page_indices: jax.Array,  # i32[max_num_seqs * pages_per_seq]
    cu_q_lens: jax.Array,  # i32[max_num_seqs + 1]
    distribution: jax.Array,  # i32[3]
    *,
    sm_scale: float = 1.0,
    sliding_window: int | None = None,
    soft_cap: float | None = None,
    mask_value: float | None = DEFAULT_MASK_VALUE,
    q_scale: float | None = None,
    k_scale: float | None = None,
    v_scale: float | None = None,
    # Kernel optimization params.
    chunk_prefill_size: int | None = None,
    # Kernel tuning params.
    num_kv_pages_per_block: int | None = None,
    num_queries_per_block: int | None = None,
    vmem_limit_bytes: int | None = None,
    # Debug params.
    debug_mode: bool = False,
):
    """Validate inputs to the RPA kernel statically.

    Only shapes, dtypes and the Python-level keyword arguments are
    inspected; array contents are never read.

    Raises:
      ValueError: if any shape, dtype or parameter constraint is violated.
    """
    q, k, v = queries, keys, values
    if not (len(q.shape) == len(k.shape) == len(v.shape) == 3):
        # The check is for rank 3, so the message must say 3D.
        raise ValueError(
            f"Expected 3D array for {q.shape=}, {k.shape=}, {v.shape=}")
    if k.shape != v.shape:
        raise ValueError(f"Expected {k.shape=} to be equal to {v.shape=}")
    if not (q.shape[0] == k.shape[0] == v.shape[0]):
        raise ValueError(
            f"Expected {q.shape[0]=} to be equal to {k.shape[0]=} and {v.shape[0]=}"
        )
    if not (q.shape[2] == k.shape[2] == v.shape[2]):
        raise ValueError(
            f"Expected {q.shape[2]=} to be equal to {k.shape[2]=} and {v.shape[2]=}"
        )

    actual_head_dim = q.shape[2]
    actual_num_q_heads = q.shape[1]
    actual_num_kv_heads = k.shape[1]

    if actual_num_q_heads % actual_num_kv_heads != 0:
        raise ValueError(f"Expected {actual_num_q_heads=} to be divisible by"
                         f" {actual_num_kv_heads=}.")

    # Report a wrong cache rank explicitly instead of letting the tuple
    # unpacking below fail with an opaque message.
    if len(kv_cache.shape) != 5:
        raise ValueError(f"Expected 5D array for {kv_cache.shape=}")
    (
        _,
        page_size,
        num_kv_heads_x2_per_kv_packing,
        kv_packing,
        head_dim,
    ) = kv_cache.shape

    if head_dim != align_to(actual_head_dim, 128):
        raise ValueError(
            f"Expected {head_dim=} is equal to {align_to(actual_head_dim, 128)=}"
        )
    # Note: we expect the kv quantization happens outside of the RPA kernel.
    if not (kv_cache.dtype == k.dtype == v.dtype):
        raise ValueError(
            f"Expected {kv_cache.dtype=} to be equal to {k.dtype=} and {v.dtype=}."
        )
    # Integer kv quantization is currently not supported.
    if not jnp.issubdtype(kv_cache.dtype, jnp.floating):
        raise ValueError(f"Expected {kv_cache.dtype=} to be a floating point.")
    if kv_packing != get_dtype_packing(kv_cache.dtype):
        raise ValueError(
            f"{kv_packing=} does not match with {kv_cache.dtype=}")

    num_kv_heads_x2 = num_kv_heads_x2_per_kv_packing * kv_packing
    if num_kv_heads_x2 % 2 != 0:
        raise ValueError(
            f"Combined KV heads must be divisible by 2, but got {num_kv_heads_x2}"
        )
    if align_to(actual_num_kv_heads * 2, kv_packing) != num_kv_heads_x2:
        raise ValueError(
            f"Invalid {num_kv_heads_x2=}, {actual_num_kv_heads=}, {kv_packing=}"
        )

    if not (jnp.int32 == kv_lens.dtype == page_indices.dtype == cu_q_lens.dtype
            == distribution.dtype):
        raise ValueError(
            f"Expected int32 dtype for {kv_lens.dtype=}, {page_indices.dtype=},"
            f" {cu_q_lens.dtype=}, {distribution.dtype=}")

    if not (len(kv_lens.shape) == len(page_indices.shape) == len(
            cu_q_lens.shape) == 1):
        raise ValueError(
            f"Expected 1D array for {kv_lens.shape=}, {page_indices.shape=},"
            f" {cu_q_lens.shape=}")

    max_num_seqs = kv_lens.shape[0]
    num_page_indices = page_indices.shape[0]
    if num_page_indices % max_num_seqs != 0:
        raise ValueError(
            f"Expected {num_page_indices=} to be divisible by {max_num_seqs=}."
        )
    if cu_q_lens.shape != (max_num_seqs + 1, ):
        raise ValueError(
            f"Expected {cu_q_lens.shape=} to be ({max_num_seqs + 1},).")
    if distribution.shape != (3, ):
        raise ValueError(f"Expected {distribution.shape=} to be (3,).")

    if page_size % kv_packing != 0:
        raise ValueError(f"{page_size=} must be divisible by {kv_packing=}.")
    if sliding_window is not None and sliding_window <= 0:
        raise ValueError(f"{sliding_window=} must be positive.")
    if soft_cap is not None and soft_cap == 0.0:
        raise ValueError(f"{soft_cap=} must not be 0.0.")
    if chunk_prefill_size is not None and chunk_prefill_size <= 0:
        raise ValueError(f"{chunk_prefill_size=} must be positive.")
    if num_kv_pages_per_block is not None:
        if num_kv_pages_per_block <= 0:
            raise ValueError(f"{num_kv_pages_per_block=} must be positive.")
    if num_queries_per_block is not None:
        if num_queries_per_block <= 0:
            raise ValueError(f"{num_queries_per_block=} must be positive.")
    if vmem_limit_bytes is not None and vmem_limit_bytes <= 0:
        raise ValueError(f"{vmem_limit_bytes=} must be positive.")

    # No constraints for the following inputs.
    del sm_scale
    del mask_value
    del q_scale
    del k_scale
    del v_scale
    del debug_mode
1195
+
1196
+
1197
@functools.partial(
    jax.jit,
    static_argnames=(
        "sm_scale",
        "sliding_window",
        "soft_cap",
        "mask_value",
        "q_scale",
        "k_scale",
        "v_scale",
        "chunk_prefill_size",
        "num_kv_pages_per_block",
        "num_queries_per_block",
        "vmem_limit_bytes",
        "debug_mode",
    ),
    donate_argnames=("kv_cache", ),
)
def ragged_paged_attention(
    queries: jax.Array,  # [max_num_tokens, actual_num_q_heads, actual_head_dim]
    keys: jax.Array,  # [max_num_tokens, actual_num_kv_heads, actual_head_dim]
    values: jax.Array,  # [max_num_tokens, actual_num_kv_heads, actual_head_dim]
    kv_cache: jax.Array,  # [total_num_pages, page_size, num_kv_heads_x2 // kv_packing, kv_packing, head_dim]
    kv_lens: jax.Array,  # i32[max_num_seqs]
    page_indices: jax.Array,  # i32[max_num_seqs * pages_per_seq]
    cu_q_lens: jax.Array,  # i32[max_num_seqs + 1]
    distribution: jax.Array,  # i32[3]
    *,
    sm_scale: float = 1.0,
    sliding_window: int | None = None,
    soft_cap: float | None = None,
    mask_value: float | None = DEFAULT_MASK_VALUE,
    q_scale: float | None = None,
    k_scale: float | None = None,
    v_scale: float | None = None,
    # Kernel optimization params.
    chunk_prefill_size: int | None = None,
    # Kernel tuning params.
    num_kv_pages_per_block: int | None = None,
    num_queries_per_block: int | None = None,
    vmem_limit_bytes: int | None = None,
    # Debug params.
    debug_mode: bool = False,
):
    """Ragged paged attention that supports mixed prefill and decode.

    Args:
      queries: concatenated all sequences' queries.
      keys: concatenated all sequences' keys (quantized).
      values: concatenated all sequences' values (quantized).
      kv_cache: paged KV cache with TPU-friendly shape. The buffer is donated
        to the jitted call and the updated cache is returned.
      kv_lens: padded kv lengths. Only the first num_seqs values are valid.
      page_indices: flattened page indices look-up table by (seq_id, page_id).
      cu_q_lens: the cumulative sum of the effective query lengths. Similar to
        kv_lens, only the first num_seqs+1 values are valid.
      distribution: (i, j, k) represents that sequences[0:i] are decode-only,
        sequences[i:j] are chunked-prefill-only, and sequences[j:k] are mixed. The
        k is also the total number of sequences.
      sm_scale: the softmax scale which will be applied to the Q@K^T.
      sliding_window: the sliding window size for the attention.
      soft_cap: the logit soft cap for the attention.
      mask_value: mask value for causal mask.
      q_scale: the scale for the queries; forwarded to the kernel unchanged.
      k_scale: the scale for the key cache.
      v_scale: the scale for the value cache.
      chunk_prefill_size: static query length the kernel uses for the
        chunked-prefill-only sequences; forwarded to the kernel unchanged.
      num_kv_pages_per_block: number of kv pages to be processed in one flash
        attention block in the pallas kernel.
      num_queries_per_block: number of queries to be processed in one flash
        attention block in the pallas kernel.
      vmem_limit_bytes: the vmem limit for the pallas kernel.
      debug_mode: if true, RPA does not issue any DMAs or run flash attention but
        print debug info. Need to compile with `--xla_tpu_enable_log_recorder`.

    Returns:
      A tuple `(output, updated_kv_cache)`: the attention output with shape
      `[max_num_tokens, actual_num_q_heads, actual_head_dim]` and the kv
      cache produced by the kernel.
    """
    q, k, v = queries, keys, values
    static_validate_inputs(
        q,
        k,
        v,
        kv_cache,
        kv_lens,
        page_indices,
        cu_q_lens,
        distribution,
        sm_scale=sm_scale,
        sliding_window=sliding_window,
        soft_cap=soft_cap,
        mask_value=mask_value,
        q_scale=q_scale,
        k_scale=k_scale,
        v_scale=v_scale,
        chunk_prefill_size=chunk_prefill_size,
        num_kv_pages_per_block=num_kv_pages_per_block,
        num_queries_per_block=num_queries_per_block,
        vmem_limit_bytes=vmem_limit_bytes,
    )

    # Remember the unpadded sizes; they are needed to trim the output.
    actual_num_q_heads = q.shape[1]
    actual_head_dim = q.shape[2]
    actual_num_kv_heads = k.shape[1]

    actual_num_q_heads_per_kv_head = actual_num_q_heads // actual_num_kv_heads
    # From here on `q` is the padded, packed, kv-head-major layout.
    q, kv = prepare_inputs(q, k, v)
    (
        _,
        max_num_tokens,
        num_q_heads_per_kv_head_per_q_packing,
        q_packing,
        head_dim,
    ) = q.shape
    page_size = kv_cache.shape[1]
    max_num_seqs = kv_lens.shape[0]
    num_page_indices = page_indices.shape[0]
    assert num_page_indices % max_num_seqs == 0
    pages_per_seq = num_page_indices // max_num_seqs
    num_q_heads_per_kv_head = num_q_heads_per_kv_head_per_q_packing * q_packing

    bkv_p = num_kv_pages_per_block
    bq_sz = num_queries_per_block
    # If either block size is missing, BOTH are replaced by the tuned values.
    if bq_sz is None or bkv_p is None:
        bkv_p, bq_sz = get_tuned_block_sizes(
            q.dtype,
            kv_cache.dtype,
            actual_num_q_heads,
            actual_num_kv_heads,
            head_dim,
            page_size,
            max_num_tokens,
            pages_per_seq,
        )
    bkv_sz = bkv_p * page_size
    if vmem_limit_bytes is None:
        # TODO (jevinjiang/jacobplatin): change this to use
        # `get_vmem_estimate_bytes` when VREG spilling is fixed.
        vmem_limit_bytes = DEFAULT_VMEM_LIMIT_BYTES
    # One grid step per sequence.
    grid = (distribution[2], )

    # q, kv and kv_cache.
    in_specs = [
        pl.BlockSpec(memory_space=pltpu.HBM),
        pl.BlockSpec(memory_space=pltpu.HBM),
        pl.BlockSpec(memory_space=pltpu.HBM),
    ]

    # Attention output and updated kv cache.
    out_specs = [
        pl.BlockSpec(memory_space=pltpu.HBM),
        pl.BlockSpec(memory_space=pltpu.HBM),
    ]

    bkv_double_buf = pltpu.VMEM(
        (2, bkv_sz, *kv_cache.shape[2:]),
        kv_cache.dtype,
    )

    bq_double_buf = pltpu.VMEM(
        (2, actual_num_kv_heads, bq_sz, *q.shape[2:]),
        q.dtype,
    )

    # The output buffer has the same shape and dtype as the query buffer.
    bo_double_buf = bq_double_buf

    l_scratch = pltpu.VMEM(
        (actual_num_kv_heads, bq_sz * num_q_heads_per_kv_head, 128),
        jnp.float32,
    )
    m_scratch = l_scratch

    acc_scratch = pltpu.VMEM(
        (actual_num_kv_heads, bq_sz * num_q_heads_per_kv_head, head_dim),
        jnp.float32,
    )

    scratch_shapes = [
        bkv_double_buf,  # Double buffering for kv block.
        bq_double_buf,  # Double buffering for q block.
        bo_double_buf,  # Double buffering for output block.
        # Semaphores for double buffering of bkv, bq, bo and bkv_update.
        pltpu.SemaphoreType.DMA((4, 2)),
        # Intermediate buffers per kv head for flash attention.
        l_scratch,
        m_scratch,
        acc_scratch,
    ]

    scalar_prefetches = (
        kv_lens,
        # TODO(jevinjiang): can we use ragged page_indices to save some smem?
        page_indices,
        cu_q_lens,
        distribution,
        # (bq_sem_idx, bkv_sem_idx, bo_sem_idx)
        jnp.zeros((3, ), jnp.int32),
        # (bo_sem_0_seq_idx, bo_sem_1_seq_idx, bo_sem_0_bo_idx, bo_sem_1_bo_idx)
        jnp.full((4, ), -1, jnp.int32),
        # (bkv_sem_0_seq_idx, bkv_sem_1_seq_idx, bkv_sem_0_offset, bkv_sem_1_offset, bkv_sem_0_sz, bkv_sem_1_sz)
        jnp.full((6, ), -1, jnp.int32),
    )

    scope_name = f"RPA-bq_{bq_sz}-bkvp_{bkv_p}-p_{page_size}"
    kernel = jax.named_scope(scope_name)(
        pl.pallas_call(
            functools.partial(
                _ragged_paged_attention_kernel,
                sm_scale=sm_scale,
                sliding_window=sliding_window,
                soft_cap=soft_cap,
                mask_value=mask_value,
                q_scale=q_scale,
                k_scale=k_scale,
                v_scale=v_scale,
                chunk_prefill_size=chunk_prefill_size,
                bq_sz=bq_sz,
                bkv_p=bkv_p,
                debug_mode=debug_mode,
            ),
            grid_spec=pltpu.PrefetchScalarGridSpec(
                num_scalar_prefetch=len(scalar_prefetches),
                in_specs=in_specs,
                out_specs=out_specs,
                grid=grid,
                scratch_shapes=scratch_shapes,
            ),
            compiler_params=pltpu.CompilerParams(
                # TODO(jevinjiang): since each sequence depends on the previous
                # one, we need some extra work to support Megacore mode.
                dimension_semantics=("arbitrary", ),
                vmem_limit_bytes=vmem_limit_bytes,
            ),
            out_shape=[
                jax.ShapeDtypeStruct(shape=q.shape, dtype=q.dtype),
                jax.ShapeDtypeStruct(shape=kv_cache.shape,
                                     dtype=kv_cache.dtype),
            ],
            # Operand indices count the 7 scalar prefetches first, so 7 is
            # `q` (aliased to the attention output) and 9 is `kv_cache`
            # (aliased to the updated cache).
            input_output_aliases={
                7: 0,
                9: 1
            },
            name=scope_name,
        ))

    output, updated_kv_cache = kernel(*scalar_prefetches, q, kv, kv_cache)
    return (
        prepare_outputs(output, actual_num_q_heads_per_kv_head,
                        actual_head_dim),
        updated_kv_cache,
    )