tpu-inference 0.11.1.dev202511150811__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tpu-inference might be problematic. Click here for more details.

Files changed (179) hide show
  1. tests/__init__.py +0 -0
  2. tests/core/__init__.py +0 -0
  3. tests/core/test_core_tpu.py +513 -0
  4. tests/core/test_disagg_executor.py +60 -0
  5. tests/core/test_disagg_utils.py +53 -0
  6. tests/core/test_dp_scheduler.py +899 -0
  7. tests/core/test_init.py +49 -0
  8. tests/kernels/__init__.py +0 -0
  9. tests/kernels/fused_moe_v1_test.py +105 -0
  10. tests/kernels/mla_v1_test.py +396 -0
  11. tests/kernels/quantized_matmul_kernel_test.py +191 -0
  12. tests/kernels/ragged_kv_cache_update_v2_test.py +234 -0
  13. tests/kernels/ragged_paged_attention_kernel_v2_test.py +400 -0
  14. tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +549 -0
  15. tests/kernels/ragged_paged_attention_kernel_v3_test.py +504 -0
  16. tests/lora/__init__.py +0 -0
  17. tests/lora/conftest.py +32 -0
  18. tests/lora/test_bgmv.py +43 -0
  19. tests/lora/test_layers.py +654 -0
  20. tests/lora/test_lora.py +133 -0
  21. tests/lora/utils.py +96 -0
  22. tests/test_base.py +201 -0
  23. tests/test_envs.py +182 -0
  24. tests/test_quantization.py +836 -0
  25. tests/test_tpu_info.py +120 -0
  26. tests/test_utils.py +236 -0
  27. tpu_inference/__init__.py +34 -0
  28. tpu_inference/core/__init__.py +0 -0
  29. tpu_inference/core/core_tpu.py +786 -0
  30. tpu_inference/core/disagg_executor.py +118 -0
  31. tpu_inference/core/disagg_utils.py +51 -0
  32. tpu_inference/core/sched/__init__.py +0 -0
  33. tpu_inference/core/sched/dp_scheduler.py +523 -0
  34. tpu_inference/distributed/__init__.py +0 -0
  35. tpu_inference/distributed/jax_parallel_state.py +67 -0
  36. tpu_inference/distributed/tpu_connector.py +728 -0
  37. tpu_inference/distributed/utils.py +59 -0
  38. tpu_inference/env_override.py +9 -0
  39. tpu_inference/envs.py +107 -0
  40. tpu_inference/executors/__init__.py +0 -0
  41. tpu_inference/executors/ray_distributed_executor.py +362 -0
  42. tpu_inference/experimental/__init__.py +0 -0
  43. tpu_inference/experimental/llama3_jax_stashed.py +258 -0
  44. tpu_inference/kernels/__init__.py +0 -0
  45. tpu_inference/kernels/collectives/__init__.py +0 -0
  46. tpu_inference/kernels/collectives/all_gather_matmul.py +735 -0
  47. tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +60 -0
  48. tpu_inference/kernels/collectives/util.py +47 -0
  49. tpu_inference/kernels/flash_attention/__init__.py +0 -0
  50. tpu_inference/kernels/flash_attention/kernel.py +772 -0
  51. tpu_inference/kernels/fused_moe/__init__.py +0 -0
  52. tpu_inference/kernels/fused_moe/v1/__init__.py +0 -0
  53. tpu_inference/kernels/fused_moe/v1/kernel.py +1035 -0
  54. tpu_inference/kernels/mla/__init__.py +0 -0
  55. tpu_inference/kernels/mla/v1/__init__.py +0 -0
  56. tpu_inference/kernels/mla/v1/kernel.py +1349 -0
  57. tpu_inference/kernels/quantized_matmul/__init__.py +0 -0
  58. tpu_inference/kernels/quantized_matmul/kernel.py +395 -0
  59. tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
  60. tpu_inference/kernels/quantized_matmul/util.py +58 -0
  61. tpu_inference/kernels/ragged_paged_attention/__init__.py +0 -0
  62. tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +0 -0
  63. tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +875 -0
  64. tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +287 -0
  65. tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
  66. tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +0 -0
  67. tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1478 -0
  68. tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +1482 -0
  69. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +4147 -0
  70. tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +367 -0
  71. tpu_inference/kernels/ragged_paged_attention/v3/util.py +51 -0
  72. tpu_inference/layers/__init__.py +0 -0
  73. tpu_inference/layers/common/__init__.py +0 -0
  74. tpu_inference/layers/common/attention_interface.py +390 -0
  75. tpu_inference/layers/common/attention_metadata.py +34 -0
  76. tpu_inference/layers/common/binary_search.py +295 -0
  77. tpu_inference/layers/common/quant_methods.py +8 -0
  78. tpu_inference/layers/common/sharding.py +582 -0
  79. tpu_inference/layers/jax/__init__.py +0 -0
  80. tpu_inference/layers/jax/attention/__init__.py +0 -0
  81. tpu_inference/layers/jax/attention/attention.py +255 -0
  82. tpu_inference/layers/jax/attention/deepseek_v3_attention.py +354 -0
  83. tpu_inference/layers/jax/attention/gpt_oss_attention.py +262 -0
  84. tpu_inference/layers/jax/attention/llama4_attention.py +153 -0
  85. tpu_inference/layers/jax/base.py +151 -0
  86. tpu_inference/layers/jax/constants.py +88 -0
  87. tpu_inference/layers/jax/layers.py +301 -0
  88. tpu_inference/layers/jax/misc.py +16 -0
  89. tpu_inference/layers/jax/moe/__init__.py +0 -0
  90. tpu_inference/layers/jax/moe/deepseek_v3_moe.py +608 -0
  91. tpu_inference/layers/jax/moe/gpt_oss_moe.py +185 -0
  92. tpu_inference/layers/jax/moe/moe.py +209 -0
  93. tpu_inference/layers/jax/rope.py +280 -0
  94. tpu_inference/layers/jax/rope_interface.py +214 -0
  95. tpu_inference/layers/jax/sample/__init__.py +0 -0
  96. tpu_inference/layers/jax/sample/rejection_sampler.py +515 -0
  97. tpu_inference/layers/jax/sample/sampling.py +96 -0
  98. tpu_inference/layers/jax/sample/sampling_metadata.py +76 -0
  99. tpu_inference/layers/jax/transformer_block.py +107 -0
  100. tpu_inference/layers/vllm/__init__.py +0 -0
  101. tpu_inference/layers/vllm/attention.py +221 -0
  102. tpu_inference/layers/vllm/fused_moe.py +507 -0
  103. tpu_inference/layers/vllm/linear_common.py +186 -0
  104. tpu_inference/layers/vllm/quantization/__init__.py +39 -0
  105. tpu_inference/layers/vllm/quantization/awq.py +207 -0
  106. tpu_inference/layers/vllm/quantization/common.py +105 -0
  107. tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +0 -0
  108. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +120 -0
  109. tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +203 -0
  110. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +0 -0
  111. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +208 -0
  112. tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +136 -0
  113. tpu_inference/layers/vllm/quantization/mxfp4.py +266 -0
  114. tpu_inference/layers/vllm/quantization/unquantized.py +386 -0
  115. tpu_inference/layers/vllm/sharding.py +230 -0
  116. tpu_inference/logger.py +10 -0
  117. tpu_inference/lora/__init__.py +0 -0
  118. tpu_inference/lora/torch_lora_ops.py +103 -0
  119. tpu_inference/lora/torch_punica_tpu.py +311 -0
  120. tpu_inference/mock/__init__.py +0 -0
  121. tpu_inference/mock/vllm_config_utils.py +28 -0
  122. tpu_inference/mock/vllm_envs.py +1219 -0
  123. tpu_inference/mock/vllm_logger.py +212 -0
  124. tpu_inference/mock/vllm_logging_utils.py +15 -0
  125. tpu_inference/models/__init__.py +0 -0
  126. tpu_inference/models/common/__init__.py +0 -0
  127. tpu_inference/models/common/model_loader.py +444 -0
  128. tpu_inference/models/jax/__init__.py +0 -0
  129. tpu_inference/models/jax/deepseek_v3.py +868 -0
  130. tpu_inference/models/jax/gpt_oss.py +492 -0
  131. tpu_inference/models/jax/jax_intermediate_tensor.py +79 -0
  132. tpu_inference/models/jax/llama3.py +375 -0
  133. tpu_inference/models/jax/llama4.py +629 -0
  134. tpu_inference/models/jax/llama_eagle3.py +333 -0
  135. tpu_inference/models/jax/phi3.py +376 -0
  136. tpu_inference/models/jax/qwen2.py +375 -0
  137. tpu_inference/models/jax/qwen2_5_vl.py +1103 -0
  138. tpu_inference/models/jax/qwen3.py +302 -0
  139. tpu_inference/models/jax/utils/__init__.py +0 -0
  140. tpu_inference/models/jax/utils/file_utils.py +96 -0
  141. tpu_inference/models/jax/utils/multi_modal_utils.py +163 -0
  142. tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
  143. tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +5 -0
  144. tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +6 -0
  145. tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +5 -0
  146. tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +6 -0
  147. tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +105 -0
  148. tpu_inference/models/jax/utils/quantization/quantization_utils.py +653 -0
  149. tpu_inference/models/jax/utils/weight_utils.py +529 -0
  150. tpu_inference/models/vllm/__init__.py +0 -0
  151. tpu_inference/models/vllm/vllm_model_wrapper.py +286 -0
  152. tpu_inference/models/vllm/vllm_model_wrapper_context.py +45 -0
  153. tpu_inference/platforms/__init__.py +2 -0
  154. tpu_inference/platforms/tpu_platform.py +269 -0
  155. tpu_inference/runner/__init__.py +0 -0
  156. tpu_inference/runner/block_table.py +122 -0
  157. tpu_inference/runner/compilation_manager.py +780 -0
  158. tpu_inference/runner/input_batch.py +435 -0
  159. tpu_inference/runner/kv_cache.py +132 -0
  160. tpu_inference/runner/kv_cache_manager.py +479 -0
  161. tpu_inference/runner/lora_utils.py +92 -0
  162. tpu_inference/runner/multimodal_manager.py +217 -0
  163. tpu_inference/runner/persistent_batch_manager.py +244 -0
  164. tpu_inference/runner/speculative_decoding_manager.py +248 -0
  165. tpu_inference/runner/structured_decoding_manager.py +88 -0
  166. tpu_inference/runner/tpu_runner.py +1620 -0
  167. tpu_inference/runner/utils.py +426 -0
  168. tpu_inference/spec_decode/__init__.py +0 -0
  169. tpu_inference/spec_decode/jax/__init__.py +0 -0
  170. tpu_inference/spec_decode/jax/eagle3.py +367 -0
  171. tpu_inference/tpu_info.py +77 -0
  172. tpu_inference/utils.py +317 -0
  173. tpu_inference/worker/__init__.py +0 -0
  174. tpu_inference/worker/tpu_worker.py +321 -0
  175. tpu_inference-0.11.1.dev202511150811.dist-info/METADATA +107 -0
  176. tpu_inference-0.11.1.dev202511150811.dist-info/RECORD +179 -0
  177. tpu_inference-0.11.1.dev202511150811.dist-info/WHEEL +5 -0
  178. tpu_inference-0.11.1.dev202511150811.dist-info/licenses/LICENSE +201 -0
  179. tpu_inference-0.11.1.dev202511150811.dist-info/top_level.txt +2 -0
@@ -0,0 +1,286 @@
1
+ import copy
2
+ import functools
3
+ from collections.abc import Sequence
4
+ from contextlib import nullcontext
5
+ from typing import Any, List, Optional, Tuple
6
+ from unittest.mock import patch
7
+
8
+ import jax
9
+ import torch
10
+ import torch.nn
11
+ import torchax
12
+ from flax.typing import PRNGKey
13
+ from jax.sharding import Mesh, NamedSharding, PartitionSpec
14
+ from torchax.interop import jax_view, torch_view
15
+ from torchax.ops.mappings import TORCH_DTYPE_TO_JAX
16
+ from vllm.config import VllmConfig
17
+ from vllm.forward_context import set_forward_context
18
+ from vllm.lora.layers import BaseLayerWithLoRA
19
+ from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
20
+ from vllm.model_executor.model_loader import get_model as vllm_get_model
21
+ from vllm.model_executor.models import supports_lora, supports_multimodal
22
+ from vllm.sequence import IntermediateTensors
23
+
24
+ from tpu_inference.layers.common.attention_metadata import AttentionMetadata
25
+ from tpu_inference.layers.vllm.quantization import get_tpu_quantization_config
26
+ from tpu_inference.layers.vllm.sharding import shard_model_to_tpu
27
+ from tpu_inference.logger import init_logger
28
+ from tpu_inference.models.jax.jax_intermediate_tensor import \
29
+ JaxIntermediateTensors
30
+ from tpu_inference.models.vllm.vllm_model_wrapper_context import (
31
+ get_vllm_model_wrapper_context, set_vllm_model_wrapper_context)
32
+ from tpu_inference.runner.lora_utils import replace_lora_metadata
33
+
34
+ logger = init_logger(__name__)
35
+
36
+
37
class _VllmRunner(torch.nn.Module):
    """Adapter exposing a vLLM model's two entry points through one ``forward``.

    ``forward`` accepts keyword arguments only: when ``hidden_state`` is
    present the logits are computed from it, otherwise the wrapped model's own
    forward pass is run on ``input_ids`` / ``positions`` /
    ``intermediate_tensors`` / ``inputs_embeds``.
    """

    def __init__(self, vllm_model: torch.nn.Module):
        super().__init__()
        self.vllm_model = vllm_model

    def forward(self, **kwargs) -> torch.Tensor:
        """Dispatch to hidden-state or logits computation based on the kwargs."""
        if "hidden_state" not in kwargs:
            return self.compute_hidden_state(
                kwargs["input_ids"],
                kwargs["positions"],
                kwargs["intermediate_tensors"],
                kwargs["inputs_embeds"],
            )
        return self.compute_logits(kwargs["hidden_state"])

    def compute_hidden_state(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors],
        inputs_embeds: Optional[torch.Tensor],
    ) -> torch.Tensor:
        """Call the wrapped model positionally and return whatever it returns."""
        return self.vllm_model(input_ids, positions, intermediate_tensors,
                               inputs_embeds)

    def compute_logits(self, hidden_state: torch.Tensor) -> torch.Tensor:
        """Delegate to the wrapped model's ``compute_logits``."""
        wrapped = self.vllm_model
        return wrapped.compute_logits(hidden_state)
67
+
68
+
69
class VllmModelWrapper:
    """Wraps a vLLM PyTorch model and lets it run on the JAX engine."""

    rng: PRNGKey
    mesh: Mesh
    model: _VllmRunner

    def __init__(self, vllm_config: VllmConfig, rng: PRNGKey, mesh: Mesh):
        self.vllm_config = vllm_config
        self.rng = rng
        self.mesh = mesh

        # Replace the quantization config with the TPU-specific one.
        self.vllm_config.quant_config = get_tpu_quantization_config(
            self.vllm_config, self.mesh)

    def load_weights(self):
        """Load the vLLM model on CPU, shard it onto the mesh and return its state.

        Returns:
            A ``(params_and_buffers, lora_manager)`` tuple. The first item is
            the result of ``shard_model_to_tpu`` passed through ``jax_view``;
            the second is the LoRA manager, or ``None`` when the config has no
            ``lora_config``.
        """
        device_config = self.vllm_config.device_config
        # Cache the device slice config since the device config cannot be
        # deep-copied with it attached. It is detached only for the duration
        # of the copy and is put back even if the copy fails.
        slice_config = getattr(device_config, "slice", None)
        if slice_config is not None:
            device_config.slice = None
        try:
            # Clear the cached static forward context, otherwise vLLM model
            # init will fail.
            self.vllm_config.compilation_config.static_forward_context.clear()
            vllm_config_for_load = copy.deepcopy(self.vllm_config)
        finally:
            if slice_config is not None:
                device_config.slice = slice_config

        assert self.vllm_config.model_config.dtype in TORCH_DTYPE_TO_JAX, "The model_config.dtype must be a PyTorch dtype."
        # Set up to load the model into CPU first.
        vllm_config_for_load.device_config.device = "cpu"

        # When expert parallelism is enabled, vLLM loads weights in a
        # sharding-aware manner. Since tpu-inference has its own sharding
        # logic, this may cause errors. Therefore, we disable it during
        # weight loading.
        vllm_config_for_load.parallel_config.enable_expert_parallel = False

        use_random_weights = (
            vllm_config_for_load.load_config.load_format == "dummy")
        if use_random_weights:
            logger.info(
                "Initializing vLLM model with random weights, weight loading skipped."
            )
        # The DummyModelLoader in vLLM calls torch._sync for the torch_xla
        # path when it detects the tpu platform, but we don't need it and it
        # causes a crash without proper setup.
        load_context = patch(
            "torch._sync",
            return_value=None) if use_random_weights else nullcontext()

        # Load the vLLM model and wrap it into a new model whose forward
        # function can calculate the hidden_state and logits.
        with load_context:
            vllm_model = vllm_get_model(vllm_config=vllm_config_for_load)
        lora_manager = None
        if vllm_config_for_load.lora_config is not None:
            # Replace layers in the model with LoRA layers.
            with torchax.default_env():
                # Argument "device" in load_lora_model is used to set the
                # device used in the punica wrapper.
                lora_manager, vllm_model = load_lora_model(
                    vllm_model, vllm_config_for_load, device="jax")
            replace_set_lora(vllm_model)

        # Hand the forward context populated on the copied config back to the
        # config this wrapper keeps using.
        static_forward_context = vllm_config_for_load.compilation_config.static_forward_context
        self.vllm_config.compilation_config.static_forward_context = static_forward_context

        self.model = _VllmRunner(vllm_model)
        params_and_buffers = shard_model_to_tpu(self.model, self.mesh)

        # Returning to the jax land, so we need to wrap it into a JaxValue.
        return jax_view(params_and_buffers), lora_manager

    def jit_step_func(self):
        """Build the jitted function that runs one forward step of the model.

        Returns:
            A ``jax.jit``-compiled callable; see ``step_fun`` for its
            arguments and return value.
        """

        @functools.partial(
            jax.jit,
            donate_argnames=("kv_caches", ),
            compiler_options={
                "xla_tpu_all_gather_collective_matmul_mode":
                "post_spmd_conservative",
                "xla_tpu_reduce_scatter_collective_matmul_mode":
                "post_spmd_conservative"
            },
            static_argnames=("layer_name_to_kvcache_index", "is_first_rank",
                             "is_last_rank"),
        )
        def step_fun(
            params_and_buffers,  # This has been wrapped into torchax TorchValue
            kv_caches: List[jax.Array],
            input_ids: jax.Array,
            attn_metadata: AttentionMetadata,
            input_embeds: jax.Array,
            layer_name_to_kvcache_index: Sequence[Tuple[str, int]],
            lora_metadata,
            intermediate_tensors: Optional[JaxIntermediateTensors] = None,
            is_first_rank: bool = True,
            is_last_rank: bool = True,
            *args,
        ) -> Tuple[List[jax.Array], Any, List[Any]]:
            """Run the wrapped torch model for one step.

            NOTE: ``input_embeds`` is currently unused; the model is always
            called with ``inputs_embeds=None``.

            Returns:
                ``(new_kv_caches, output, [])``. ``output`` is a
                ``JaxIntermediateTensors`` when ``is_last_rank`` is false and
                the ``jax_view`` of the model output otherwise. The third
                item is always an empty list.
            """
            # Passed as a sequence of pairs (it is a static jit argument);
            # rebuilt into a dict for lookups.
            layer_name_to_kvcache_index = dict(layer_name_to_kvcache_index)
            lora_metadata = torch_view(lora_metadata)
            with torchax.default_env(), set_vllm_model_wrapper_context(
                    kv_caches=kv_caches,
                    mesh=self.mesh,
                    layer_name_to_kvcache_index=layer_name_to_kvcache_index
            ), set_forward_context(attn_metadata=attn_metadata,
                                   vllm_config=self.vllm_config):
                # We need to wrap args from jax land into TorchValue with
                # torch_view in order to call the Torch function.
                original_lora_metadata = replace_lora_metadata(
                    self.model, lora_metadata, self.vllm_config.lora_config)
                try:
                    if not is_first_rank:
                        intermediate_tensors = intermediate_tensors.to_torch()
                    output_from_torch = torch.func.functional_call(
                        self.model,
                        torch_view(params_and_buffers),
                        kwargs={
                            "input_ids": torch_view(input_ids),
                            "positions":
                            torch_view(attn_metadata.input_positions),
                            "intermediate_tensors": intermediate_tensors,
                            "inputs_embeds": None,
                        },
                        tie_weights=False,
                    )
                finally:
                    # Always put the original LoRA metadata back so a failed
                    # call does not leave the swapped-in values on the model.
                    replace_lora_metadata(self.model, original_lora_metadata,
                                          self.vllm_config.lora_config)
                vllm_model_wrapper_context = get_vllm_model_wrapper_context()
                new_kv_caches = vllm_model_wrapper_context.kv_caches
            # Wrap the output (hidden states or intermediate tensor)
            # from torch land into a JaxValue for the jax code to consume.
            if not is_last_rank:
                output = JaxIntermediateTensors.from_torch(output_from_torch)
            else:
                output = jax_view(output_from_torch)
            return new_kv_caches, output, []

        return step_fun

    def jit_compute_logits_func(self):
        """Build the jitted function that turns hidden states into logits.

        Returns:
            A ``jax.jit``-compiled callable taking ``(params_and_buffers,
            hidden_states, lora_metadata)`` and returning the logits, with the
            output sharded as ``PartitionSpec(None, "model")`` on ``self.mesh``.
        """

        @functools.partial(
            jax.jit,
            out_shardings=(NamedSharding(self.mesh,
                                         PartitionSpec(None, "model"))),
        )
        def compute_logits_func(
            params_and_buffers: Any,
            hidden_states: jax.Array,
            lora_metadata,
        ) -> jax.Array:
            lora_metadata = torch_view(lora_metadata)
            with torchax.default_env(), set_vllm_model_wrapper_context(
                    kv_caches=None, mesh=self.mesh):
                original_lora_metadata = replace_lora_metadata(
                    self.model, lora_metadata, self.vllm_config.lora_config)
                try:
                    logits = torch.func.functional_call(
                        self.model,
                        torch_view(params_and_buffers),
                        kwargs={
                            "hidden_state": torch_view(hidden_states),
                        },
                        tie_weights=False,
                    )
                finally:
                    # Restore the original LoRA metadata even on failure.
                    replace_lora_metadata(self.model, original_lora_metadata,
                                          self.vllm_config.lora_config)
            return jax_view(logits)

        return compute_logits_func
240
+
241
+
242
def load_lora_model(
        model: torch.nn.Module, vllm_config: VllmConfig, device: str
) -> Tuple[LRUCacheWorkerLoRAManager, torch.nn.Module]:
    """Create a LoRA manager for ``model`` and return it with the managed model.

    Args:
        model: The loaded vLLM model; must satisfy ``supports_lora``.
        vllm_config: Config forwarded to ``LRUCacheWorkerLoRAManager``.
        device: Device argument forwarded to ``LRUCacheWorkerLoRAManager``.

    Returns:
        ``(lora_manager, model)`` where ``model`` is the value returned by
        ``lora_manager.create_lora_manager``.

    Raises:
        ValueError: If ``model`` does not support LoRA.
    """
    if not supports_lora(model):
        raise ValueError(
            f"{model.__class__.__name__} does not support LoRA yet.")

    if supports_multimodal(model):
        logger.warning("Regarding multimodal models, vLLM currently "
                       "only supports adding LoRA to language model.")

    # Build the LoRA manager; the caller keeps using the model that
    # create_lora_manager returns.
    lora_manager = LRUCacheWorkerLoRAManager(
        vllm_config,
        device,
        model.embedding_modules,
        model.embedding_padding_modules,
    )
    return lora_manager, lora_manager.create_lora_manager(model)
260
+
261
+
262
# The methods are replaced because set_lora and reset_lora need to run under
# the torchax env.
def replace_set_lora(model):
    """Wrap ``set_lora`` / ``reset_lora`` on every LoRA layer of ``model``.

    Each ``BaseLayerWithLoRA`` submodule keeps its original bound methods as
    ``_original_set_lora`` / ``_original_reset_lora`` and gets replacements
    that call them inside ``torchax.default_env()``.

    Calling this more than once on the same model is safe: layers that were
    already patched are skipped, so the saved originals are never overwritten
    with the wrappers (which would make them call themselves forever).
    """

    def _tpu_set_lora(
        self,
        index: int,
        lora_a: torch.Tensor,
        lora_b: torch.Tensor,
        embeddings_tensor: Optional[torch.Tensor],
    ):
        with torchax.default_env():
            self._original_set_lora(index, lora_a, lora_b, embeddings_tensor)

    def _tpu_reset_lora(self, index: int):
        with torchax.default_env():
            self._original_reset_lora(index)

    for module in model.modules():
        if not isinstance(module, BaseLayerWithLoRA):
            continue
        if "_original_set_lora" in vars(module):
            # Already patched by an earlier call.
            continue
        module._original_set_lora = module.set_lora
        module._original_reset_lora = module.reset_lora
        # Bind the wrappers to this instance so they shadow the class methods.
        module.set_lora = _tpu_set_lora.__get__(module, module.__class__)
        module.reset_lora = _tpu_reset_lora.__get__(module, module.__class__)
@@ -0,0 +1,45 @@
1
+ from contextlib import contextmanager
2
+ from dataclasses import dataclass
3
+ from typing import Dict, List, Optional
4
+
5
+ import jax
6
+ from jax.sharding import Mesh
7
+
8
+
9
@dataclass
class VllmModelWrapperContext:
    """State made available to the wrapped vLLM model while it is being called."""

    # KV cache arrays; may be None (the logits-only path sets no caches).
    kv_caches: Optional[List[jax.Array]]
    # Device mesh the model runs on.
    mesh: Mesh
    # Layer name -> index into kv_caches; None when the caller supplies none.
    layer_name_to_kvcache_index: Optional[Dict[str, int]]
14
+
15
+
16
# The context currently in effect; None whenever no
# `set_vllm_model_wrapper_context` block is active.
_vllm_model_wrapper_context: Optional[VllmModelWrapperContext] = None


def get_vllm_model_wrapper_context() -> VllmModelWrapperContext:
    """Return the active context; asserts that one has been set."""
    active_context = _vllm_model_wrapper_context
    assert active_context is not None, (
        "VllmModelWrapperContext is not set. "
        "Please use `set_vllm_model_wrapper_context` to set the VllmModelWrapperContext."
    )
    return active_context
25
+
26
+
27
@contextmanager
def set_vllm_model_wrapper_context(
    *,
    kv_caches: Optional[List[jax.Array]],
    mesh: Mesh,
    layer_name_to_kvcache_index: Optional[Dict[str, int]] = None,
):
    """Install a ``VllmModelWrapperContext`` for the duration of the block.

    The previously active context (possibly ``None``) is saved on entry and
    restored on exit, including when the block raises, so nested uses unwind
    correctly.

    Args:
        kv_caches: KV cache arrays to expose; ``None`` is accepted.
        mesh: Device mesh to expose.
        layer_name_to_kvcache_index: Optional mapping from layer name to the
            index of its cache in ``kv_caches``.
    """
    global _vllm_model_wrapper_context
    prev_context = _vllm_model_wrapper_context
    _vllm_model_wrapper_context = VllmModelWrapperContext(
        kv_caches=kv_caches,
        mesh=mesh,
        layer_name_to_kvcache_index=layer_name_to_kvcache_index,
    )

    try:
        yield
    finally:
        _vllm_model_wrapper_context = prev_context
@@ -0,0 +1,2 @@
1
+ # ruff: noqa
2
+ from tpu_inference.platforms.tpu_platform import TpuPlatform
@@ -0,0 +1,269 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import os
4
+ from typing import TYPE_CHECKING, Any, Optional, Tuple, Union, cast
5
+
6
+ import jax.numpy as jnp
7
+ import vllm.envs as vllm_envs
8
+ from torchax.ops.mappings import j2t_dtype
9
+ from tpu_info import device
10
+ from vllm.inputs import ProcessorInputs, PromptType
11
+ from vllm.platforms.interface import Platform, PlatformEnum
12
+ from vllm.sampling_params import SamplingParams, SamplingType
13
+
14
+ from tpu_inference import envs
15
+ from tpu_inference.layers.common.sharding import ShardingConfigManager
16
+ from tpu_inference.logger import init_logger
17
+
18
+ if TYPE_CHECKING:
19
+ from vllm.attention.backends.registry import _Backend
20
+ from vllm.config import BlockSize, ModelConfig, VllmConfig
21
+ from vllm.pooling_params import PoolingParams
22
+ else:
23
+ BlockSize = None
24
+ ModelConfig = None
25
+ VllmConfig = None
26
+ PoolingParams = None
27
+ _Backend = None
28
+
29
+ logger = init_logger(__name__)
30
+
31
+ # Maps the string dtype names looked up by TpuPlatform.check_and_update_config
+ # to JAX dtypes. Names missing from this table fall back to jnp.bfloat16 at
+ # the lookup site.
+ _DTYPE: dict[str, jnp.dtype] = {
32
+ "bfloat16": jnp.bfloat16,
33
+ "float": jnp.float32,
34
+ "float32": jnp.float32,
35
+ }
36
+
37
+
38
class TpuPlatform(Platform):
    """vLLM ``Platform`` implementation for TPU devices served by tpu_inference."""

    _enum = PlatformEnum.TPU
    device_name: str = "tpu"
    device_type: str = "tpu"
    dispatch_key: str = "XLA"
    ray_device_key: str = "TPU"
    device_control_env_var: str = "TPU_VISIBLE_CHIPS"
    simple_compile_backend: str = "openxla"

    supported_quantization: list[str] = [
        "tpu_int8", "compressed-tensors", "awq", "fp8", "mxfp4"
    ]

    additional_env_vars: list[str] = [
        "PHASED_PROFILING_DIR", "TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS",
        "TPU_MULTIHOST_BACKEND", "VLLM_MLA_DISABLE", "TPU_BACKEND_TYPE"
    ]

    @classmethod
    def get_attn_backend_cls(cls, selected_backend: "_Backend", head_size: int,
                             dtype: jnp.dtype, kv_cache_dtype: Optional[str],
                             block_size: int, use_v1: bool, use_mla: bool,
                             has_sink: bool, use_sparse: bool,
                             attn_type: Any) -> str:
        """Return the dotted path of the attention backend class to use.

        Only ``use_v1`` affects the result; a non-Pallas ``selected_backend``
        is logged and otherwise ignored.
        """
        from vllm.attention.backends.registry import _Backend
        if selected_backend != _Backend.PALLAS:
            logger.info("Cannot use %s backend on TPU.", selected_backend)

        if use_v1:
            logger.info("Using Pallas V1 backend.")
            return "tpu_inference.layers.vllm.attention.PallasAttentionBackend"
        else:
            logger.info("Using Pallas backend.")
            return "vllm.attention.backends.pallas.PallasAttentionBackend"

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Return a human-readable chip name, or ``'TPU'`` if it cannot be found."""
        try:
            if vllm_envs.VLLM_TPU_USING_PATHWAYS:
                # Causes multiprocess accessing IFRT when calling jax.devices()
                return "TPU v6 lite"
            else:
                chip_type, _ = device.get_local_chips()
                return f"TPU {chip_type.name}"
        except Exception as e:
            # Best effort: any failure to query the chip degrades to a
            # generic name instead of aborting.
            logger.warning("Error getting device name: %s", e)
            return 'TPU'

    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """Not implemented for this platform."""
        raise NotImplementedError

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        return False

    @classmethod
    def get_punica_wrapper(cls) -> str:
        """Return the dotted path of the punica wrapper class used for LoRA."""
        return "tpu_inference.lora.torch_punica_tpu.PunicaWrapperTPU"

    @classmethod
    def get_infinity_values(cls, dtype: jnp.dtype) -> Tuple[float, float]:
        """Return ``(min, max)`` as reported by ``jnp.finfo`` for ``dtype``."""
        return jnp.finfo(dtype).min, jnp.finfo(dtype).max

    @classmethod
    def can_update_inplace(cls):
        return False

    @classmethod
    def get_lora_vocab_padding_size(cls) -> int:
        return 1

    @classmethod
    def inference_mode(cls):
        return True

    @classmethod
    def _initialize_sharding_config(cls, vllm_config: VllmConfig) -> None:
        """Derive the sharding config from ``vllm_config`` and attach it to it."""
        sharding_config = ShardingConfigManager.from_vllm_config(vllm_config)
        vllm_config.sharding_config = sharding_config
        logger.info("Initialized sharding configuration: %s", sharding_config)

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        """Adjust ``vllm_config`` in place for running on TPU.

        Sets up sharding, compilation mode/backend, the model dtype, the KV
        cache block size, the worker class and executor backend, the
        multimodal chunking flag, and finally applies the qwix-quantization
        and DP-scheduler config updates.
        """
        if vllm_envs.VLLM_TPU_USING_PATHWAYS:
            assert not vllm_envs.VLLM_ENABLE_V1_MULTIPROCESSING, (
                "VLLM_ENABLE_V1_MULTIPROCESSING must be 0 when using Pathways(JAX_PLATFORMS=proxy)"
            )
        cls._initialize_sharding_config(vllm_config)

        from vllm.config import CompilationMode

        cache_config = vllm_config.cache_config
        # For v0, the default block size is 16.
        if cache_config and cache_config.block_size is None:
            cache_config.block_size = cast(BlockSize, 16)
        compilation_config = vllm_config.compilation_config

        # TPU only supports DYNAMO_TRACE_ONCE compilation level
        # NOTE(xiang): the compilation_config is not used by jax.
        if compilation_config.mode != CompilationMode.DYNAMO_TRACE_ONCE:
            compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE

        if compilation_config.backend == "":
            compilation_config.backend = "openxla"

        # If we use vLLM's model implementation in PyTorch, we should set it
        # with the torch version of the dtype.
        impl = envs.MODEL_IMPL_TYPE

        # NOTE(xiang): convert dtype to jnp.dtype
        # NOTE(wenlong): skip this logic for mm model preprocessing
        # For mm model preprocessors, it may need the output dtype to be torch.
        # In order to avoid a PR to vLLM, we postpone the dtype checking during
        # tpu_worker initialization
        if not vllm_config.scheduler_config.is_multimodal_model or impl == "vllm":
            if not isinstance(vllm_config.model_config.dtype, str):
                logger.warning(
                    "The model dtype is not properly set for JAX backend. "
                    "Overwriting it to jnp.bfloat16")
                vllm_config.model_config.dtype = jnp.bfloat16
            else:
                vllm_config.model_config.dtype = _DTYPE.get(
                    vllm_config.model_config.dtype, jnp.bfloat16)

        if impl == "vllm":
            vllm_config.model_config.dtype = j2t_dtype(
                vllm_config.model_config.dtype.dtype)

        # TODO(cuiq): remove this dependency.
        from vllm.v1.attention.backends.pallas import PallasAttentionBackend
        cache_config.block_size = PallasAttentionBackend.get_page_size(
            vllm_config)  # type: ignore[assignment]
        min_page_size = PallasAttentionBackend.get_min_page_size(vllm_config)
        if min_page_size > cache_config.block_size:
            logger.warning(
                "Increase the page size from %s to %s to make sure there's "
                "no SMEM OOM",
                cache_config.block_size,
                min_page_size,
            )
            cache_config.block_size = min_page_size  # type: ignore[assignment]

        parallel_config = vllm_config.parallel_config
        scheduler_config = vllm_config.scheduler_config
        parallel_config.worker_cls = \
            "tpu_inference.worker.tpu_worker.TPUWorker"

        multihost_backend = os.environ.get("TPU_MULTIHOST_BACKEND", "").lower()
        if not multihost_backend:  # Single host
            if parallel_config.pipeline_parallel_size == 1:
                logger.info("Force using UniProcExecutor for JAX on "
                            "single host without pipeline parallelism.")
                parallel_config.distributed_executor_backend = "uni"
            else:
                logger.info("Force using MultiprocExecutor for JAX on "
                            "single host with pipeline parallelism.")
                parallel_config.distributed_executor_backend = "mp"
        elif multihost_backend == "ray":
            from tpu_inference.executors.ray_distributed_executor import \
                RayDistributedExecutor
            parallel_config.distributed_executor_backend = RayDistributedExecutor
            logger.info(
                "Force using RayDistributedExecutor for JAX on multihost.")
        else:
            logger.warning(
                "Unknown TPU multihost backend: %s. "
                "Using uniproc_executor.", multihost_backend)
            parallel_config.distributed_executor_backend = "uni"

        if scheduler_config.is_multimodal_model and not \
            scheduler_config.disable_chunked_mm_input:
            logger.warning("TPU does not support running Multimodal models"
                           " without setting `--disable_chunked_mm_input`. "
                           "Forcing --disable_chunked_mm_input.")
            scheduler_config.disable_chunked_mm_input = True

        kv_transfer_config = vllm_config.kv_transfer_config
        if kv_transfer_config is not None:
            assert kv_transfer_config.kv_connector == "TPUConnector"
        # Late initialization to avoid circular import
        from tpu_inference.models.jax.utils.quantization.quantization_utils import \
            update_vllm_config_for_qwix_quantization

        update_vllm_config_for_qwix_quantization(vllm_config)

        from tpu_inference.core.sched.dp_scheduler import \
            update_vllm_config_for_dp_scheduler
        update_vllm_config_for_dp_scheduler(vllm_config)

    @classmethod
    def is_pin_memory_available(cls):
        logger.warning("Pin memory is not supported on TPU.")
        return False

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        return "vllm.distributed.device_communicators.tpu_communicator.TpuCommunicator"  # noqa

    @classmethod
    def use_all_gather(cls) -> bool:
        return True

    @classmethod
    def supports_v1(cls, model_config: ModelConfig) -> bool:
        # V1 support on TPU is experimental
        return True

    @classmethod
    def validate_request(
        cls,
        prompt: PromptType,
        params: Union[SamplingParams, PoolingParams],
        processed_inputs: ProcessorInputs,
    ) -> None:
        """Raises if this request is unsupported on this platform"""

        if isinstance(params, SamplingParams):
            if params.sampling_type == SamplingType.RANDOM_SEED:
                raise ValueError("JAX does not support per-request seed.")

    @classmethod
    def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str,
                                    model_config: ModelConfig) -> bool:
        return True

    @classmethod
    def use_sync_weight_loader(cls) -> bool:
        """
        Returns if the current platform needs to sync weight loader.
        """
        return True
File without changes