vllm-cpu 0.12.0-cp313-cp313-manylinux_2_17_aarch64.whl

This diff shows the contents of publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
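
A listing like the one below can be reproduced locally, since a .whl file is an ordinary zip archive. The sketch that follows is an illustration only (it is not part of the registry's diff tooling, and the wheel path is a placeholder for a locally downloaded file); it uses Python's standard zipfile module to enumerate the files inside the wheel.

    # Minimal sketch: list the files packaged in a wheel, i.e. the raw data
    # behind a file-level diff such as the listing below.
    # The wheel filename is a placeholder; point it at a downloaded .whl.
    import zipfile

    wheel_path = "vllm_cpu-0.12.0-cp313-cp313-manylinux_2_17_aarch64.whl"

    with zipfile.ZipFile(wheel_path) as wheel:
        for info in wheel.infolist():
            # info.filename is the in-archive path (e.g. vllm/__init__.py);
            # info.file_size is the uncompressed size in bytes.
            print(f"{info.filename}  {info.file_size} bytes")

Comparing two such listings (one per package version) yields the per-file added/removed counts shown in the table below.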
Files changed (1600)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +107 -0
  3. vllm/_aiter_ops.py +1018 -0
  4. vllm/_bc_linter.py +54 -0
  5. vllm/_custom_ops.py +2925 -0
  6. vllm/_ipex_ops.py +457 -0
  7. vllm/_version.py +34 -0
  8. vllm/assets/__init__.py +0 -0
  9. vllm/assets/audio.py +43 -0
  10. vllm/assets/base.py +40 -0
  11. vllm/assets/image.py +59 -0
  12. vllm/assets/video.py +149 -0
  13. vllm/attention/__init__.py +0 -0
  14. vllm/attention/backends/__init__.py +0 -0
  15. vllm/attention/backends/abstract.py +434 -0
  16. vllm/attention/backends/registry.py +286 -0
  17. vllm/attention/backends/utils.py +33 -0
  18. vllm/attention/layer.py +975 -0
  19. vllm/attention/layers/__init__.py +0 -0
  20. vllm/attention/layers/chunked_local_attention.py +120 -0
  21. vllm/attention/layers/cross_attention.py +178 -0
  22. vllm/attention/layers/encoder_only_attention.py +103 -0
  23. vllm/attention/ops/__init__.py +0 -0
  24. vllm/attention/ops/chunked_prefill_paged_decode.py +401 -0
  25. vllm/attention/ops/common.py +469 -0
  26. vllm/attention/ops/flashmla.py +251 -0
  27. vllm/attention/ops/merge_attn_states.py +47 -0
  28. vllm/attention/ops/paged_attn.py +51 -0
  29. vllm/attention/ops/pallas_kv_cache_update.py +130 -0
  30. vllm/attention/ops/prefix_prefill.py +814 -0
  31. vllm/attention/ops/rocm_aiter_mla_sparse.py +210 -0
  32. vllm/attention/ops/triton_decode_attention.py +712 -0
  33. vllm/attention/ops/triton_merge_attn_states.py +116 -0
  34. vllm/attention/ops/triton_reshape_and_cache_flash.py +184 -0
  35. vllm/attention/ops/triton_unified_attention.py +941 -0
  36. vllm/attention/ops/vit_attn_wrappers.py +136 -0
  37. vllm/attention/selector.py +268 -0
  38. vllm/attention/utils/__init__.py +0 -0
  39. vllm/attention/utils/fa_utils.py +117 -0
  40. vllm/attention/utils/kv_sharing_utils.py +33 -0
  41. vllm/attention/utils/kv_transfer_utils.py +60 -0
  42. vllm/beam_search.py +88 -0
  43. vllm/benchmarks/__init__.py +0 -0
  44. vllm/benchmarks/datasets.py +3222 -0
  45. vllm/benchmarks/latency.py +172 -0
  46. vllm/benchmarks/lib/__init__.py +3 -0
  47. vllm/benchmarks/lib/endpoint_request_func.py +777 -0
  48. vllm/benchmarks/lib/ready_checker.py +72 -0
  49. vllm/benchmarks/lib/utils.py +79 -0
  50. vllm/benchmarks/serve.py +1531 -0
  51. vllm/benchmarks/sweep/__init__.py +0 -0
  52. vllm/benchmarks/sweep/cli.py +41 -0
  53. vllm/benchmarks/sweep/param_sweep.py +91 -0
  54. vllm/benchmarks/sweep/plot.py +580 -0
  55. vllm/benchmarks/sweep/plot_pareto.py +393 -0
  56. vllm/benchmarks/sweep/serve.py +448 -0
  57. vllm/benchmarks/sweep/serve_sla.py +492 -0
  58. vllm/benchmarks/sweep/server.py +114 -0
  59. vllm/benchmarks/sweep/sla_sweep.py +132 -0
  60. vllm/benchmarks/sweep/utils.py +4 -0
  61. vllm/benchmarks/throughput.py +799 -0
  62. vllm/collect_env.py +857 -0
  63. vllm/compilation/__init__.py +0 -0
  64. vllm/compilation/activation_quant_fusion.py +209 -0
  65. vllm/compilation/backends.py +827 -0
  66. vllm/compilation/base_static_graph.py +57 -0
  67. vllm/compilation/caching.py +180 -0
  68. vllm/compilation/collective_fusion.py +1234 -0
  69. vllm/compilation/compiler_interface.py +639 -0
  70. vllm/compilation/counter.py +48 -0
  71. vllm/compilation/cuda_graph.py +208 -0
  72. vllm/compilation/decorators.py +614 -0
  73. vllm/compilation/fix_functionalization.py +253 -0
  74. vllm/compilation/fusion.py +374 -0
  75. vllm/compilation/fusion_attn.py +359 -0
  76. vllm/compilation/fx_utils.py +91 -0
  77. vllm/compilation/inductor_pass.py +133 -0
  78. vllm/compilation/matcher_utils.py +315 -0
  79. vllm/compilation/monitor.py +62 -0
  80. vllm/compilation/noop_elimination.py +134 -0
  81. vllm/compilation/partition_rules.py +72 -0
  82. vllm/compilation/pass_manager.py +136 -0
  83. vllm/compilation/piecewise_backend.py +121 -0
  84. vllm/compilation/post_cleanup.py +21 -0
  85. vllm/compilation/qk_norm_rope_fusion.py +238 -0
  86. vllm/compilation/sequence_parallelism.py +363 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +44 -0
  88. vllm/compilation/vllm_inductor_pass.py +173 -0
  89. vllm/compilation/wrapper.py +260 -0
  90. vllm/config/__init__.py +102 -0
  91. vllm/config/cache.py +220 -0
  92. vllm/config/compilation.py +1154 -0
  93. vllm/config/device.py +75 -0
  94. vllm/config/ec_transfer.py +110 -0
  95. vllm/config/kv_events.py +56 -0
  96. vllm/config/kv_transfer.py +114 -0
  97. vllm/config/load.py +124 -0
  98. vllm/config/lora.py +96 -0
  99. vllm/config/model.py +2274 -0
  100. vllm/config/multimodal.py +247 -0
  101. vllm/config/observability.py +131 -0
  102. vllm/config/parallel.py +653 -0
  103. vllm/config/pooler.py +124 -0
  104. vllm/config/scheduler.py +297 -0
  105. vllm/config/speculative.py +643 -0
  106. vllm/config/speech_to_text.py +38 -0
  107. vllm/config/structured_outputs.py +94 -0
  108. vllm/config/utils.py +324 -0
  109. vllm/config/vllm.py +1353 -0
  110. vllm/connections.py +189 -0
  111. vllm/device_allocator/__init__.py +0 -0
  112. vllm/device_allocator/cumem.py +327 -0
  113. vllm/distributed/__init__.py +6 -0
  114. vllm/distributed/communication_op.py +43 -0
  115. vllm/distributed/device_communicators/__init__.py +0 -0
  116. vllm/distributed/device_communicators/all2all.py +490 -0
  117. vllm/distributed/device_communicators/all_reduce_utils.py +344 -0
  118. vllm/distributed/device_communicators/base_device_communicator.py +297 -0
  119. vllm/distributed/device_communicators/cpu_communicator.py +209 -0
  120. vllm/distributed/device_communicators/cuda_communicator.py +340 -0
  121. vllm/distributed/device_communicators/cuda_wrapper.py +216 -0
  122. vllm/distributed/device_communicators/custom_all_reduce.py +326 -0
  123. vllm/distributed/device_communicators/mnnvl_compat.py +27 -0
  124. vllm/distributed/device_communicators/pynccl.py +386 -0
  125. vllm/distributed/device_communicators/pynccl_allocator.py +191 -0
  126. vllm/distributed/device_communicators/pynccl_wrapper.py +564 -0
  127. vllm/distributed/device_communicators/quick_all_reduce.py +290 -0
  128. vllm/distributed/device_communicators/ray_communicator.py +259 -0
  129. vllm/distributed/device_communicators/shm_broadcast.py +733 -0
  130. vllm/distributed/device_communicators/shm_object_storage.py +697 -0
  131. vllm/distributed/device_communicators/symm_mem.py +156 -0
  132. vllm/distributed/device_communicators/tpu_communicator.py +99 -0
  133. vllm/distributed/device_communicators/xpu_communicator.py +95 -0
  134. vllm/distributed/ec_transfer/__init__.py +14 -0
  135. vllm/distributed/ec_transfer/ec_connector/__init__.py +0 -0
  136. vllm/distributed/ec_transfer/ec_connector/base.py +247 -0
  137. vllm/distributed/ec_transfer/ec_connector/factory.py +85 -0
  138. vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py +201 -0
  139. vllm/distributed/ec_transfer/ec_transfer_state.py +42 -0
  140. vllm/distributed/eplb/__init__.py +8 -0
  141. vllm/distributed/eplb/async_worker.py +115 -0
  142. vllm/distributed/eplb/eplb_state.py +1154 -0
  143. vllm/distributed/eplb/rebalance_algo.py +260 -0
  144. vllm/distributed/eplb/rebalance_execute.py +532 -0
  145. vllm/distributed/kv_events.py +371 -0
  146. vllm/distributed/kv_transfer/README.md +29 -0
  147. vllm/distributed/kv_transfer/__init__.py +20 -0
  148. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  149. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  150. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  151. vllm/distributed/kv_transfer/kv_connector/factory.py +192 -0
  152. vllm/distributed/kv_transfer/kv_connector/utils.py +268 -0
  153. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +19 -0
  154. vllm/distributed/kv_transfer/kv_connector/v1/base.py +575 -0
  155. vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py +419 -0
  156. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +216 -0
  157. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py +18 -0
  158. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +378 -0
  159. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py +221 -0
  160. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +1411 -0
  161. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +895 -0
  162. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +189 -0
  163. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +454 -0
  164. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +2480 -0
  165. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +538 -0
  166. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  167. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +531 -0
  168. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +632 -0
  169. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +273 -0
  170. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +450 -0
  171. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  172. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +179 -0
  173. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +164 -0
  174. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +242 -0
  175. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  176. vllm/distributed/kv_transfer/kv_pipe/base.py +66 -0
  177. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +295 -0
  178. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +285 -0
  179. vllm/distributed/kv_transfer/kv_transfer_state.py +78 -0
  180. vllm/distributed/parallel_state.py +1790 -0
  181. vllm/distributed/tpu_distributed_utils.py +188 -0
  182. vllm/distributed/utils.py +545 -0
  183. vllm/engine/__init__.py +0 -0
  184. vllm/engine/arg_utils.py +2106 -0
  185. vllm/engine/async_llm_engine.py +6 -0
  186. vllm/engine/llm_engine.py +6 -0
  187. vllm/engine/protocol.py +188 -0
  188. vllm/entrypoints/__init__.py +0 -0
  189. vllm/entrypoints/anthropic/__init__.py +0 -0
  190. vllm/entrypoints/anthropic/protocol.py +162 -0
  191. vllm/entrypoints/anthropic/serving_messages.py +460 -0
  192. vllm/entrypoints/api_server.py +184 -0
  193. vllm/entrypoints/chat_utils.py +1837 -0
  194. vllm/entrypoints/cli/__init__.py +13 -0
  195. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  196. vllm/entrypoints/cli/benchmark/base.py +25 -0
  197. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  198. vllm/entrypoints/cli/benchmark/main.py +56 -0
  199. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  200. vllm/entrypoints/cli/benchmark/sweep.py +21 -0
  201. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  202. vllm/entrypoints/cli/collect_env.py +38 -0
  203. vllm/entrypoints/cli/main.py +79 -0
  204. vllm/entrypoints/cli/openai.py +256 -0
  205. vllm/entrypoints/cli/run_batch.py +68 -0
  206. vllm/entrypoints/cli/serve.py +249 -0
  207. vllm/entrypoints/cli/types.py +29 -0
  208. vllm/entrypoints/constants.py +10 -0
  209. vllm/entrypoints/context.py +572 -0
  210. vllm/entrypoints/dynamic_lora.py +57 -0
  211. vllm/entrypoints/harmony_utils.py +535 -0
  212. vllm/entrypoints/launcher.py +175 -0
  213. vllm/entrypoints/llm.py +1762 -0
  214. vllm/entrypoints/logger.py +84 -0
  215. vllm/entrypoints/openai/__init__.py +0 -0
  216. vllm/entrypoints/openai/api_server.py +1891 -0
  217. vllm/entrypoints/openai/cli_args.py +302 -0
  218. vllm/entrypoints/openai/orca_metrics.py +120 -0
  219. vllm/entrypoints/openai/protocol.py +2465 -0
  220. vllm/entrypoints/openai/run_batch.py +631 -0
  221. vllm/entrypoints/openai/serving_chat.py +1782 -0
  222. vllm/entrypoints/openai/serving_completion.py +716 -0
  223. vllm/entrypoints/openai/serving_engine.py +1478 -0
  224. vllm/entrypoints/openai/serving_models.py +304 -0
  225. vllm/entrypoints/openai/serving_responses.py +2032 -0
  226. vllm/entrypoints/openai/serving_tokenization.py +203 -0
  227. vllm/entrypoints/openai/serving_tokens.py +281 -0
  228. vllm/entrypoints/openai/serving_transcription.py +168 -0
  229. vllm/entrypoints/openai/speech_to_text.py +559 -0
  230. vllm/entrypoints/openai/tool_parsers/__init__.py +142 -0
  231. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +273 -0
  232. vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py +390 -0
  233. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +390 -0
  234. vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py +210 -0
  235. vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +200 -0
  236. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +273 -0
  237. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +253 -0
  238. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +494 -0
  239. vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py +420 -0
  240. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +227 -0
  241. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +322 -0
  242. vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +590 -0
  243. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +341 -0
  244. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +324 -0
  245. vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +37 -0
  246. vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py +643 -0
  247. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +849 -0
  248. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +390 -0
  249. vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py +366 -0
  250. vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +97 -0
  251. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +120 -0
  252. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +332 -0
  253. vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +781 -0
  254. vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py +1316 -0
  255. vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +744 -0
  256. vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py +303 -0
  257. vllm/entrypoints/openai/tool_parsers/utils.py +229 -0
  258. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +556 -0
  259. vllm/entrypoints/openai/utils.py +49 -0
  260. vllm/entrypoints/pooling/__init__.py +16 -0
  261. vllm/entrypoints/pooling/classify/__init__.py +0 -0
  262. vllm/entrypoints/pooling/classify/api_router.py +50 -0
  263. vllm/entrypoints/pooling/classify/protocol.py +181 -0
  264. vllm/entrypoints/pooling/classify/serving.py +237 -0
  265. vllm/entrypoints/pooling/embed/__init__.py +0 -0
  266. vllm/entrypoints/pooling/embed/api_router.py +67 -0
  267. vllm/entrypoints/pooling/embed/protocol.py +208 -0
  268. vllm/entrypoints/pooling/embed/serving.py +697 -0
  269. vllm/entrypoints/pooling/pooling/__init__.py +0 -0
  270. vllm/entrypoints/pooling/pooling/api_router.py +63 -0
  271. vllm/entrypoints/pooling/pooling/protocol.py +148 -0
  272. vllm/entrypoints/pooling/pooling/serving.py +348 -0
  273. vllm/entrypoints/pooling/score/__init__.py +0 -0
  274. vllm/entrypoints/pooling/score/api_router.py +149 -0
  275. vllm/entrypoints/pooling/score/protocol.py +145 -0
  276. vllm/entrypoints/pooling/score/serving.py +505 -0
  277. vllm/entrypoints/renderer.py +409 -0
  278. vllm/entrypoints/responses_utils.py +148 -0
  279. vllm/entrypoints/sagemaker/__init__.py +4 -0
  280. vllm/entrypoints/sagemaker/routes.py +118 -0
  281. vllm/entrypoints/score_utils.py +240 -0
  282. vllm/entrypoints/ssl.py +78 -0
  283. vllm/entrypoints/tool.py +143 -0
  284. vllm/entrypoints/tool_server.py +234 -0
  285. vllm/entrypoints/utils.py +319 -0
  286. vllm/env_override.py +378 -0
  287. vllm/envs.py +1710 -0
  288. vllm/forward_context.py +358 -0
  289. vllm/inputs/__init__.py +44 -0
  290. vllm/inputs/data.py +359 -0
  291. vllm/inputs/parse.py +137 -0
  292. vllm/inputs/preprocess.py +716 -0
  293. vllm/logger.py +298 -0
  294. vllm/logging_utils/__init__.py +13 -0
  295. vllm/logging_utils/dump_input.py +83 -0
  296. vllm/logging_utils/formatter.py +127 -0
  297. vllm/logging_utils/lazy.py +20 -0
  298. vllm/logging_utils/log_time.py +34 -0
  299. vllm/logits_process.py +121 -0
  300. vllm/logprobs.py +206 -0
  301. vllm/lora/__init__.py +0 -0
  302. vllm/lora/layers/__init__.py +42 -0
  303. vllm/lora/layers/base.py +66 -0
  304. vllm/lora/layers/base_linear.py +165 -0
  305. vllm/lora/layers/column_parallel_linear.py +577 -0
  306. vllm/lora/layers/fused_moe.py +747 -0
  307. vllm/lora/layers/logits_processor.py +203 -0
  308. vllm/lora/layers/replicated_linear.py +70 -0
  309. vllm/lora/layers/row_parallel_linear.py +176 -0
  310. vllm/lora/layers/utils.py +74 -0
  311. vllm/lora/layers/vocal_parallel_embedding.py +140 -0
  312. vllm/lora/lora_weights.py +227 -0
  313. vllm/lora/models.py +903 -0
  314. vllm/lora/ops/__init__.py +0 -0
  315. vllm/lora/ops/ipex_ops/__init__.py +6 -0
  316. vllm/lora/ops/ipex_ops/lora_ops.py +57 -0
  317. vllm/lora/ops/torch_ops/__init__.py +20 -0
  318. vllm/lora/ops/torch_ops/lora_ops.py +128 -0
  319. vllm/lora/ops/triton_ops/README_TUNING.md +60 -0
  320. vllm/lora/ops/triton_ops/__init__.py +21 -0
  321. vllm/lora/ops/triton_ops/fused_moe_lora_op.py +661 -0
  322. vllm/lora/ops/triton_ops/kernel_utils.py +340 -0
  323. vllm/lora/ops/triton_ops/lora_expand_op.py +310 -0
  324. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +154 -0
  325. vllm/lora/ops/triton_ops/lora_shrink_op.py +287 -0
  326. vllm/lora/ops/triton_ops/utils.py +295 -0
  327. vllm/lora/ops/xla_ops/__init__.py +6 -0
  328. vllm/lora/ops/xla_ops/lora_ops.py +141 -0
  329. vllm/lora/peft_helper.py +128 -0
  330. vllm/lora/punica_wrapper/__init__.py +10 -0
  331. vllm/lora/punica_wrapper/punica_base.py +493 -0
  332. vllm/lora/punica_wrapper/punica_cpu.py +351 -0
  333. vllm/lora/punica_wrapper/punica_gpu.py +412 -0
  334. vllm/lora/punica_wrapper/punica_selector.py +21 -0
  335. vllm/lora/punica_wrapper/punica_tpu.py +358 -0
  336. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  337. vllm/lora/punica_wrapper/utils.py +150 -0
  338. vllm/lora/request.py +100 -0
  339. vllm/lora/resolver.py +88 -0
  340. vllm/lora/utils.py +306 -0
  341. vllm/lora/worker_manager.py +268 -0
  342. vllm/model_executor/__init__.py +11 -0
  343. vllm/model_executor/custom_op.py +194 -0
  344. vllm/model_executor/layers/__init__.py +0 -0
  345. vllm/model_executor/layers/activation.py +595 -0
  346. vllm/model_executor/layers/attention_layer_base.py +32 -0
  347. vllm/model_executor/layers/batch_invariant.py +1058 -0
  348. vllm/model_executor/layers/conv.py +256 -0
  349. vllm/model_executor/layers/fla/__init__.py +8 -0
  350. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  351. vllm/model_executor/layers/fla/ops/chunk.py +240 -0
  352. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +344 -0
  353. vllm/model_executor/layers/fla/ops/chunk_o.py +183 -0
  354. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +154 -0
  355. vllm/model_executor/layers/fla/ops/cumsum.py +280 -0
  356. vllm/model_executor/layers/fla/ops/fused_recurrent.py +390 -0
  357. vllm/model_executor/layers/fla/ops/index.py +41 -0
  358. vllm/model_executor/layers/fla/ops/kda.py +1351 -0
  359. vllm/model_executor/layers/fla/ops/l2norm.py +146 -0
  360. vllm/model_executor/layers/fla/ops/layernorm_guard.py +396 -0
  361. vllm/model_executor/layers/fla/ops/op.py +60 -0
  362. vllm/model_executor/layers/fla/ops/solve_tril.py +556 -0
  363. vllm/model_executor/layers/fla/ops/utils.py +194 -0
  364. vllm/model_executor/layers/fla/ops/wy_fast.py +158 -0
  365. vllm/model_executor/layers/fused_moe/__init__.py +110 -0
  366. vllm/model_executor/layers/fused_moe/all2all_utils.py +171 -0
  367. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +406 -0
  368. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +180 -0
  369. vllm/model_executor/layers/fused_moe/config.py +938 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json +147 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json +213 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json +147 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json +201 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +147 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json +164 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json +147 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json +147 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json +147 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  559. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  560. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  561. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  562. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  563. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  564. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  565. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  566. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  567. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  568. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  569. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  570. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  571. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  572. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  573. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  574. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  575. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  576. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  577. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  578. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  579. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  580. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  581. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  582. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  583. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  584. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  585. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  586. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  587. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  588. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  589. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  590. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  591. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  592. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  593. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  594. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  595. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  596. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  597. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  598. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  599. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  600. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  601. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  602. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  603. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  604. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  605. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  606. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  607. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  608. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  609. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  610. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  611. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  612. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  613. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  614. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  615. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  616. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  617. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  618. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  619. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  620. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  621. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  622. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  623. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  624. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  625. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  626. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  627. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  628. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  629. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  630. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  631. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  632. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  633. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  634. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  635. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  636. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  637. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  638. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  639. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  640. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  641. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  642. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  643. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  644. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  645. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +292 -0
  646. vllm/model_executor/layers/fused_moe/cutlass_moe.py +1052 -0
  647. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +387 -0
  648. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +416 -0
  649. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +420 -0
  650. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +434 -0
  651. vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py +376 -0
  652. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +307 -0
  653. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +362 -0
  654. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +192 -0
  655. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1012 -0
  656. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +821 -0
  657. vllm/model_executor/layers/fused_moe/fused_moe.py +2172 -0
  658. vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +121 -0
  659. vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +136 -0
  660. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +524 -0
  661. vllm/model_executor/layers/fused_moe/layer.py +2152 -0
  662. vllm/model_executor/layers/fused_moe/modular_kernel.py +1332 -0
  663. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +174 -0
  664. vllm/model_executor/layers/fused_moe/moe_pallas.py +83 -0
  665. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +229 -0
  666. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  667. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +362 -0
  668. vllm/model_executor/layers/fused_moe/prepare_finalize.py +78 -0
  669. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +265 -0
  670. vllm/model_executor/layers/fused_moe/routing_simulator.py +310 -0
  671. vllm/model_executor/layers/fused_moe/shared_fused_moe.py +96 -0
  672. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +171 -0
  673. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +163 -0
  674. vllm/model_executor/layers/fused_moe/trtllm_moe.py +143 -0
  675. vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +559 -0
  676. vllm/model_executor/layers/fused_moe/utils.py +332 -0
  677. vllm/model_executor/layers/kda.py +442 -0
  678. vllm/model_executor/layers/layernorm.py +442 -0
  679. vllm/model_executor/layers/lightning_attn.py +735 -0
  680. vllm/model_executor/layers/linear.py +1424 -0
  681. vllm/model_executor/layers/logits_processor.py +106 -0
  682. vllm/model_executor/layers/mamba/__init__.py +0 -0
  683. vllm/model_executor/layers/mamba/abstract.py +68 -0
  684. vllm/model_executor/layers/mamba/linear_attn.py +388 -0
  685. vllm/model_executor/layers/mamba/mamba_mixer.py +527 -0
  686. vllm/model_executor/layers/mamba/mamba_mixer2.py +930 -0
  687. vllm/model_executor/layers/mamba/mamba_utils.py +225 -0
  688. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  689. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1240 -0
  690. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +172 -0
  691. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +478 -0
  692. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +211 -0
  693. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +456 -0
  694. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +700 -0
  695. vllm/model_executor/layers/mamba/ops/ssd_combined.py +230 -0
  696. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +157 -0
  697. vllm/model_executor/layers/mamba/short_conv.py +255 -0
  698. vllm/model_executor/layers/mla.py +176 -0
  699. vllm/model_executor/layers/pooler.py +817 -0
  700. vllm/model_executor/layers/quantization/__init__.py +179 -0
  701. vllm/model_executor/layers/quantization/auto_round.py +454 -0
  702. vllm/model_executor/layers/quantization/awq.py +277 -0
  703. vllm/model_executor/layers/quantization/awq_marlin.py +718 -0
  704. vllm/model_executor/layers/quantization/awq_triton.py +337 -0
  705. vllm/model_executor/layers/quantization/base_config.py +170 -0
  706. vllm/model_executor/layers/quantization/bitblas.py +502 -0
  707. vllm/model_executor/layers/quantization/bitsandbytes.py +644 -0
  708. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +3 -0
  709. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +963 -0
  710. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2387 -0
  711. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +35 -0
  712. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +392 -0
  713. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  714. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +176 -0
  715. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +124 -0
  716. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +218 -0
  717. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +183 -0
  718. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +153 -0
  719. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +138 -0
  720. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +200 -0
  721. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +125 -0
  722. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +230 -0
  723. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  724. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +260 -0
  725. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +173 -0
  726. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  727. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +64 -0
  728. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  729. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +224 -0
  730. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  731. vllm/model_executor/layers/quantization/cpu_wna16.py +625 -0
  732. vllm/model_executor/layers/quantization/deepspeedfp.py +218 -0
  733. vllm/model_executor/layers/quantization/experts_int8.py +225 -0
  734. vllm/model_executor/layers/quantization/fbgemm_fp8.py +195 -0
  735. vllm/model_executor/layers/quantization/fp8.py +1348 -0
  736. vllm/model_executor/layers/quantization/fp_quant.py +420 -0
  737. vllm/model_executor/layers/quantization/gguf.py +687 -0
  738. vllm/model_executor/layers/quantization/gptq.py +393 -0
  739. vllm/model_executor/layers/quantization/gptq_bitblas.py +482 -0
  740. vllm/model_executor/layers/quantization/gptq_marlin.py +842 -0
  741. vllm/model_executor/layers/quantization/gptq_marlin_24.py +320 -0
  742. vllm/model_executor/layers/quantization/hqq_marlin.py +372 -0
  743. vllm/model_executor/layers/quantization/inc.py +65 -0
  744. vllm/model_executor/layers/quantization/input_quant_fp8.py +171 -0
  745. vllm/model_executor/layers/quantization/ipex_quant.py +470 -0
  746. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  747. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +94 -0
  748. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +105 -0
  749. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  750. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +323 -0
  751. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +98 -0
  752. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +119 -0
  753. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +111 -0
  754. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +161 -0
  755. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +159 -0
  756. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +200 -0
  757. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +73 -0
  758. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +97 -0
  759. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +120 -0
  760. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +219 -0
  761. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +140 -0
  762. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +42 -0
  763. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +105 -0
  764. vllm/model_executor/layers/quantization/kv_cache.py +146 -0
  765. vllm/model_executor/layers/quantization/modelopt.py +1637 -0
  766. vllm/model_executor/layers/quantization/moe_wna16.py +528 -0
  767. vllm/model_executor/layers/quantization/mxfp4.py +1175 -0
  768. vllm/model_executor/layers/quantization/petit.py +319 -0
  769. vllm/model_executor/layers/quantization/ptpc_fp8.py +136 -0
  770. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  771. vllm/model_executor/layers/quantization/quark/quark.py +527 -0
  772. vllm/model_executor/layers/quantization/quark/quark_moe.py +653 -0
  773. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  774. vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py +343 -0
  775. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  776. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +179 -0
  777. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +139 -0
  778. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  779. vllm/model_executor/layers/quantization/qutlass_utils.py +185 -0
  780. vllm/model_executor/layers/quantization/rtn.py +639 -0
  781. vllm/model_executor/layers/quantization/schema.py +90 -0
  782. vllm/model_executor/layers/quantization/torchao.py +380 -0
  783. vllm/model_executor/layers/quantization/tpu_int8.py +139 -0
  784. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  785. vllm/model_executor/layers/quantization/utils/allspark_utils.py +67 -0
  786. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +229 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  902. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  903. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  904. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  905. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  906. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  907. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  908. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  909. vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  910. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  911. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  912. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  913. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  914. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  915. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  916. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  917. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  918. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  919. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  920. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  921. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  922. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  923. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  924. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  925. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  926. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  927. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  928. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  929. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  930. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  931. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  932. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  933. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  934. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  935. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  936. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  937. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  938. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  939. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  940. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  941. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  942. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  943. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  944. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  945. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  946. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  947. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  948. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  949. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  950. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  951. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  952. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  953. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  954. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  955. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  956. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  957. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  958. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  959. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  960. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  961. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  962. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  963. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  964. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  965. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  966. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  967. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  968. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  969. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  970. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  971. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  972. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  973. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  974. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  975. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  976. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  977. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  978. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  979. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  980. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  981. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  982. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  983. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  984. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  985. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  986. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  987. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  988. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  989. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  990. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  991. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  992. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  993. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  994. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  995. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  996. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  997. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  998. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  999. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1000. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1001. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  1002. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +333 -0
  1003. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +311 -0
  1004. vllm/model_executor/layers/quantization/utils/fp8_utils.py +1203 -0
  1005. vllm/model_executor/layers/quantization/utils/gptq_utils.py +158 -0
  1006. vllm/model_executor/layers/quantization/utils/int8_utils.py +489 -0
  1007. vllm/model_executor/layers/quantization/utils/layer_utils.py +41 -0
  1008. vllm/model_executor/layers/quantization/utils/machete_utils.py +56 -0
  1009. vllm/model_executor/layers/quantization/utils/marlin_utils.py +674 -0
  1010. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +452 -0
  1011. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +378 -0
  1012. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +219 -0
  1013. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +467 -0
  1014. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +183 -0
  1015. vllm/model_executor/layers/quantization/utils/mxfp6_utils.py +142 -0
  1016. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +24 -0
  1017. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +142 -0
  1018. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +67 -0
  1019. vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py +51 -0
  1020. vllm/model_executor/layers/quantization/utils/petit_utils.py +124 -0
  1021. vllm/model_executor/layers/quantization/utils/quant_utils.py +687 -0
  1022. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +516 -0
  1023. vllm/model_executor/layers/resampler.py +283 -0
  1024. vllm/model_executor/layers/rotary_embedding/__init__.py +292 -0
  1025. vllm/model_executor/layers/rotary_embedding/base.py +240 -0
  1026. vllm/model_executor/layers/rotary_embedding/common.py +188 -0
  1027. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +165 -0
  1028. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +215 -0
  1029. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +43 -0
  1030. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +68 -0
  1031. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +75 -0
  1032. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  1033. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  1034. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +80 -0
  1035. vllm/model_executor/layers/rotary_embedding/mrope.py +397 -0
  1036. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +47 -0
  1037. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +159 -0
  1038. vllm/model_executor/layers/rotary_embedding/xdrope.py +102 -0
  1039. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +84 -0
  1040. vllm/model_executor/layers/utils.py +251 -0
  1041. vllm/model_executor/layers/vocab_parallel_embedding.py +558 -0
  1042. vllm/model_executor/model_loader/__init__.py +150 -0
  1043. vllm/model_executor/model_loader/base_loader.py +57 -0
  1044. vllm/model_executor/model_loader/bitsandbytes_loader.py +822 -0
  1045. vllm/model_executor/model_loader/default_loader.py +321 -0
  1046. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  1047. vllm/model_executor/model_loader/gguf_loader.py +349 -0
  1048. vllm/model_executor/model_loader/online_quantization.py +275 -0
  1049. vllm/model_executor/model_loader/runai_streamer_loader.py +116 -0
  1050. vllm/model_executor/model_loader/sharded_state_loader.py +214 -0
  1051. vllm/model_executor/model_loader/tensorizer.py +790 -0
  1052. vllm/model_executor/model_loader/tensorizer_loader.py +151 -0
  1053. vllm/model_executor/model_loader/tpu.py +118 -0
  1054. vllm/model_executor/model_loader/utils.py +296 -0
  1055. vllm/model_executor/model_loader/weight_utils.py +1147 -0
  1056. vllm/model_executor/models/__init__.py +44 -0
  1057. vllm/model_executor/models/adapters.py +543 -0
  1058. vllm/model_executor/models/afmoe.py +697 -0
  1059. vllm/model_executor/models/aimv2.py +248 -0
  1060. vllm/model_executor/models/apertus.py +569 -0
  1061. vllm/model_executor/models/arcee.py +428 -0
  1062. vllm/model_executor/models/arctic.py +634 -0
  1063. vllm/model_executor/models/aria.py +655 -0
  1064. vllm/model_executor/models/aya_vision.py +450 -0
  1065. vllm/model_executor/models/baichuan.py +494 -0
  1066. vllm/model_executor/models/bailing_moe.py +645 -0
  1067. vllm/model_executor/models/bamba.py +516 -0
  1068. vllm/model_executor/models/bee.py +157 -0
  1069. vllm/model_executor/models/bert.py +925 -0
  1070. vllm/model_executor/models/bert_with_rope.py +732 -0
  1071. vllm/model_executor/models/blip.py +350 -0
  1072. vllm/model_executor/models/blip2.py +695 -0
  1073. vllm/model_executor/models/bloom.py +390 -0
  1074. vllm/model_executor/models/chameleon.py +1098 -0
  1075. vllm/model_executor/models/chatglm.py +499 -0
  1076. vllm/model_executor/models/clip.py +1005 -0
  1077. vllm/model_executor/models/cohere2_vision.py +472 -0
  1078. vllm/model_executor/models/commandr.py +470 -0
  1079. vllm/model_executor/models/config.py +510 -0
  1080. vllm/model_executor/models/dbrx.py +485 -0
  1081. vllm/model_executor/models/deepencoder.py +676 -0
  1082. vllm/model_executor/models/deepseek_eagle.py +252 -0
  1083. vllm/model_executor/models/deepseek_mtp.py +446 -0
  1084. vllm/model_executor/models/deepseek_ocr.py +593 -0
  1085. vllm/model_executor/models/deepseek_v2.py +1715 -0
  1086. vllm/model_executor/models/deepseek_vl2.py +644 -0
  1087. vllm/model_executor/models/dots1.py +566 -0
  1088. vllm/model_executor/models/dots_ocr.py +874 -0
  1089. vllm/model_executor/models/ernie45.py +53 -0
  1090. vllm/model_executor/models/ernie45_moe.py +755 -0
  1091. vllm/model_executor/models/ernie45_vl.py +1710 -0
  1092. vllm/model_executor/models/ernie45_vl_moe.py +800 -0
  1093. vllm/model_executor/models/ernie_mtp.py +279 -0
  1094. vllm/model_executor/models/exaone.py +525 -0
  1095. vllm/model_executor/models/exaone4.py +517 -0
  1096. vllm/model_executor/models/fairseq2_llama.py +154 -0
  1097. vllm/model_executor/models/falcon.py +544 -0
  1098. vllm/model_executor/models/falcon_h1.py +680 -0
  1099. vllm/model_executor/models/flex_olmo.py +155 -0
  1100. vllm/model_executor/models/fuyu.py +373 -0
  1101. vllm/model_executor/models/gemma.py +426 -0
  1102. vllm/model_executor/models/gemma2.py +436 -0
  1103. vllm/model_executor/models/gemma3.py +577 -0
  1104. vllm/model_executor/models/gemma3_mm.py +665 -0
  1105. vllm/model_executor/models/gemma3n.py +1167 -0
  1106. vllm/model_executor/models/gemma3n_mm.py +811 -0
  1107. vllm/model_executor/models/glm.py +23 -0
  1108. vllm/model_executor/models/glm4.py +298 -0
  1109. vllm/model_executor/models/glm4_1v.py +1854 -0
  1110. vllm/model_executor/models/glm4_moe.py +738 -0
  1111. vllm/model_executor/models/glm4_moe_mtp.py +359 -0
  1112. vllm/model_executor/models/glm4v.py +785 -0
  1113. vllm/model_executor/models/gpt2.py +397 -0
  1114. vllm/model_executor/models/gpt_bigcode.py +339 -0
  1115. vllm/model_executor/models/gpt_j.py +345 -0
  1116. vllm/model_executor/models/gpt_neox.py +343 -0
  1117. vllm/model_executor/models/gpt_oss.py +745 -0
  1118. vllm/model_executor/models/granite.py +476 -0
  1119. vllm/model_executor/models/granite_speech.py +913 -0
  1120. vllm/model_executor/models/granitemoe.py +561 -0
  1121. vllm/model_executor/models/granitemoehybrid.py +704 -0
  1122. vllm/model_executor/models/granitemoeshared.py +328 -0
  1123. vllm/model_executor/models/gritlm.py +245 -0
  1124. vllm/model_executor/models/grok1.py +555 -0
  1125. vllm/model_executor/models/h2ovl.py +554 -0
  1126. vllm/model_executor/models/hunyuan_v1.py +1042 -0
  1127. vllm/model_executor/models/hunyuan_vision.py +1028 -0
  1128. vllm/model_executor/models/hyperclovax_vision.py +1166 -0
  1129. vllm/model_executor/models/idefics2_vision_model.py +427 -0
  1130. vllm/model_executor/models/idefics3.py +718 -0
  1131. vllm/model_executor/models/interfaces.py +1148 -0
  1132. vllm/model_executor/models/interfaces_base.py +243 -0
  1133. vllm/model_executor/models/intern_vit.py +454 -0
  1134. vllm/model_executor/models/internlm2.py +454 -0
  1135. vllm/model_executor/models/internlm2_ve.py +139 -0
  1136. vllm/model_executor/models/interns1.py +830 -0
  1137. vllm/model_executor/models/interns1_vit.py +433 -0
  1138. vllm/model_executor/models/internvl.py +1452 -0
  1139. vllm/model_executor/models/jais.py +397 -0
  1140. vllm/model_executor/models/jamba.py +609 -0
  1141. vllm/model_executor/models/jina_vl.py +147 -0
  1142. vllm/model_executor/models/keye.py +1765 -0
  1143. vllm/model_executor/models/keye_vl1_5.py +726 -0
  1144. vllm/model_executor/models/kimi_linear.py +658 -0
  1145. vllm/model_executor/models/kimi_vl.py +578 -0
  1146. vllm/model_executor/models/lfm2.py +516 -0
  1147. vllm/model_executor/models/lfm2_moe.py +746 -0
  1148. vllm/model_executor/models/lightonocr.py +195 -0
  1149. vllm/model_executor/models/llama.py +704 -0
  1150. vllm/model_executor/models/llama4.py +857 -0
  1151. vllm/model_executor/models/llama4_eagle.py +216 -0
  1152. vllm/model_executor/models/llama_eagle.py +213 -0
  1153. vllm/model_executor/models/llama_eagle3.py +375 -0
  1154. vllm/model_executor/models/llava.py +842 -0
  1155. vllm/model_executor/models/llava_next.py +583 -0
  1156. vllm/model_executor/models/llava_next_video.py +467 -0
  1157. vllm/model_executor/models/llava_onevision.py +923 -0
  1158. vllm/model_executor/models/longcat_flash.py +743 -0
  1159. vllm/model_executor/models/longcat_flash_mtp.py +349 -0
  1160. vllm/model_executor/models/mamba.py +276 -0
  1161. vllm/model_executor/models/mamba2.py +288 -0
  1162. vllm/model_executor/models/medusa.py +179 -0
  1163. vllm/model_executor/models/midashenglm.py +828 -0
  1164. vllm/model_executor/models/mimo.py +188 -0
  1165. vllm/model_executor/models/mimo_mtp.py +294 -0
  1166. vllm/model_executor/models/minicpm.py +657 -0
  1167. vllm/model_executor/models/minicpm3.py +234 -0
  1168. vllm/model_executor/models/minicpm_eagle.py +385 -0
  1169. vllm/model_executor/models/minicpmo.py +768 -0
  1170. vllm/model_executor/models/minicpmv.py +1744 -0
  1171. vllm/model_executor/models/minimax_m2.py +546 -0
  1172. vllm/model_executor/models/minimax_text_01.py +1010 -0
  1173. vllm/model_executor/models/minimax_vl_01.py +396 -0
  1174. vllm/model_executor/models/mistral3.py +637 -0
  1175. vllm/model_executor/models/mistral_large_3.py +63 -0
  1176. vllm/model_executor/models/mistral_large_3_eagle.py +165 -0
  1177. vllm/model_executor/models/mixtral.py +599 -0
  1178. vllm/model_executor/models/mllama4.py +1151 -0
  1179. vllm/model_executor/models/mlp_speculator.py +235 -0
  1180. vllm/model_executor/models/modernbert.py +452 -0
  1181. vllm/model_executor/models/module_mapping.py +74 -0
  1182. vllm/model_executor/models/molmo.py +1553 -0
  1183. vllm/model_executor/models/moonvit.py +686 -0
  1184. vllm/model_executor/models/mpt.py +335 -0
  1185. vllm/model_executor/models/nano_nemotron_vl.py +1732 -0
  1186. vllm/model_executor/models/nemotron.py +502 -0
  1187. vllm/model_executor/models/nemotron_h.py +850 -0
  1188. vllm/model_executor/models/nemotron_nas.py +473 -0
  1189. vllm/model_executor/models/nemotron_vl.py +653 -0
  1190. vllm/model_executor/models/nvlm_d.py +216 -0
  1191. vllm/model_executor/models/olmo.py +413 -0
  1192. vllm/model_executor/models/olmo2.py +455 -0
  1193. vllm/model_executor/models/olmoe.py +494 -0
  1194. vllm/model_executor/models/opencua.py +271 -0
  1195. vllm/model_executor/models/openpangu.py +1051 -0
  1196. vllm/model_executor/models/openpangu_mtp.py +265 -0
  1197. vllm/model_executor/models/opt.py +426 -0
  1198. vllm/model_executor/models/orion.py +366 -0
  1199. vllm/model_executor/models/ouro.py +508 -0
  1200. vllm/model_executor/models/ovis.py +559 -0
  1201. vllm/model_executor/models/ovis2_5.py +673 -0
  1202. vllm/model_executor/models/paddleocr_vl.py +1380 -0
  1203. vllm/model_executor/models/paligemma.py +412 -0
  1204. vllm/model_executor/models/persimmon.py +376 -0
  1205. vllm/model_executor/models/phi.py +370 -0
  1206. vllm/model_executor/models/phi3.py +18 -0
  1207. vllm/model_executor/models/phi3v.py +737 -0
  1208. vllm/model_executor/models/phi4_multimodal.py +1447 -0
  1209. vllm/model_executor/models/phi4mm.py +1253 -0
  1210. vllm/model_executor/models/phi4mm_audio.py +1296 -0
  1211. vllm/model_executor/models/phi4mm_utils.py +1907 -0
  1212. vllm/model_executor/models/phimoe.py +670 -0
  1213. vllm/model_executor/models/pixtral.py +1380 -0
  1214. vllm/model_executor/models/plamo2.py +966 -0
  1215. vllm/model_executor/models/plamo3.py +441 -0
  1216. vllm/model_executor/models/qwen.py +363 -0
  1217. vllm/model_executor/models/qwen2.py +569 -0
  1218. vllm/model_executor/models/qwen2_5_omni_thinker.py +1220 -0
  1219. vllm/model_executor/models/qwen2_5_vl.py +1594 -0
  1220. vllm/model_executor/models/qwen2_audio.py +473 -0
  1221. vllm/model_executor/models/qwen2_moe.py +590 -0
  1222. vllm/model_executor/models/qwen2_rm.py +123 -0
  1223. vllm/model_executor/models/qwen2_vl.py +1593 -0
  1224. vllm/model_executor/models/qwen3.py +332 -0
  1225. vllm/model_executor/models/qwen3_moe.py +738 -0
  1226. vllm/model_executor/models/qwen3_next.py +1390 -0
  1227. vllm/model_executor/models/qwen3_next_mtp.py +296 -0
  1228. vllm/model_executor/models/qwen3_omni_moe_thinker.py +1765 -0
  1229. vllm/model_executor/models/qwen3_vl.py +1686 -0
  1230. vllm/model_executor/models/qwen3_vl_moe.py +470 -0
  1231. vllm/model_executor/models/qwen_vl.py +803 -0
  1232. vllm/model_executor/models/radio.py +555 -0
  1233. vllm/model_executor/models/registry.py +1183 -0
  1234. vllm/model_executor/models/roberta.py +259 -0
  1235. vllm/model_executor/models/rvl.py +107 -0
  1236. vllm/model_executor/models/seed_oss.py +493 -0
  1237. vllm/model_executor/models/siglip.py +1245 -0
  1238. vllm/model_executor/models/siglip2navit.py +723 -0
  1239. vllm/model_executor/models/skyworkr1v.py +953 -0
  1240. vllm/model_executor/models/smolvlm.py +38 -0
  1241. vllm/model_executor/models/solar.py +485 -0
  1242. vllm/model_executor/models/stablelm.py +359 -0
  1243. vllm/model_executor/models/starcoder2.py +366 -0
  1244. vllm/model_executor/models/step3_text.py +555 -0
  1245. vllm/model_executor/models/step3_vl.py +1149 -0
  1246. vllm/model_executor/models/swin.py +514 -0
  1247. vllm/model_executor/models/tarsier.py +619 -0
  1248. vllm/model_executor/models/telechat2.py +153 -0
  1249. vllm/model_executor/models/teleflm.py +78 -0
  1250. vllm/model_executor/models/terratorch.py +319 -0
  1251. vllm/model_executor/models/transformers/__init__.py +127 -0
  1252. vllm/model_executor/models/transformers/base.py +464 -0
  1253. vllm/model_executor/models/transformers/causal.py +65 -0
  1254. vllm/model_executor/models/transformers/legacy.py +90 -0
  1255. vllm/model_executor/models/transformers/moe.py +325 -0
  1256. vllm/model_executor/models/transformers/multimodal.py +411 -0
  1257. vllm/model_executor/models/transformers/pooling.py +119 -0
  1258. vllm/model_executor/models/transformers/utils.py +213 -0
  1259. vllm/model_executor/models/ultravox.py +686 -0
  1260. vllm/model_executor/models/utils.py +832 -0
  1261. vllm/model_executor/models/vision.py +552 -0
  1262. vllm/model_executor/models/voxtral.py +842 -0
  1263. vllm/model_executor/models/whisper.py +963 -0
  1264. vllm/model_executor/models/zamba2.py +980 -0
  1265. vllm/model_executor/parameter.py +642 -0
  1266. vllm/model_executor/utils.py +94 -0
  1267. vllm/model_executor/warmup/__init__.py +0 -0
  1268. vllm/model_executor/warmup/deep_gemm_warmup.py +314 -0
  1269. vllm/model_executor/warmup/kernel_warmup.py +98 -0
  1270. vllm/multimodal/__init__.py +40 -0
  1271. vllm/multimodal/audio.py +142 -0
  1272. vllm/multimodal/base.py +26 -0
  1273. vllm/multimodal/cache.py +830 -0
  1274. vllm/multimodal/evs.py +294 -0
  1275. vllm/multimodal/hasher.py +106 -0
  1276. vllm/multimodal/image.py +130 -0
  1277. vllm/multimodal/inputs.py +1036 -0
  1278. vllm/multimodal/parse.py +544 -0
  1279. vllm/multimodal/processing.py +2240 -0
  1280. vllm/multimodal/profiling.py +369 -0
  1281. vllm/multimodal/registry.py +357 -0
  1282. vllm/multimodal/utils.py +523 -0
  1283. vllm/multimodal/video.py +333 -0
  1284. vllm/outputs.py +345 -0
  1285. vllm/platforms/__init__.py +277 -0
  1286. vllm/platforms/cpu.py +410 -0
  1287. vllm/platforms/cuda.py +642 -0
  1288. vllm/platforms/interface.py +656 -0
  1289. vllm/platforms/rocm.py +513 -0
  1290. vllm/platforms/tpu.py +275 -0
  1291. vllm/platforms/xpu.py +261 -0
  1292. vllm/plugins/__init__.py +81 -0
  1293. vllm/plugins/io_processors/__init__.py +68 -0
  1294. vllm/plugins/io_processors/interface.py +77 -0
  1295. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1296. vllm/plugins/lora_resolvers/filesystem_resolver.py +52 -0
  1297. vllm/pooling_params.py +230 -0
  1298. vllm/profiler/__init__.py +0 -0
  1299. vllm/profiler/gpu_profiler.py +216 -0
  1300. vllm/profiler/layerwise_profile.py +392 -0
  1301. vllm/profiler/utils.py +151 -0
  1302. vllm/py.typed +2 -0
  1303. vllm/ray/__init__.py +0 -0
  1304. vllm/ray/lazy_utils.py +30 -0
  1305. vllm/ray/ray_env.py +79 -0
  1306. vllm/reasoning/__init__.py +92 -0
  1307. vllm/reasoning/abs_reasoning_parsers.py +290 -0
  1308. vllm/reasoning/basic_parsers.py +162 -0
  1309. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1310. vllm/reasoning/deepseek_v3_reasoning_parser.py +62 -0
  1311. vllm/reasoning/ernie45_reasoning_parser.py +165 -0
  1312. vllm/reasoning/glm4_moe_reasoning_parser.py +171 -0
  1313. vllm/reasoning/gptoss_reasoning_parser.py +173 -0
  1314. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1315. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +237 -0
  1316. vllm/reasoning/identity_reasoning_parser.py +58 -0
  1317. vllm/reasoning/minimax_m2_reasoning_parser.py +67 -0
  1318. vllm/reasoning/mistral_reasoning_parser.py +55 -0
  1319. vllm/reasoning/olmo3_reasoning_parser.py +302 -0
  1320. vllm/reasoning/qwen3_reasoning_parser.py +67 -0
  1321. vllm/reasoning/seedoss_reasoning_parser.py +27 -0
  1322. vllm/reasoning/step3_reasoning_parser.py +107 -0
  1323. vllm/sampling_params.py +597 -0
  1324. vllm/scalar_type.py +355 -0
  1325. vllm/scripts.py +17 -0
  1326. vllm/sequence.py +98 -0
  1327. vllm/tasks.py +13 -0
  1328. vllm/third_party/__init__.py +0 -0
  1329. vllm/third_party/pynvml.py +6140 -0
  1330. vllm/tokenizers/__init__.py +24 -0
  1331. vllm/tokenizers/detokenizer_utils.py +198 -0
  1332. vllm/tokenizers/hf.py +124 -0
  1333. vllm/tokenizers/mistral.py +554 -0
  1334. vllm/tokenizers/protocol.py +111 -0
  1335. vllm/tokenizers/registry.py +233 -0
  1336. vllm/tracing.py +135 -0
  1337. vllm/transformers_utils/__init__.py +26 -0
  1338. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1339. vllm/transformers_utils/chat_templates/registry.py +73 -0
  1340. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1341. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1342. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1343. vllm/transformers_utils/chat_templates/template_deepseek_ocr.jinja +14 -0
  1344. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1345. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1346. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1347. vllm/transformers_utils/config.py +1081 -0
  1348. vllm/transformers_utils/config_parser_base.py +20 -0
  1349. vllm/transformers_utils/configs/__init__.py +84 -0
  1350. vllm/transformers_utils/configs/afmoe.py +87 -0
  1351. vllm/transformers_utils/configs/arctic.py +216 -0
  1352. vllm/transformers_utils/configs/chatglm.py +75 -0
  1353. vllm/transformers_utils/configs/deepseek_vl2.py +126 -0
  1354. vllm/transformers_utils/configs/dotsocr.py +71 -0
  1355. vllm/transformers_utils/configs/eagle.py +90 -0
  1356. vllm/transformers_utils/configs/falcon.py +89 -0
  1357. vllm/transformers_utils/configs/flex_olmo.py +82 -0
  1358. vllm/transformers_utils/configs/hunyuan_vl.py +322 -0
  1359. vllm/transformers_utils/configs/jais.py +243 -0
  1360. vllm/transformers_utils/configs/kimi_linear.py +148 -0
  1361. vllm/transformers_utils/configs/kimi_vl.py +38 -0
  1362. vllm/transformers_utils/configs/lfm2_moe.py +163 -0
  1363. vllm/transformers_utils/configs/medusa.py +65 -0
  1364. vllm/transformers_utils/configs/midashenglm.py +103 -0
  1365. vllm/transformers_utils/configs/mistral.py +235 -0
  1366. vllm/transformers_utils/configs/mlp_speculator.py +69 -0
  1367. vllm/transformers_utils/configs/moonvit.py +33 -0
  1368. vllm/transformers_utils/configs/nemotron.py +214 -0
  1369. vllm/transformers_utils/configs/nemotron_h.py +282 -0
  1370. vllm/transformers_utils/configs/olmo3.py +83 -0
  1371. vllm/transformers_utils/configs/ovis.py +182 -0
  1372. vllm/transformers_utils/configs/qwen3_next.py +275 -0
  1373. vllm/transformers_utils/configs/radio.py +89 -0
  1374. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1375. vllm/transformers_utils/configs/speculators/algos.py +38 -0
  1376. vllm/transformers_utils/configs/speculators/base.py +114 -0
  1377. vllm/transformers_utils/configs/step3_vl.py +178 -0
  1378. vllm/transformers_utils/configs/ultravox.py +118 -0
  1379. vllm/transformers_utils/dynamic_module.py +59 -0
  1380. vllm/transformers_utils/gguf_utils.py +209 -0
  1381. vllm/transformers_utils/processor.py +423 -0
  1382. vllm/transformers_utils/processors/__init__.py +23 -0
  1383. vllm/transformers_utils/processors/deepseek_ocr.py +438 -0
  1384. vllm/transformers_utils/processors/deepseek_vl2.py +406 -0
  1385. vllm/transformers_utils/processors/hunyuan_vl.py +233 -0
  1386. vllm/transformers_utils/processors/hunyuan_vl_image.py +477 -0
  1387. vllm/transformers_utils/processors/ovis.py +453 -0
  1388. vllm/transformers_utils/processors/ovis2_5.py +468 -0
  1389. vllm/transformers_utils/repo_utils.py +287 -0
  1390. vllm/transformers_utils/runai_utils.py +104 -0
  1391. vllm/transformers_utils/s3_utils.py +95 -0
  1392. vllm/transformers_utils/tokenizer.py +127 -0
  1393. vllm/transformers_utils/tokenizer_base.py +33 -0
  1394. vllm/transformers_utils/utils.py +184 -0
  1395. vllm/triton_utils/__init__.py +20 -0
  1396. vllm/triton_utils/importing.py +103 -0
  1397. vllm/usage/__init__.py +0 -0
  1398. vllm/usage/usage_lib.py +294 -0
  1399. vllm/utils/__init__.py +66 -0
  1400. vllm/utils/argparse_utils.py +504 -0
  1401. vllm/utils/async_utils.py +310 -0
  1402. vllm/utils/cache.py +214 -0
  1403. vllm/utils/collection_utils.py +112 -0
  1404. vllm/utils/counter.py +45 -0
  1405. vllm/utils/deep_gemm.py +399 -0
  1406. vllm/utils/flashinfer.py +532 -0
  1407. vllm/utils/func_utils.py +236 -0
  1408. vllm/utils/gc_utils.py +151 -0
  1409. vllm/utils/hashing.py +81 -0
  1410. vllm/utils/import_utils.py +449 -0
  1411. vllm/utils/jsontree.py +158 -0
  1412. vllm/utils/math_utils.py +32 -0
  1413. vllm/utils/mem_constants.py +13 -0
  1414. vllm/utils/mem_utils.py +232 -0
  1415. vllm/utils/nccl.py +64 -0
  1416. vllm/utils/network_utils.py +331 -0
  1417. vllm/utils/platform_utils.py +59 -0
  1418. vllm/utils/profiling.py +56 -0
  1419. vllm/utils/registry.py +51 -0
  1420. vllm/utils/serial_utils.py +169 -0
  1421. vllm/utils/system_utils.py +265 -0
  1422. vllm/utils/tensor_schema.py +255 -0
  1423. vllm/utils/torch_utils.py +647 -0
  1424. vllm/v1/__init__.py +0 -0
  1425. vllm/v1/attention/__init__.py +0 -0
  1426. vllm/v1/attention/backends/__init__.py +0 -0
  1427. vllm/v1/attention/backends/cpu_attn.py +497 -0
  1428. vllm/v1/attention/backends/flash_attn.py +1050 -0
  1429. vllm/v1/attention/backends/flashinfer.py +1572 -0
  1430. vllm/v1/attention/backends/flex_attention.py +945 -0
  1431. vllm/v1/attention/backends/gdn_attn.py +387 -0
  1432. vllm/v1/attention/backends/linear_attn.py +77 -0
  1433. vllm/v1/attention/backends/mamba1_attn.py +165 -0
  1434. vllm/v1/attention/backends/mamba2_attn.py +354 -0
  1435. vllm/v1/attention/backends/mamba_attn.py +117 -0
  1436. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1437. vllm/v1/attention/backends/mla/aiter_triton_mla.py +74 -0
  1438. vllm/v1/attention/backends/mla/common.py +2069 -0
  1439. vllm/v1/attention/backends/mla/cutlass_mla.py +278 -0
  1440. vllm/v1/attention/backends/mla/flashattn_mla.py +340 -0
  1441. vllm/v1/attention/backends/mla/flashinfer_mla.py +174 -0
  1442. vllm/v1/attention/backends/mla/flashmla.py +317 -0
  1443. vllm/v1/attention/backends/mla/flashmla_sparse.py +551 -0
  1444. vllm/v1/attention/backends/mla/indexer.py +369 -0
  1445. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +275 -0
  1446. vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py +325 -0
  1447. vllm/v1/attention/backends/mla/triton_mla.py +171 -0
  1448. vllm/v1/attention/backends/pallas.py +436 -0
  1449. vllm/v1/attention/backends/rocm_aiter_fa.py +1000 -0
  1450. vllm/v1/attention/backends/rocm_aiter_unified_attn.py +206 -0
  1451. vllm/v1/attention/backends/rocm_attn.py +359 -0
  1452. vllm/v1/attention/backends/short_conv_attn.py +105 -0
  1453. vllm/v1/attention/backends/tree_attn.py +428 -0
  1454. vllm/v1/attention/backends/triton_attn.py +377 -0
  1455. vllm/v1/attention/backends/utils.py +1149 -0
  1456. vllm/v1/core/__init__.py +0 -0
  1457. vllm/v1/core/block_pool.py +466 -0
  1458. vllm/v1/core/encoder_cache_manager.py +343 -0
  1459. vllm/v1/core/kv_cache_coordinator.py +570 -0
  1460. vllm/v1/core/kv_cache_manager.py +408 -0
  1461. vllm/v1/core/kv_cache_metrics.py +96 -0
  1462. vllm/v1/core/kv_cache_utils.py +1471 -0
  1463. vllm/v1/core/sched/__init__.py +0 -0
  1464. vllm/v1/core/sched/async_scheduler.py +68 -0
  1465. vllm/v1/core/sched/interface.py +187 -0
  1466. vllm/v1/core/sched/output.py +230 -0
  1467. vllm/v1/core/sched/request_queue.py +217 -0
  1468. vllm/v1/core/sched/scheduler.py +1726 -0
  1469. vllm/v1/core/sched/utils.py +72 -0
  1470. vllm/v1/core/single_type_kv_cache_manager.py +801 -0
  1471. vllm/v1/cudagraph_dispatcher.py +183 -0
  1472. vllm/v1/engine/__init__.py +214 -0
  1473. vllm/v1/engine/async_llm.py +874 -0
  1474. vllm/v1/engine/coordinator.py +377 -0
  1475. vllm/v1/engine/core.py +1421 -0
  1476. vllm/v1/engine/core_client.py +1406 -0
  1477. vllm/v1/engine/detokenizer.py +351 -0
  1478. vllm/v1/engine/exceptions.py +18 -0
  1479. vllm/v1/engine/input_processor.py +636 -0
  1480. vllm/v1/engine/llm_engine.py +416 -0
  1481. vllm/v1/engine/logprobs.py +189 -0
  1482. vllm/v1/engine/output_processor.py +658 -0
  1483. vllm/v1/engine/parallel_sampling.py +145 -0
  1484. vllm/v1/engine/processor.py +20 -0
  1485. vllm/v1/engine/utils.py +1068 -0
  1486. vllm/v1/executor/__init__.py +6 -0
  1487. vllm/v1/executor/abstract.py +352 -0
  1488. vllm/v1/executor/multiproc_executor.py +888 -0
  1489. vllm/v1/executor/ray_distributed_executor.py +8 -0
  1490. vllm/v1/executor/ray_executor.py +626 -0
  1491. vllm/v1/executor/ray_utils.py +465 -0
  1492. vllm/v1/executor/uniproc_executor.py +183 -0
  1493. vllm/v1/kv_cache_interface.py +404 -0
  1494. vllm/v1/kv_offload/__init__.py +0 -0
  1495. vllm/v1/kv_offload/abstract.py +161 -0
  1496. vllm/v1/kv_offload/arc_manager.py +237 -0
  1497. vllm/v1/kv_offload/backend.py +97 -0
  1498. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1499. vllm/v1/kv_offload/backends/cpu.py +62 -0
  1500. vllm/v1/kv_offload/cpu.py +86 -0
  1501. vllm/v1/kv_offload/factory.py +56 -0
  1502. vllm/v1/kv_offload/lru_manager.py +139 -0
  1503. vllm/v1/kv_offload/mediums.py +39 -0
  1504. vllm/v1/kv_offload/spec.py +66 -0
  1505. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1506. vllm/v1/kv_offload/worker/cpu_gpu.py +191 -0
  1507. vllm/v1/kv_offload/worker/worker.py +144 -0
  1508. vllm/v1/metrics/__init__.py +0 -0
  1509. vllm/v1/metrics/loggers.py +1268 -0
  1510. vllm/v1/metrics/prometheus.py +82 -0
  1511. vllm/v1/metrics/ray_wrappers.py +194 -0
  1512. vllm/v1/metrics/reader.py +257 -0
  1513. vllm/v1/metrics/stats.py +431 -0
  1514. vllm/v1/outputs.py +237 -0
  1515. vllm/v1/pool/__init__.py +0 -0
  1516. vllm/v1/pool/metadata.py +82 -0
  1517. vllm/v1/request.py +280 -0
  1518. vllm/v1/sample/__init__.py +0 -0
  1519. vllm/v1/sample/logits_processor/__init__.py +352 -0
  1520. vllm/v1/sample/logits_processor/builtin.py +278 -0
  1521. vllm/v1/sample/logits_processor/interface.py +106 -0
  1522. vllm/v1/sample/logits_processor/state.py +165 -0
  1523. vllm/v1/sample/metadata.py +44 -0
  1524. vllm/v1/sample/ops/__init__.py +0 -0
  1525. vllm/v1/sample/ops/bad_words.py +52 -0
  1526. vllm/v1/sample/ops/logprobs.py +25 -0
  1527. vllm/v1/sample/ops/penalties.py +57 -0
  1528. vllm/v1/sample/ops/topk_topp_sampler.py +384 -0
  1529. vllm/v1/sample/rejection_sampler.py +805 -0
  1530. vllm/v1/sample/sampler.py +319 -0
  1531. vllm/v1/sample/tpu/__init__.py +0 -0
  1532. vllm/v1/sample/tpu/metadata.py +120 -0
  1533. vllm/v1/sample/tpu/sampler.py +215 -0
  1534. vllm/v1/serial_utils.py +532 -0
  1535. vllm/v1/spec_decode/__init__.py +0 -0
  1536. vllm/v1/spec_decode/eagle.py +1325 -0
  1537. vllm/v1/spec_decode/medusa.py +73 -0
  1538. vllm/v1/spec_decode/metadata.py +66 -0
  1539. vllm/v1/spec_decode/metrics.py +225 -0
  1540. vllm/v1/spec_decode/ngram_proposer.py +291 -0
  1541. vllm/v1/spec_decode/suffix_decoding.py +101 -0
  1542. vllm/v1/spec_decode/utils.py +121 -0
  1543. vllm/v1/structured_output/__init__.py +338 -0
  1544. vllm/v1/structured_output/backend_guidance.py +265 -0
  1545. vllm/v1/structured_output/backend_lm_format_enforcer.py +177 -0
  1546. vllm/v1/structured_output/backend_outlines.py +324 -0
  1547. vllm/v1/structured_output/backend_types.py +136 -0
  1548. vllm/v1/structured_output/backend_xgrammar.py +362 -0
  1549. vllm/v1/structured_output/request.py +94 -0
  1550. vllm/v1/structured_output/utils.py +469 -0
  1551. vllm/v1/utils.py +414 -0
  1552. vllm/v1/worker/__init__.py +0 -0
  1553. vllm/v1/worker/block_table.py +343 -0
  1554. vllm/v1/worker/cpu_model_runner.py +122 -0
  1555. vllm/v1/worker/cpu_worker.py +210 -0
  1556. vllm/v1/worker/dp_utils.py +250 -0
  1557. vllm/v1/worker/ec_connector_model_runner_mixin.py +87 -0
  1558. vllm/v1/worker/gpu/README.md +4 -0
  1559. vllm/v1/worker/gpu/__init__.py +0 -0
  1560. vllm/v1/worker/gpu/async_utils.py +97 -0
  1561. vllm/v1/worker/gpu/attn_utils.py +189 -0
  1562. vllm/v1/worker/gpu/block_table.py +314 -0
  1563. vllm/v1/worker/gpu/cudagraph_utils.py +259 -0
  1564. vllm/v1/worker/gpu/dp_utils.py +31 -0
  1565. vllm/v1/worker/gpu/input_batch.py +430 -0
  1566. vllm/v1/worker/gpu/model_runner.py +1007 -0
  1567. vllm/v1/worker/gpu/sample/__init__.py +0 -0
  1568. vllm/v1/worker/gpu/sample/gumbel.py +101 -0
  1569. vllm/v1/worker/gpu/sample/logprob.py +167 -0
  1570. vllm/v1/worker/gpu/sample/metadata.py +179 -0
  1571. vllm/v1/worker/gpu/sample/penalties.py +154 -0
  1572. vllm/v1/worker/gpu/sample/sampler.py +75 -0
  1573. vllm/v1/worker/gpu/spec_decode/__init__.py +18 -0
  1574. vllm/v1/worker/gpu/spec_decode/eagle.py +565 -0
  1575. vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py +115 -0
  1576. vllm/v1/worker/gpu/spec_decode/rejection_sample.py +83 -0
  1577. vllm/v1/worker/gpu/states.py +309 -0
  1578. vllm/v1/worker/gpu/structured_outputs.py +76 -0
  1579. vllm/v1/worker/gpu_input_batch.py +971 -0
  1580. vllm/v1/worker/gpu_model_runner.py +5360 -0
  1581. vllm/v1/worker/gpu_ubatch_wrapper.py +472 -0
  1582. vllm/v1/worker/gpu_worker.py +922 -0
  1583. vllm/v1/worker/kv_connector_model_runner_mixin.py +309 -0
  1584. vllm/v1/worker/lora_model_runner_mixin.py +212 -0
  1585. vllm/v1/worker/tpu_input_batch.py +583 -0
  1586. vllm/v1/worker/tpu_model_runner.py +2196 -0
  1587. vllm/v1/worker/tpu_worker.py +351 -0
  1588. vllm/v1/worker/ubatch_utils.py +73 -0
  1589. vllm/v1/worker/ubatching.py +231 -0
  1590. vllm/v1/worker/utils.py +365 -0
  1591. vllm/v1/worker/worker_base.py +377 -0
  1592. vllm/v1/worker/xpu_model_runner.py +48 -0
  1593. vllm/v1/worker/xpu_worker.py +198 -0
  1594. vllm/version.py +39 -0
  1595. vllm/vllm_flash_attn/.gitkeep +0 -0
  1596. vllm_cpu-0.12.0.dist-info/METADATA +300 -0
  1597. vllm_cpu-0.12.0.dist-info/RECORD +1600 -0
  1598. vllm_cpu-0.12.0.dist-info/WHEEL +5 -0
  1599. vllm_cpu-0.12.0.dist-info/entry_points.txt +5 -0
  1600. vllm_cpu-0.12.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2480 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+ import contextlib
4
+ import copy
5
+ import logging
6
+ import math
7
+ import queue
8
+ import threading
9
+ import time
10
+ import uuid
11
+ from collections import defaultdict
12
+ from collections.abc import Iterator
13
+ from concurrent.futures import Future, ThreadPoolExecutor
14
+ from dataclasses import dataclass
15
+ from typing import TYPE_CHECKING, Any, Optional
16
+
17
+ import msgspec
18
+ import numpy as np
19
+ import torch
20
+ import zmq
21
+
22
+ from vllm import envs
23
+ from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata
24
+ from vllm.attention.backends.registry import AttentionBackendEnum
25
+ from vllm.attention.selector import get_attn_backend
26
+ from vllm.config import VllmConfig
27
+ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
28
+ CopyBlocksOp,
29
+ KVConnectorBase_V1,
30
+ KVConnectorHandshakeMetadata,
31
+ KVConnectorMetadata,
32
+ KVConnectorRole,
33
+ )
34
+ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
35
+ KVConnectorPromMetrics,
36
+ KVConnectorStats,
37
+ PromMetric,
38
+ PromMetricT,
39
+ )
40
+ from vllm.distributed.parallel_state import (
41
+ get_tensor_model_parallel_rank,
42
+ get_tensor_model_parallel_world_size,
43
+ get_tp_group,
44
+ )
45
+ from vllm.forward_context import ForwardContext
46
+ from vllm.logger import init_logger
47
+ from vllm.platforms import current_platform
48
+ from vllm.utils.network_utils import make_zmq_path, make_zmq_socket
49
+ from vllm.v1.attention.backends.utils import get_kv_cache_layout
50
+ from vllm.v1.core.sched.output import SchedulerOutput
51
+ from vllm.v1.worker.block_table import BlockTable
52
+
53
+ if TYPE_CHECKING:
54
+ from vllm.v1.core.kv_cache_manager import KVCacheBlocks
55
+ from vllm.v1.kv_cache_interface import KVCacheConfig
56
+ from vllm.v1.request import Request
57
+
58
+ Transfer = tuple[int, float] # (xfer_handle, start_time)
59
+ EngineId = str
60
+ ReqId = str
61
+
62
+ GET_META_MSG = b"get_meta_msg"
63
+
64
+ logger = init_logger(__name__)
65
+
66
+ # Lazy import nixl_wrapper to avoid loading nixl_bindings if nixl is not used
67
+ try:
68
+ from nixl._api import nixl_agent as NixlWrapper
69
+ from nixl._bindings import nixlXferTelemetry
70
+
71
+ logger.info("NIXL is available")
72
+ except ImportError:
73
+ logger.warning("NIXL is not available")
74
+ NixlWrapper = None
75
+ nixlXferTelemetry = None
76
+
77
+
78
+ try:
79
+ from nixl._api import nixl_agent_config
80
+ except ImportError:
81
+ nixl_agent_config = None
82
+ logger.warning("NIXL agent config is not available")
83
+
84
+ # Supported platforms and types of kv transfer buffer.
85
+ # {device: tuple of supported kv buffer types}
86
+ _NIXL_SUPPORTED_DEVICE = {
87
+ "cuda": (
88
+ "cuda",
89
+ "cpu",
90
+ ),
91
+ "tpu": ("cpu",),
92
+ "xpu": ("cpu",),
93
+ "cpu": ("cpu",),
94
+ }
95
+ # Support for out-of-tree (OOT) platforms, which provide their own mapping via current_platform.
96
+ _NIXL_SUPPORTED_DEVICE.update(current_platform.get_nixl_supported_devices())
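+ # Illustrative note (not part of the original source): on a "tpu" or "xpu"
+ # host only kv_buffer_device="cpu" is listed above, so any other buffer type
+ # is rejected later in NixlConnectorWorker.__init__ with a RuntimeError,
+ # while "cuda" hosts may use either "cuda" or "cpu" buffers.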
97
+
98
+
99
+ @dataclass
100
+ class NixlAgentMetadata(KVConnectorHandshakeMetadata):
101
+ engine_id: str
102
+ agent_metadata: bytes
103
+ kv_caches_base_addr: list[int]
104
+ device_id: int
105
+ num_blocks: int
106
+ block_lens: list[int]
107
+ attn_backend_name: str
108
+ kv_cache_layout: str
109
+ block_size: int
110
+
111
+
112
+ @dataclass
113
+ class ReqMeta:
114
+ local_block_ids: list[int]
115
+ # To be used when logical block size does not match the kernel block size
116
+ local_physical_block_ids: list[int]
117
+ remote_block_ids: list[int]
118
+ remote_host: str
119
+ remote_port: int
120
+ remote_engine_id: str
121
+ tp_size: int
122
+
123
+
124
+ class NixlConnectorMetadata(KVConnectorMetadata):
125
+ def __init__(self):
126
+ self.reqs_to_recv: dict[ReqId, ReqMeta] = {}
127
+ self.reqs_to_save: dict[ReqId, ReqMeta] = {}
128
+ self.reqs_to_send: dict[ReqId, float] = {}
129
+ self.reqs_in_batch: set[ReqId] = set()
130
+ self.reqs_not_processed: set[ReqId] = set()
131
+
132
+ def add_new_req(
133
+ self,
134
+ request_id: ReqId,
135
+ local_block_ids: list[int],
136
+ kv_transfer_params: dict[str, Any],
137
+ load_remote_cache: bool = True,
138
+ save_to_host: bool = False,
139
+ ):
140
+ # save and load are mutually exclusive
141
+ assert load_remote_cache ^ save_to_host
142
+ _req = ReqMeta(
143
+ local_block_ids=local_block_ids,
144
+ local_physical_block_ids=local_block_ids,
145
+ remote_block_ids=kv_transfer_params["remote_block_ids"],
146
+ remote_engine_id=kv_transfer_params["remote_engine_id"],
147
+ remote_host=kv_transfer_params["remote_host"],
148
+ remote_port=kv_transfer_params["remote_port"],
149
+ # P workers don't need to receive tp_size from proxy here.
150
+ tp_size=kv_transfer_params.get("tp_size", 1),
151
+ )
152
+ if save_to_host:
153
+ self.reqs_to_save[request_id] = _req
154
+ if load_remote_cache:
155
+ self.reqs_to_recv[request_id] = _req
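+ # Illustrative usage (not from the original source): the scheduler side calls
+ # meta.add_new_req(req_id, local_block_ids, kv_transfer_params,
+ # load_remote_cache=True, save_to_host=False) for requests pulling KV from a
+ # remote prefill worker, and flips the two flags for requests that only need
+ # their prefilled blocks staged into the host buffer.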
156
+
157
+
158
+ class NixlConnector(KVConnectorBase_V1):
159
+ def __init__(
160
+ self,
161
+ vllm_config: VllmConfig,
162
+ role: KVConnectorRole,
163
+ kv_cache_config: Optional["KVCacheConfig"] = None,
164
+ ):
165
+ super().__init__(vllm_config, role, kv_cache_config)
166
+
167
+ assert vllm_config.kv_transfer_config is not None
168
+ assert vllm_config.kv_transfer_config.engine_id is not None
169
+ self.engine_id: EngineId = vllm_config.kv_transfer_config.engine_id
170
+
171
+ if role == KVConnectorRole.SCHEDULER:
172
+ self.connector_scheduler: NixlConnectorScheduler | None = (
173
+ NixlConnectorScheduler(vllm_config, self.engine_id)
174
+ )
175
+ self.connector_worker: NixlConnectorWorker | None = None
176
+ elif role == KVConnectorRole.WORKER:
177
+ self.connector_scheduler = None
178
+ self.connector_worker = NixlConnectorWorker(vllm_config, self.engine_id)
179
+
180
+ ############################################################
181
+ # Class Methods
182
+ ############################################################
183
+ @classmethod
184
+ def get_required_kvcache_layout(cls, vllm_config: VllmConfig):
185
+ if vllm_config.model_config is None:
186
+ logger.warning_once(
187
+ "Unable to detect current VLLM config. "
188
+ "Fallback to default kv cache layout."
189
+ )
190
+ return None
191
+ use_mla = vllm_config.model_config.use_mla
192
+ if use_mla:
193
+ # Return None when using MLA,
194
+ # as the layout should not matter in that case;
195
+ # this falls back to the default behavior.
196
+ return None
197
+ logger.info_once(
198
+ "NixlConnector setting KV cache layout to HND for better xfer performance."
199
+ )
200
+ return "HND"
201
+
202
+ ############################################################
203
+ # Scheduler Side Methods
204
+ ############################################################
205
+
206
+ def get_num_new_matched_tokens(
207
+ self, request: "Request", num_computed_tokens: int
208
+ ) -> tuple[int | None, bool]:
209
+ assert self.connector_scheduler is not None
210
+ return self.connector_scheduler.get_num_new_matched_tokens(
211
+ request, num_computed_tokens
212
+ )
213
+
214
+ def update_state_after_alloc(
215
+ self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
216
+ ):
217
+ assert self.connector_scheduler is not None
218
+ return self.connector_scheduler.update_state_after_alloc(
219
+ request, blocks, num_external_tokens
220
+ )
221
+
222
+ def build_connector_meta(
223
+ self,
224
+ scheduler_output: SchedulerOutput,
225
+ ) -> KVConnectorMetadata:
226
+ assert self.connector_scheduler is not None
227
+ return self.connector_scheduler.build_connector_meta(scheduler_output)
228
+
229
+ def request_finished(
230
+ self,
231
+ request: "Request",
232
+ block_ids: list[int],
233
+ ) -> tuple[bool, dict[str, Any] | None]:
234
+ assert self.connector_scheduler is not None
235
+ return self.connector_scheduler.request_finished(request, block_ids)
236
+
237
+ def set_xfer_handshake_metadata(
238
+ self, metadata: dict[int, KVConnectorHandshakeMetadata]
239
+ ) -> None:
240
+ """
241
+ Set the KV connector handshake metadata for this connector.
242
+
243
+ Args:
244
+ metadata (dict): the handshake metadata to set.
245
+ """
246
+ assert self.connector_scheduler is not None
247
+ self.connector_scheduler.set_xfer_handshake_metadata(metadata)
248
+
249
+ ############################################################
250
+ # Worker Side Methods
251
+ ############################################################
252
+ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
253
+ assert self.connector_worker is not None
254
+ self.connector_worker.register_kv_caches(kv_caches)
255
+
256
+ def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp):
257
+ assert self.connector_worker is not None
258
+ self.connector_worker.set_host_xfer_buffer_ops(copy_operation)
259
+
260
+ def get_finished(self, finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
261
+ """Get the finished recving and sending requests."""
262
+ assert self.connector_worker is not None
263
+ return self.connector_worker.get_finished()
264
+
265
+ def get_block_ids_with_load_errors(self) -> set[int]:
266
+ """Get block IDs that failed to load via NIXL."""
267
+ assert self.connector_worker is not None
268
+ return self.connector_worker.get_block_ids_with_load_errors()
269
+
270
+ def get_kv_connector_stats(self) -> KVConnectorStats | None:
271
+ if self.connector_worker is None:
272
+ return None
273
+ return self.connector_worker.get_kv_connector_stats()
274
+
275
+ @classmethod
276
+ def build_kv_connector_stats(
277
+ cls, data: dict[str, Any] | None = None
278
+ ) -> KVConnectorStats | None:
279
+ return (
280
+ NixlKVConnectorStats(data=data)
281
+ if data is not None
282
+ else NixlKVConnectorStats()
283
+ )
284
+
285
+ @classmethod
286
+ def build_prom_metrics(
287
+ cls,
288
+ vllm_config: VllmConfig,
289
+ metric_types: dict[type[PromMetric], type[PromMetricT]],
290
+ labelnames: list[str],
291
+ per_engine_labelvalues: dict[int, list[object]],
292
+ ) -> KVConnectorPromMetrics:
293
+ return NixlPromMetrics(
294
+ vllm_config, metric_types, labelnames, per_engine_labelvalues
295
+ )
296
+
297
+ def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
298
+ assert self.connector_worker is not None
299
+ assert isinstance(self._connector_metadata, NixlConnectorMetadata)
300
+ self.connector_worker.start_load_kv(self._connector_metadata)
301
+
302
+ def wait_for_layer_load(self, layer_name: str) -> None:
303
+ """NixlConnector does not do layerwise saving."""
304
+ pass
305
+
306
+ def save_kv_layer(
307
+ self,
308
+ layer_name: str,
309
+ kv_layer: torch.Tensor,
310
+ attn_metadata: AttentionMetadata,
311
+ **kwargs,
312
+ ) -> None:
313
+ """NixlConnector does not save explicitly."""
314
+ pass
315
+
316
+ def wait_for_save(self):
317
+ assert self.connector_worker is not None
318
+ assert isinstance(self._connector_metadata, NixlConnectorMetadata)
319
+ if self.connector_worker.use_host_buffer and self.connector_worker.copy_blocks:
320
+ self.connector_worker.save_kv_to_host(self._connector_metadata)
321
+
322
+ def shutdown(self):
323
+ if self.connector_worker is not None:
324
+ self.connector_worker.shutdown()
325
+ if self.connector_scheduler is not None:
326
+ self.connector_scheduler.shutdown()
327
+
328
+ def get_handshake_metadata(self) -> KVConnectorHandshakeMetadata | None:
329
+ """
330
+ Get the KVConnector handshake metadata for this connector.
331
+ This metadata is used for out-of-band connector handshake
332
+ between P/D workers.
333
+
334
+ Returns:
335
+ KVConnectorHandshakeMetadata: the handshake metadata.
336
+ None if no handshake metadata is available.
337
+ """
338
+ assert self.connector_worker is not None
339
+ return self.connector_worker.xfer_handshake_metadata
340
+
341
+
342
+ class NixlConnectorScheduler:
343
+ """Implementation of Scheduler side methods"""
344
+
345
+ def __init__(self, vllm_config: VllmConfig, engine_id: str):
346
+ self.vllm_config = vllm_config
347
+ self.block_size = vllm_config.cache_config.block_size
348
+ self.engine_id: EngineId = engine_id
349
+ self.side_channel_host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
350
+ self.side_channel_port = (
351
+ envs.VLLM_NIXL_SIDE_CHANNEL_PORT
352
+ + vllm_config.parallel_config.data_parallel_rank
353
+ )
354
+ assert vllm_config.kv_transfer_config is not None
355
+ if current_platform.device_type == "cpu":
356
+ self.use_host_buffer = False
357
+ else:
358
+ self.use_host_buffer = (
359
+ vllm_config.kv_transfer_config.kv_buffer_device == "cpu"
360
+ )
361
+
362
+ logger.info("Initializing NIXL Scheduler %s", engine_id)
363
+
364
+ # Background thread for handling new handshake requests.
365
+ self._nixl_handshake_listener_t: threading.Thread | None = None
366
+ self._encoded_xfer_handshake_metadata: dict[int, Any] = {}
367
+ self._stop_event = threading.Event()
368
+
369
+ # Requests that need to start recv/send.
370
+ # New requests are added by update_state_after_alloc in
371
+ # the scheduler. Used to make metadata passed to Worker.
372
+ self._reqs_need_recv: dict[ReqId, tuple[Request, list[int]]] = {}
373
+ self._reqs_need_save: dict[ReqId, tuple[Request, list[int]]] = {}
374
+ # Reqs to send and their expiration time
375
+ self._reqs_need_send: dict[ReqId, float] = {}
376
+ self._reqs_in_batch: set[ReqId] = set()
377
+ # Reqs to remove from the processed set because they will not be sent
378
+ # after remote prefill, or because they were aborted.
379
+ self._reqs_not_processed: set[ReqId] = set()
380
+
381
+ def shutdown(self):
382
+ self._stop_event.set()
383
+ if self._nixl_handshake_listener_t is not None:
384
+ self._nixl_handshake_listener_t.join()
385
+ self._nixl_handshake_listener_t = None
386
+
387
+ def set_xfer_handshake_metadata(
388
+ self, metadata: dict[int, KVConnectorHandshakeMetadata]
389
+ ) -> None:
390
+ """
391
+ Set the KV connector handshake metadata for this connector.
392
+
393
+ Args:
394
+ metadata (dict): the handshake metadata to set.
395
+ """
396
+ encoded_data: dict[int, bytes] = {}
397
+ encoder = msgspec.msgpack.Encoder()
398
+ for tp_rank, rank_metadata in metadata.items():
399
+ if not isinstance(rank_metadata, NixlAgentMetadata):
400
+ raise ValueError(
401
+ "NixlConnectorScheduler expects NixlAgentMetadata for "
402
+ "handshake metadata."
403
+ )
404
+ encoded_data[tp_rank] = encoder.encode(rank_metadata)
405
+ logger.debug(
406
+ "Tp rank %d: encoded NixlAgentMetadata size: %s bytes",
407
+ tp_rank,
408
+ str(len(encoded_data[tp_rank])),
409
+ )
410
+ self._encoded_xfer_handshake_metadata = encoded_data
411
+
412
+ # Only start the listener when we have metadata to serve.
413
+ if self._nixl_handshake_listener_t is None:
414
+ ready_event = threading.Event()
415
+ self._nixl_handshake_listener_t = threading.Thread(
416
+ target=self._nixl_handshake_listener,
417
+ args=(
418
+ encoded_data,
419
+ ready_event,
420
+ self._stop_event,
421
+ self.side_channel_port,
422
+ ),
423
+ daemon=True,
424
+ name="nixl_handshake_listener",
425
+ )
426
+ self._nixl_handshake_listener_t.start()
427
+ ready_event.wait() # Wait for listener ZMQ socket to be ready.
428
+
429
+ @staticmethod
430
+ def _nixl_handshake_listener(
431
+ encoded_data: dict[int, Any],
432
+ ready_event: threading.Event,
433
+ stop_event: threading.Event,
434
+ port: int,
435
+ ):
436
+ """Background thread for getting new NIXL handshakes."""
437
+ # NOTE(rob): this is a simple implementation. We will move
438
+ # to a better approach via HTTP endpoint soon.
439
+
440
+ # Listen for new requests for metadata.
441
+ host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
442
+ path = make_zmq_path("tcp", host, port)
443
+ logger.debug("Starting listening on path: %s", path)
444
+ with zmq_ctx(zmq.ROUTER, path) as sock:
445
+ sock.setsockopt(zmq.RCVTIMEO, 1000)
446
+ ready_event.set()
447
+ while True:
448
+ try:
449
+ identity, _, msg = sock.recv_multipart()
450
+ except zmq.Again:
451
+ if stop_event.is_set():
452
+ break
453
+ continue
454
+ # Decode the message which contains (GET_META_MSG, rank)
455
+ msg, target_tp_rank = msgspec.msgpack.decode(msg)
456
+ logger.debug(
457
+ "Received message for tp rank %s",
458
+ target_tp_rank,
459
+ )
460
+ if msg != GET_META_MSG:
461
+ logger.warning("Connection listener got unexpected message %s", msg)
462
+ sock.send_multipart((identity, b"", encoded_data[target_tp_rank]))
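+ # Sketch of the wire protocol as implemented above: a peer opens a REQ
+ # socket (see _nixl_handshake below), sends the msgpack-encoded tuple
+ # (GET_META_MSG, target_tp_rank), and receives back the msgpack-encoded
+ # NixlAgentMetadata that was registered for that TP rank.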
463
+
464
+ def get_num_new_matched_tokens(
465
+ self, request: "Request", num_computed_tokens: int
466
+ ) -> tuple[int, bool]:
467
+ """
468
+ For remote prefill, pull all prompt blocks from remote
469
+ asynchronously relative to engine execution.
470
+
471
+ Args:
472
+ request (Request): the request object.
473
+ num_computed_tokens (int): the number of locally
474
+ computed tokens for this request
475
+ Returns:
476
+ * the number of tokens that can be loaded from the
477
+ external KV cache beyond what is already computed.
478
+ * true if the external KV cache tokens will be loaded
479
+ asynchronously (between scheduler steps).
480
+ """
481
+
482
+ params = request.kv_transfer_params
483
+ logger.debug(
484
+ "NIXLConnector get_num_new_matched_tokens: "
485
+ "num_computed_tokens=%s, kv_transfer_params=%s",
486
+ num_computed_tokens,
487
+ params,
488
+ )
489
+
490
+ if params is not None and params.get("do_remote_prefill"):
491
+ # Remote prefill: get all prompt blocks from remote.
492
+ token_ids = request.prompt_token_ids or []
493
+ count = len(token_ids) - num_computed_tokens
494
+ if count > 0:
495
+ return count, True
496
+
497
+ # No remote prefill for this request.
498
+ return 0, False
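+ # Worked example (illustrative): a do_remote_prefill request with a
+ # 1000-token prompt and num_computed_tokens=0 returns (1000, True), i.e.
+ # all prompt tokens can be pulled from the remote prefill worker
+ # asynchronously; a request without do_remote_prefill returns (0, False).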
499
+
500
+ def update_state_after_alloc(
501
+ self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
502
+ ):
503
+ params = request.kv_transfer_params
504
+ logger.debug(
505
+ "NIXLConnector update_state_after_alloc: "
506
+ "num_external_tokens=%s, kv_transfer_params=%s",
507
+ num_external_tokens,
508
+ params,
509
+ )
510
+
511
+ if not params:
512
+ return
513
+
514
+ if params.get("do_remote_decode"):
515
+ self._reqs_in_batch.add(request.request_id)
516
+ if self.use_host_buffer and params.get("do_remote_decode"):
517
+ # NOTE: when accelerator is not directly supported by Nixl,
518
+ # prefilled blocks need to be saved to host memory before transfer.
519
+
520
+ # save all blocks
521
+ block_ids = blocks.get_block_ids()[0]
522
+ # TODO: skip the blocks that are already in the host xfer buffer.
523
+ # Currently, host xfer buffer blocks are 1-to-1 mapped to device
524
+ # kv blocks, so a host block won't be flushed as long as its device
525
+ # block is not overwritten, and it is safe to skip saving such
526
+ # blocks to the host xfer buffer.
527
+ if block_ids:
528
+ self._reqs_need_save[request.request_id] = (request, block_ids)
529
+ elif params.get("do_remote_prefill"):
530
+ if params.get("remote_block_ids"):
531
+ if all(
532
+ p in params
533
+ for p in ("remote_engine_id", "remote_host", "remote_port")
534
+ ):
535
+ # If remote_blocks and num_external_tokens = 0, we have
536
+ # a full prefix cache hit on the D worker. We need to call
537
+ # send_notif in _read_blocks to free the memory on the P.
538
+ local_block_ids = (
539
+ blocks.get_unhashed_block_ids()
540
+ if num_external_tokens > 0
541
+ else []
542
+ )
543
+ # Get unhashed blocks to pull from remote.
544
+ self._reqs_need_recv[request.request_id] = (
545
+ request,
546
+ local_block_ids,
547
+ )
548
+
549
+ else:
550
+ logger.warning(
551
+ "Got invalid KVTransferParams: %s. This "
552
+ "request will not utilize KVTransfer",
553
+ params,
554
+ )
555
+ else:
556
+ assert num_external_tokens == 0
557
+ # Only trigger 1 KV transfer per request.
558
+ params["do_remote_prefill"] = False
559
+
560
+ def build_connector_meta(
561
+ self,
562
+ scheduler_output: SchedulerOutput,
563
+ ) -> KVConnectorMetadata:
564
+ meta = NixlConnectorMetadata()
565
+
566
+ # Loop through scheduled reqs and convert to ReqMeta.
567
+ for req_id, (req, block_ids) in self._reqs_need_recv.items():
568
+ assert req.kv_transfer_params is not None
569
+ meta.add_new_req(
570
+ request_id=req_id,
571
+ local_block_ids=block_ids,
572
+ kv_transfer_params=req.kv_transfer_params,
573
+ load_remote_cache=True,
574
+ save_to_host=False,
575
+ )
576
+
577
+ for req_id, (req, block_ids) in self._reqs_need_save.items():
578
+ assert req.kv_transfer_params is not None
579
+ meta.add_new_req(
580
+ request_id=req_id,
581
+ local_block_ids=block_ids,
582
+ kv_transfer_params=req.kv_transfer_params,
583
+ load_remote_cache=False,
584
+ save_to_host=True,
585
+ )
586
+
587
+ meta.reqs_to_send = self._reqs_need_send
588
+ meta.reqs_in_batch = self._reqs_in_batch
589
+ meta.reqs_not_processed = self._reqs_not_processed
590
+
591
+ # Clear the list once workers start the transfers
592
+ self._reqs_need_recv.clear()
593
+ self._reqs_need_save.clear()
594
+ self._reqs_in_batch = set()
595
+ self._reqs_not_processed = set()
596
+ self._reqs_need_send = {}
597
+
598
+ return meta
599
+
600
+ def request_finished(
601
+ self,
602
+ request: "Request",
603
+ block_ids: list[int],
604
+ ) -> tuple[bool, dict[str, Any] | None]:
605
+ """
606
+ Once a request is finished, determine whether request blocks
607
+ should be freed now or will be sent asynchronously and freed later.
608
+ """
609
+ from vllm.v1.request import RequestStatus
610
+
611
+ params = request.kv_transfer_params
612
+ logger.debug(
613
+ "NIXLConnector request_finished(%s), request_status=%s, "
614
+ "kv_transfer_params=%s",
615
+ request.request_id,
616
+ request.status,
617
+ params,
618
+ )
619
+ if not params:
620
+ return False, None
621
+
622
+ if params.get("do_remote_prefill"):
623
+ # If do_remote_prefill is still True when the request is finished,
624
+ # update_state_after_alloc must not have been called (the request
625
+ # must have been aborted before it was scheduled).
626
+ # To avoid stranding the prefill blocks in the prefill instance,
627
+ # we must add empty block_ids to _reqs_need_recv so that our
628
+ # worker side will notify and free blocks in the prefill instance.
629
+ self._reqs_need_recv[request.request_id] = (request, [])
630
+ params["do_remote_prefill"] = False
631
+ return False, None
632
+
633
+ if not params.get("do_remote_decode"):
634
+ return False, None
635
+ if request.status != RequestStatus.FINISHED_LENGTH_CAPPED:
636
+ # Also include the case of a P/D Prefill request with immediate
637
+ # block free (e.g. abort). Stop tracking this request.
638
+ self._reqs_not_processed.add(request.request_id)
639
+ return False, None
640
+
641
+ # TODO: check whether block_ids can actually ever be empty. If not, we
642
+ # could remove the conditional below.
643
+ delay_free_blocks = len(block_ids) > 0
644
+
645
+ if delay_free_blocks:
646
+ # Prefill request on remote. It will be read from D upon completion
647
+ logger.debug(
648
+ "NIXLConnector request_finished(%s) waiting for %d seconds "
649
+ "for remote decode to fetch blocks",
650
+ request.request_id,
651
+ envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT,
652
+ )
653
+ self._reqs_need_send[request.request_id] = (
654
+ time.perf_counter() + envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT
655
+ )
656
+
657
+ return delay_free_blocks, dict(
658
+ do_remote_prefill=True,
659
+ do_remote_decode=False,
660
+ remote_block_ids=block_ids,
661
+ remote_engine_id=self.engine_id,
662
+ remote_host=self.side_channel_host,
663
+ remote_port=self.side_channel_port,
664
+ tp_size=self.vllm_config.parallel_config.tensor_parallel_size,
665
+ )
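+ # Illustrative outcome (not from the original source): a prefill request
+ # finished with FINISHED_LENGTH_CAPPED and non-empty block_ids returns
+ # (True, {...kv_transfer_params...}), so its blocks stay alive until the
+ # decode side pulls them or VLLM_NIXL_ABORT_REQUEST_TIMEOUT expires; any
+ # other finished request returns (False, None) and is freed immediately.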
666
+
667
+
668
+ class NixlConnectorWorker:
669
+ """Implementation of Worker side methods"""
670
+
671
+ @dataclass
672
+ class TpKVTopology:
673
+ """
674
+ Helper class holding tensor-parallel and KV topology information used
675
+ for mapping between local and remote TP workers.
676
+ """
677
+
678
+ tp_rank: int
679
+ remote_tp_size: dict[EngineId, int]
680
+ is_mla: bool
681
+ total_num_kv_heads: int
682
+ attn_backend: type[AttentionBackend]
683
+ engine_id: EngineId
684
+ remote_block_size: dict[EngineId, int]
685
+
686
+ def __post_init__(self):
687
+ # Figure out whether the first dimension of the cache is K/V
688
+ # or num_blocks. This is used to register the memory regions correctly.
689
+ kv_cache_shape = self.attn_backend.get_kv_cache_shape(
690
+ num_blocks=1, block_size=16, num_kv_heads=1, head_size=1
691
+ )
692
+ # Non-MLA backend caches have 5 dims [2, num_blocks, H, N, D];
693
+ # we just mock num_blocks to 1 for the dimension check below.
694
+ self._is_kv_layout_blocks_first = (
695
+ len(kv_cache_shape) == 5 and kv_cache_shape[0] == 1
696
+ )
697
+
698
+ attn_backend = AttentionBackendEnum[self.attn_backend.get_name()]
699
+ self._use_pallas = attn_backend == AttentionBackendEnum.PALLAS
700
+
701
+ @property
702
+ def is_kv_layout_blocks_first(self) -> bool:
703
+ return self._is_kv_layout_blocks_first
704
+
705
+ @property
706
+ def split_k_and_v(self) -> bool:
707
+ # Whether to register regions for K and V separately (when present).
708
+ return not (
709
+ self.is_mla or self._use_pallas or self.is_kv_layout_blocks_first
710
+ )
711
+
712
+ @property
713
+ def tp_size(self) -> int:
714
+ return self.remote_tp_size[self.engine_id]
715
+
716
+ @property
717
+ def block_size(self) -> int:
718
+ return self.remote_block_size[self.engine_id]
719
+
720
+ def tp_ratio(
721
+ self,
722
+ remote_tp_size: int,
723
+ ) -> int:
724
+ """
725
+ Calculate the tensor parallel ratio between local and remote TP.
726
+ We can think of it as the number of local TP workers per remote TP
728
+ worker. Local workers will read from the same remote TP worker in
728
+ groups of size `tp_ratio`.
729
+ """
730
+ assert self.tp_size % remote_tp_size == 0, (
731
+ f"Local tensor parallel size {self.tp_size} is not divisible "
732
+ f"by remote tensor parallel size {remote_tp_size}."
733
+ )
734
+ return self.tp_size // remote_tp_size
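+ # Worked example (illustrative): with a local tp_size of 8 and a remote
+ # tp_size of 2, tp_ratio() returns 4, i.e. local TP ranks 0-3 all read
+ # from remote TP rank 0 and ranks 4-7 read from remote TP rank 1
+ # (see get_target_remote_rank below).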
735
+
736
+ def block_size_ratio(
737
+ self,
738
+ remote_block_size: int,
739
+ ) -> float:
740
+ """
741
+ Calculate the block size ratio between local and remote TP.
742
+ """
743
+ assert self.block_size % remote_block_size == 0, (
744
+ f"Local block size {self.block_size} is not divisible "
745
+ f"by remote block size {remote_block_size} or vice versa."
746
+ )
747
+ return self.block_size // remote_block_size
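+ # Worked example (illustrative): with a local block size of 32 and a
+ # remote block size of 16, block_size_ratio() returns 2; a remote block
+ # size of 48 would fail the divisibility assertion above.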
748
+
749
+ def tp_ratio_from_engine_id(
750
+ self,
751
+ remote_engine_id: EngineId,
752
+ ) -> int:
753
+ remote_tp_size = self.remote_tp_size[remote_engine_id]
754
+ return self.tp_ratio(remote_tp_size)
755
+
756
+ def block_size_ratio_from_engine_id(
757
+ self,
758
+ remote_engine_id: EngineId,
759
+ ) -> float:
760
+ remote_block_size = self.remote_block_size[remote_engine_id]
761
+ return self.block_size_ratio(remote_block_size)
762
+
763
+ def is_kv_replicated(self, engine_id: EngineId) -> bool:
764
+ """
765
+ Whether the KV cache is replicated across TP workers due to the
766
+ number of TP workers being greater than or equal to the number of KV heads.
767
+ """
768
+ tp_size = self.remote_tp_size[engine_id]
769
+ return tp_size // self.total_num_kv_heads >= 1
770
+
771
+ def replicates_kv_cache(self, remote_engine_id: EngineId) -> bool:
772
+ # MLA is always replicated as the hidden dim can't be split.
773
+ return self.is_mla or self.is_kv_replicated(remote_engine_id)
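+ # Illustrative example: with 16 remote TP workers and 4 total KV heads,
+ # 16 // 4 >= 1, so the KV cache is treated as replicated; MLA models are
+ # always treated as replicated because the hidden dim can't be split.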
774
+
775
+ def get_target_remote_rank(
776
+ self,
777
+ remote_tp_size: int,
778
+ ) -> int:
779
+ """
780
+ Get the remote TP rank (on P) that the current local TP rank
781
+ (on D) will read from.
782
+ """
783
+ tp_ratio = self.tp_ratio(remote_tp_size)
784
+ return self.tp_rank // tp_ratio
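+ # Worked example (illustrative): with tp_ratio == 4, local TP rank 5 on
+ # the decode side reads from remote prefill TP rank 5 // 4 == 1.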
785
+
786
+ def get_target_remote_rank_from_engine_id(
787
+ self,
788
+ remote_engine_id: EngineId,
789
+ ) -> int:
790
+ remote_tp_size = self.remote_tp_size[remote_engine_id]
791
+ return self.get_target_remote_rank(remote_tp_size)
792
+
793
+ def __init__(self, vllm_config: VllmConfig, engine_id: str):
794
+ if NixlWrapper is None:
795
+ logger.error("NIXL is not available")
796
+ raise RuntimeError("NIXL is not available")
797
+ logger.info("Initializing NIXL wrapper")
798
+ logger.info("Initializing NIXL worker %s", engine_id)
799
+
800
+ # Config.
801
+ self.vllm_config = vllm_config
802
+ self.block_size = vllm_config.cache_config.block_size
803
+
804
+ if vllm_config.kv_transfer_config is None:
805
+ raise ValueError("kv_transfer_config must be set for NixlConnector")
806
+ self.kv_transfer_config = vllm_config.kv_transfer_config
807
+
808
+ self.nixl_backends = vllm_config.kv_transfer_config.get_from_extra_config(
809
+ "backends", ["UCX"]
810
+ )
811
+
812
+ # Agent.
813
+ non_ucx_backends = [b for b in self.nixl_backends if b != "UCX"]
814
+ # Configure NIXL num_threads to avoid UAR exhaustion on Mellanox NICs.
815
+ # Each UCX thread allocates UARs (doorbell pages) via DevX, and
816
+ # excessive NIXL UAR usage can exhaust NIC UAR space. This can cause
817
+ # components like NVSHMEM (used by DeepEP kernels) to fail during RDMA
818
+ # initialization with "mlx5dv_devx_alloc_uar" errors.
819
+ # Ref: https://network.nvidia.com/files/doc-2020/ethernet-adapters-programming-manual.pdf#page=63
820
+ num_threads = vllm_config.kv_transfer_config.get_from_extra_config(
821
+ "num_threads", 4
822
+ )
823
+ if nixl_agent_config is None:
824
+ config = None
825
+ else:
826
+ # Enable telemetry by default for NIXL 0.7.1 and above.
827
+ config = (
828
+ nixl_agent_config(backends=self.nixl_backends, capture_telemetry=True)
829
+ if len(non_ucx_backends) > 0
830
+ else nixl_agent_config(num_threads=num_threads, capture_telemetry=True)
831
+ )
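+ # Hypothetical launch-time example (assumed field names, not from this
+ # diff): "backends" and "num_threads" above are read from the connector's
+ # extra config, e.g.
+ #   --kv-transfer-config '{"kv_connector": "NixlConnector",
+ #     "kv_connector_extra_config": {"backends": ["UCX"], "num_threads": 4}}'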
832
+
833
+ self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), config)
834
+ # Map of engine_id -> {rank0: agent_name0, rank1: agent_name1..}.
835
+ self._remote_agents: dict[EngineId, dict[int, str]] = defaultdict(dict)
836
+
837
+ # Metadata.
838
+ self.engine_id: EngineId = engine_id
839
+ self.tp_rank = get_tensor_model_parallel_rank()
840
+ self.world_size = get_tensor_model_parallel_world_size()
841
+ self.tp_group = get_tp_group()
842
+ self.num_blocks = 0
843
+ self.enable_permute_local_kv = False
844
+
845
+ # KV Caches and nixl tracking data.
846
+ self.device_type = current_platform.device_type
847
+ self.kv_buffer_device: str = vllm_config.kv_transfer_config.kv_buffer_device
848
+ if self.device_type not in _NIXL_SUPPORTED_DEVICE:
849
+ raise RuntimeError(f"{self.device_type} is not supported.")
850
+ elif self.kv_buffer_device not in _NIXL_SUPPORTED_DEVICE[self.device_type]:
851
+ raise RuntimeError(
852
+ f"{self.device_type} with {self.kv_buffer_device} kv_buffer "
853
+ "is not supported."
854
+ )
855
+ self.device_kv_caches: dict[str, torch.Tensor] = {}
856
+
857
+ # CPU KV buffer for transfers,
858
+ # used when device memory cannot be registered with NIXL.
859
+ self.host_xfer_buffers: dict[str, torch.Tensor] = {}
860
+ if self.device_type == "cpu":
861
+ self.use_host_buffer = False
862
+ else:
863
+ self.use_host_buffer = self.kv_buffer_device == "cpu"
864
+
865
+ # Support for out-of-tree platforms whose NIXL memory type cannot be
866
+ # inferred from kv_buffer_device; they provide it via current_platform.
867
+ nixl_memory_type = current_platform.get_nixl_memory_type()
868
+ if nixl_memory_type is None:
869
+ if self.kv_buffer_device == "cuda":
870
+ nixl_memory_type = "VRAM"
871
+ elif self.kv_buffer_device == "cpu":
872
+ nixl_memory_type = "DRAM"
873
+ if nixl_memory_type is None:
874
+ raise RuntimeError(
875
+ f"{self.device_type} with {self.kv_buffer_device} kv_buffer "
876
+ "is not supported."
877
+ )
878
+ self.nixl_memory_type = nixl_memory_type
879
+
880
+ # Note: host xfer buffer ops when use_host_buffer is True
881
+ self.copy_blocks: CopyBlocksOp | None = None
882
+
883
+ # Map of engine_id -> kv_caches_base_addr. For TP case, each local
884
+ # rank will still only pull from a single remote TP worker.
885
+ self.kv_caches_base_addr: dict[EngineId, list[int]] = {}
886
+ self.device_id: int = 0
887
+
888
+ # Number of NIXL regions. Currently one region per cache
889
+ # (so 1 per layer for MLA, otherwise 2 per layer)
890
+ self.num_regions = 0
891
+ self.num_layers = 0
892
+
893
+ # nixl_prepped_dlist_handle.
894
+ self.src_xfer_side_handle: int = 0
895
+ self.src_xfer_side_handles: dict[int, int] = {}
896
+ # Map of engine_id -> nixl_prepped_dlist_handle (int).
897
+ self.dst_xfer_side_handles: dict[EngineId, int] = {}
898
+
899
+ # Map of engine_id -> num_blocks. All ranks in the same deployment will
900
+ # have the same number of blocks.
901
+ self.dst_num_blocks: dict[EngineId, int] = {}
902
+ self._registered_descs: list[Any] = []
903
+
904
+ # In-progress transfers.
905
+ # [req_id -> list of (xfer_handle, start_time)]
906
+ self._recving_metadata: dict[ReqId, ReqMeta] = {}
907
+ self._recving_transfers = defaultdict[ReqId, list[Transfer]](list)
908
+ # Track the expiration time of requests that are waiting to be sent.
909
+ self._reqs_to_send: dict[ReqId, float] = {}
910
+ # Set of requests that have been part of a batch, regardless of status.
911
+ self._reqs_to_process: set[ReqId] = set()
912
+
913
+ # invalid blocks from failed NIXL operations
914
+ self._invalid_block_ids: set[int] = set()
915
+ # requests that skipped transfer (handshake or transfer failures)
916
+ self._failed_recv_reqs: set[ReqId] = set()
917
+
918
+ # Handshake metadata of this worker for NIXL transfers.
919
+ self.xfer_handshake_metadata: NixlAgentMetadata | None = None
920
+ # Background thread for initializing new NIXL handshakes.
921
+ self._handshake_initiation_executor = ThreadPoolExecutor(
922
+ # NIXL is not guaranteed to be thread-safe, limit 1 worker.
923
+ max_workers=1,
924
+ thread_name_prefix="vllm-nixl-handshake-initiator",
925
+ )
926
+ self._ready_requests = queue.Queue[tuple[ReqId, ReqMeta]]()
927
+ self._handshake_futures: dict[EngineId, Future[dict[int, str]]] = {}
928
+ # Protects _handshake_futures and _remote_agents.
929
+ self._handshake_lock = threading.RLock()
930
+
931
+ self.block_size = vllm_config.cache_config.block_size
932
+ self.model_config = vllm_config.model_config
933
+ self.cache_config = vllm_config.cache_config
934
+
935
+ # TODO(mgoin): remove this once we have hybrid memory allocator
936
+ # Optimization for models with local attention (Llama 4)
937
+ # List of block window sizes for each layer for local attention
938
+ self.block_window_per_layer: list[int | None] = []
939
+ self.use_mla = self.model_config.use_mla
940
+
941
+ backend = get_attn_backend(
942
+ self.model_config.get_head_size(),
943
+ self.model_config.dtype,
944
+ self.cache_config.cache_dtype,
945
+ self.block_size,
946
+ use_mla=self.use_mla,
947
+ )
948
+ self.backend_name = backend.get_name()
949
+ self.kv_cache_layout = get_kv_cache_layout()
950
+ self.host_buffer_kv_cache_layout = self.kv_cache_layout
951
+ logger.debug("Detected attention backend %s", self.backend_name)
952
+ logger.debug("Detected kv cache layout %s", self.kv_cache_layout)
953
+
954
+ self._tp_size: dict[EngineId, int] = {self.engine_id: self.world_size}
955
+ self._block_size: dict[EngineId, int] = {self.engine_id: self.block_size}
956
+ # With heterogeneous TP, P must wait for all assigned D TP workers to
957
+ # finish reading before safely freeing the blocks.
958
+ self.consumer_notification_counts_by_req = defaultdict[ReqId, int](int)
959
+ self.xfer_stats = NixlKVConnectorStats()
960
+
961
+ self.kv_topo = self.TpKVTopology(
962
+ tp_rank=self.tp_rank,
963
+ engine_id=self.engine_id,
964
+ remote_tp_size=self._tp_size, # shared state
965
+ remote_block_size=self._block_size, # shared state
966
+ is_mla=self.use_mla,
967
+ total_num_kv_heads=self.model_config.get_total_num_kv_heads(),
968
+ attn_backend=backend,
969
+ )
970
+ self._use_pallas = self.kv_topo._use_pallas
971
+ self._physical_blocks_per_logical_kv_block = 1
972
+
973
+ def _nixl_handshake(
974
+ self,
975
+ host: str,
976
+ port: int,
977
+ remote_tp_size: int,
978
+ expected_engine_id: str,
979
+ ) -> dict[int, str]:
980
+ """Do a NIXL handshake with a remote instance."""
981
+
982
+ start_time = time.perf_counter()
983
+
984
+ # NOTE(rob): we need each rank to have a unique port. This is
985
+ # a hack to keep us moving. We will switch when moving to etcd
986
+ # or to a design where we have a single ZMQ socket in the scheduler.
987
+
988
+ # Handshake only with the remote TP rank that current local rank will
989
+ # pull from. With homogeneous TP it happens to be the same rank_i.
990
+ p_remote_rank = self.kv_topo.get_target_remote_rank(remote_tp_size)
991
+ path = make_zmq_path("tcp", host, port)
992
+ logger.debug(
993
+ "Querying metadata on path: %s at remote tp rank %s", path, p_remote_rank
994
+ )
995
+
996
+ # Send query for the request.
997
+ with zmq_ctx(zmq.REQ, path) as sock:
998
+ msg = msgspec.msgpack.encode((GET_META_MSG, p_remote_rank))
999
+ # Set receive timeout to 5 seconds to avoid hanging on dead server
1000
+ sock.setsockopt(zmq.RCVTIMEO, 5000) # milliseconds
1001
+ sock.send(msg)
1002
+ metadata_bytes = sock.recv()
1003
+ decoder = msgspec.msgpack.Decoder(NixlAgentMetadata)
1004
+ metadata = decoder.decode(metadata_bytes)
1005
+ got_metadata_time = time.perf_counter()
1006
+ logger.debug(
1007
+ "NIXL handshake: get metadata took: %s", got_metadata_time - start_time
1008
+ )
1009
+
1010
+ # Ensure engine id matches.
1011
+ if metadata.engine_id != expected_engine_id:
1012
+ raise RuntimeError(
1013
+ f"Remote NIXL agent engine ID mismatch. "
1014
+ f"Expected {expected_engine_id},"
1015
+ f"received {metadata.engine_id}."
1016
+ )
1017
+
1018
+ # Register Remote agent.
1019
+ assert metadata.block_size <= self.block_size, (
1020
+ "nP > nD is not supported yet."
1021
+ )
1022
+ remote_agent_name = self.add_remote_agent(
1023
+ metadata, p_remote_rank, remote_tp_size
1024
+ )
1025
+
1026
+ setup_agent_time = time.perf_counter()
1027
+ logger.debug(
1028
+ "NIXL handshake: add agent took: %s",
1029
+ setup_agent_time - got_metadata_time,
1030
+ )
1031
+
1032
+ # Remote rank -> agent name.
1033
+ return {p_remote_rank: remote_agent_name}
1034
+
1035
+ def initialize_host_xfer_buffer(self, kv_caches: dict[str, torch.Tensor]) -> None:
1036
+ """
1037
+ Initialize transfer buffer in CPU mem for accelerators
1038
+ NOT directly supported by NIXL (e.g., tpu)
1039
+ """
1040
+ xfer_buffers: dict[str, torch.Tensor] = {}
1041
+ inv_order = [0, 1, 3, 2, 4]
1042
+ try:
1043
+ for layer_name, kv_cache in kv_caches.items():
1044
+ kv_shape = kv_cache.shape
1045
+ kv_dtype = kv_cache.dtype
1046
+ permute_shape = False
1047
+ if (
1048
+ self.kv_cache_layout == "NHD"
1049
+ and self.vllm_config.kv_transfer_config is not None
1050
+ and self.vllm_config.kv_transfer_config.enable_permute_local_kv
1051
+ ):
1052
+ logger.info_once(
1053
+ "'enable_permute_local_kv' flag is enabled while "
1054
+ "device KV Layout is NHD. Init host buffer with"
1055
+ " HND to better support Decode/Prefill TP_ratio > 1."
1056
+ )
1057
+ # Since NHD will not support Decode/Prefill TP_ratio > 1,
1058
+ # we can leverage the host buffer to permute the layout.
1059
+ self.host_buffer_kv_cache_layout = "HND"
1060
+ kv_shape = (
1061
+ tuple(kv_shape[i] for i in inv_order)
1062
+ if not self.use_mla
1063
+ else kv_shape
1064
+ )
1065
+ permute_shape = not self.use_mla
1066
+
1067
+ xfer_buffers[layer_name] = torch.empty(
1068
+ kv_shape, dtype=kv_dtype, device="cpu"
1069
+ )
1070
+ if permute_shape:
1071
+ xfer_buffers[layer_name] = xfer_buffers[layer_name].permute(
1072
+ inv_order
1073
+ )
1074
+ except MemoryError as e:
1075
+ logger.error("NIXLConnectorWorker gets %s.", e)
1076
+ raise
1077
+
1078
+ self.host_xfer_buffers = xfer_buffers
1079
+
1080
+ def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp):
1081
+ """Assign copy (d2h, h2d) operations when host buffer is used."""
1082
+ # Set a no-op if the host buffer is not cpu.
1083
+ if self.kv_buffer_device != "cpu":
1084
+ return
1085
+ # No-op if self.device_type is "cpu" (the host buffer is not used).
1086
+ if self.device_type == "cpu":
1087
+ return
1088
+ assert self.use_host_buffer
1089
+ self.copy_blocks = copy_operation
1090
+
1091
+ def _background_nixl_handshake(
1092
+ self, req_id: str, remote_engine_id: EngineId, meta: ReqMeta
1093
+ ):
1094
+ # Do NIXL handshake in background and add to _ready_requests when done.
1095
+ fut = self._handshake_futures.get(remote_engine_id)
1096
+ if fut is None:
1097
+ fut = self._handshake_initiation_executor.submit(
1098
+ self._nixl_handshake,
1099
+ meta.remote_host,
1100
+ meta.remote_port,
1101
+ meta.tp_size,
1102
+ remote_engine_id,
1103
+ )
1104
+ self._handshake_futures[remote_engine_id] = fut
1105
+
1106
+ def done_callback(f: Future[dict[int, str]], eid=remote_engine_id):
1107
+ with self._handshake_lock:
1108
+ del self._handshake_futures[eid]
1109
+ try:
1110
+ self._remote_agents[eid] = f.result()
1111
+ except Exception:
1112
+ logger.exception("Handshake with %s failed", eid)
1113
+
1114
+ fut.add_done_callback(done_callback)
1115
+
1116
+ # check handshake success before proceeding with request
1117
+ def request_ready(f: Future[Any], entry=(req_id, meta)):
1118
+ try:
1119
+ # check if handshake succeeded
1120
+ f.result()
1121
+ self._ready_requests.put(entry)
1122
+ except Exception:
1123
+ # handshake failed - mark blocks as invalid
1124
+ logger.exception(
1125
+ "Handshake failed for request %s, marking blocks as invalid", req_id
1126
+ )
1127
+ if req_meta := self._recving_metadata.get(req_id):
1128
+ self._invalid_block_ids.update(req_meta.local_block_ids)
1129
+ self._failed_recv_reqs.add(req_id)
1130
+
1131
+ fut.add_done_callback(request_ready)
1132
+
1133
+ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
1134
+ """Register the KV Cache data in nixl."""
1135
+
1136
+ if self.use_host_buffer:
1137
+ self.initialize_host_xfer_buffer(kv_caches=kv_caches)
1138
+ assert len(self.host_xfer_buffers) == len(kv_caches), (
1139
+ f"host_buffer: {len(self.host_xfer_buffers)}, "
1140
+ f"kv_caches: {len(kv_caches)}"
1141
+ )
1142
+ xfer_buffers = self.host_xfer_buffers
1143
+ else:
1144
+ xfer_buffers = kv_caches
1145
+ assert not self.host_xfer_buffers, (
1146
+ "host_xfer_buffer should not be initialized when "
1147
+ f"kv_buffer_device is {self.kv_buffer_device}"
1148
+ )
1149
+
1150
+ logger.info(
1151
+ "Registering KV_Caches. use_mla: %s, kv_buffer_device: %s, "
1152
+ "use_host_buffer: %s",
1153
+ self.use_mla,
1154
+ self.kv_buffer_device,
1155
+ self.use_host_buffer,
1156
+ )
1157
+
1158
+ caches_data = []
1159
+ # With hybrid allocator, layers can share a kv cache tensor
1160
+ seen_base_addresses = []
1161
+
1162
+ # Note(tms): I modified this from the original region setup code.
1163
+ # K and V are now in different regions. Advantage is that we can
1164
+ # elegantly support MLA and any cases where the K and V tensors
1165
+ # are non-contiguous (it's not locally guaranteed that they will be)
1166
+ # Disadvantage is that the encoded NixlAgentMetadata is now larger
1167
+ # (roughly 8KB vs 5KB).
1168
+ # Conversely for FlashInfer, K and V are registered in the same region
1169
+ # to better exploit the memory layout (ie num_blocks is the first dim).
1170
+ split_k_and_v = self.kv_topo.split_k_and_v
1171
+ tensor_size_bytes = None
1172
+
1173
+ # TODO (NickLucche): Get kernel_block_size in a cleaner way
1174
+ # NHD default "view" for non-MLA cache
1175
+ if self.device_type == "cpu":
1176
+ block_size_position = -2
1177
+ else:
1178
+ block_size_position = -2 if self.use_mla else -3
1179
+
1180
+ # Enable different block lengths for different layers when MLA is used.
1181
+ self.block_len_per_layer = list[int]()
1182
+ self.slot_size_per_layer = list[int]() # HD bytes in kv terms
1183
+ self.device_id = self.tp_rank
1184
+ for layer_name, cache_or_caches in xfer_buffers.items():
1185
+ cache_list = cache_or_caches if split_k_and_v else [cache_or_caches]
1186
+
1187
+ for cache in cache_list:
1188
+ base_addr = cache.data_ptr()
1189
+ if not self.use_host_buffer and current_platform.is_cuda_alike():
1190
+ self.device_id = cache.device.index
1191
+ if base_addr in seen_base_addresses:
1192
+ continue
1193
+
1194
+ kernel_block_size = cache.shape[block_size_position]
1195
+
1196
+ if self.block_size != kernel_block_size:
1197
+ logger.info_once(
1198
+ "User-specified logical block size (%s) does not match"
1199
+ " physical kernel block size (%s). Using the latter. ",
1200
+ self.block_size,
1201
+ kernel_block_size,
1202
+ )
1203
+ self._physical_blocks_per_logical_kv_block = (
1204
+ self.block_size // kernel_block_size
1205
+ )
1206
+ self.block_size = kernel_block_size
1207
+ self._block_size[self.engine_id] = kernel_block_size
1208
+
1209
+ seen_base_addresses.append(base_addr)
1210
+ curr_tensor_size_bytes = cache.numel() * cache.element_size()
1211
+
1212
+ if tensor_size_bytes is None:
1213
+ tensor_size_bytes = curr_tensor_size_bytes
1214
+ self.num_blocks = cache.shape[0]
1215
+
1216
+ assert cache.shape[0] == self.num_blocks, (
1217
+ "All kv cache tensors must have the same number of blocks"
1218
+ )
1219
+
1220
+ self.block_len_per_layer.append(
1221
+ curr_tensor_size_bytes // self.num_blocks
1222
+ )
1223
+ self.slot_size_per_layer.append(
1224
+ self.block_len_per_layer[-1] // self.block_size
1225
+ )
1226
+
1227
+ if not self.use_mla:
1228
+ # Differing kv cache shapes are not supported by HeteroTP
1229
+ assert tensor_size_bytes == curr_tensor_size_bytes, (
1230
+ "All kv cache tensors must have the same size"
1231
+ )
1232
+ # Need to make sure the device ID is non-negative for NIXL,
1233
+ # Torch uses -1 to indicate CPU tensors while NIXL uses explicit
1234
+ # memory type.
1235
+ self.device_id = max(cache.get_device(), 0)
1236
+ caches_data.append(
1237
+ (base_addr, curr_tensor_size_bytes, self.device_id, "")
1238
+ )
1239
+
1240
+ logger.debug(
1241
+ "Different block lengths collected: %s", set(self.block_len_per_layer)
1242
+ )
1243
+ assert len(self.block_len_per_layer) == len(seen_base_addresses)
1244
+ assert self.num_blocks != 0
1245
+
1246
+ self.kv_caches_base_addr[self.engine_id] = seen_base_addresses
1247
+ self.num_regions = len(caches_data)
1248
+ self.num_layers = len(xfer_buffers.keys())
1249
+
1250
+ descs = self.nixl_wrapper.get_reg_descs(caches_data, self.nixl_memory_type)
1251
+ logger.debug("Registering descs: %s", caches_data)
1252
+ self.nixl_wrapper.register_memory(descs, backends=self.nixl_backends)
1253
+ logger.debug("Done registering descs")
1254
+ self._registered_descs.append(descs)
1255
+
1256
+ self.device_kv_caches = kv_caches
1257
+ self.dst_num_blocks[self.engine_id] = self.num_blocks
1258
+ if self.kv_topo.is_kv_layout_blocks_first:
1259
+ for i in range(len(self.slot_size_per_layer)):
1260
+ assert self.slot_size_per_layer[i] % 2 == 0
1261
+ self.slot_size_per_layer[i] //= 2
1262
+
1263
+ # NOTE (NickLucche) When FlashInfer is used, memory is registered
1264
+ # with joint KV for each block. This minimizes the overhead in
1265
+ # registerMem allowing faster descs queries. In order to be able to
1266
+ # split on kv_heads dim as required by heterogeneous TP, one must
1267
+ # be able to index K/V separately. Hence we double the number
1268
+ # of 'virtual' regions here and halve `block_len` below.
1269
+ self.num_regions *= 2
1270
+
1271
+ # Register local/src descr for NIXL xfer.
1272
+ self.seen_base_addresses = seen_base_addresses
1273
+ self.src_xfer_side_handle = self.register_local_xfer_handler(self.block_size)
1274
+
1275
+ self.src_xfer_side_handles[self.block_size] = self.src_xfer_side_handle
1276
+
1277
+ # TODO(mgoin): Hybrid memory allocator is currently disabled for
1278
+ # models with local attention (Llama 4). Can remove this once enabled.
1279
+ if self.model_config.hf_config.model_type == "llama4":
1280
+ from transformers import Llama4TextConfig
1281
+
1282
+ assert isinstance(self.model_config.hf_text_config, Llama4TextConfig)
1283
+ llama4_config = self.model_config.hf_text_config
1284
+ no_rope_layers = llama4_config.no_rope_layers
1285
+ chunk_size = llama4_config.attention_chunk_size
1286
+ chunk_block_size = math.ceil(chunk_size / self.block_size)
1287
+ for layer_idx in range(self.num_layers):
1288
+ # no_rope_layers[layer_idx] == 0 means NoPE (global)
1289
+ # Any other value means RoPE (local chunked)
1290
+ is_local_attention = no_rope_layers[layer_idx] != 0
1291
+ block_window = chunk_block_size if is_local_attention else None
1292
+ self.block_window_per_layer.append(block_window)
1293
+ logger.debug(
1294
+ "Llama 4 block window per layer mapping: %s",
1295
+ self.block_window_per_layer,
1296
+ )
1297
+ assert len(self.block_window_per_layer) == self.num_layers
1298
+
1299
+ # After KV Caches registered, listen for new connections.
1300
+ self.xfer_handshake_metadata = NixlAgentMetadata(
1301
+ engine_id=self.engine_id,
1302
+ agent_metadata=self.nixl_wrapper.get_agent_metadata(),
1303
+ kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id],
1304
+ device_id=self.device_id,
1305
+ num_blocks=self.num_blocks,
1306
+ block_lens=self.block_len_per_layer,
1307
+ attn_backend_name=self.backend_name,
1308
+ kv_cache_layout=self.kv_cache_layout
1309
+ if not self.use_host_buffer
1310
+ else self.host_buffer_kv_cache_layout,
1311
+ block_size=self.block_size,
1312
+ )
1313
+
1314
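To make the block_len_per_layer / slot_size_per_layer bookkeeping in register_kv_caches above concrete, here is a minimal sketch (hypothetical sizes, fp16, split K/V cache) of how both values are derived from a single K tensor:

import torch

num_blocks, block_size, num_kv_heads, head_dim = 1000, 16, 8, 128
k_cache = torch.empty(num_blocks, block_size, num_kv_heads, head_dim, dtype=torch.float16)
tensor_size_bytes = k_cache.numel() * k_cache.element_size()
block_len = tensor_size_bytes // num_blocks   # bytes per block: 16 * 8 * 128 * 2 = 32768
slot_size = block_len // block_size           # bytes per token slot ("HD bytes"): 2048
assert (block_len, slot_size) == (32768, 2048)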
+ def register_local_xfer_handler(
1315
+ self,
1316
+ block_size: int,
1317
+ ) -> int:
1318
+ """
1319
+ Register a local xfer handler using either the local block_size or the
1320
+ remote block_size.
1321
+
1322
+ When the local block_size matches the remote block_size, we use the local
1323
+ block_size to register the local_xfer_handler during init.
1324
+
1325
+ When the remote block_size is smaller than the local block_size, we register
1326
+ another local_xfer_handler using the remote block length to ensure
1327
+ data copy correctness.
1328
+ """
1329
+ block_size_ratio = self.block_size // block_size
1330
+ blocks_data = []
1331
+ for i, base_addr in enumerate(self.seen_base_addresses):
1332
+ # The new block_len uses the prefill (remote) block_len,
1333
+ # and num_blocks is multiplied by the block_size_ratio accordingly.
1334
+ kv_block_len = (
1335
+ self.get_backend_aware_kv_block_len(layer_idx=i) // block_size_ratio
1336
+ )
1337
+ block_len_per_layer = self.block_len_per_layer[i] // block_size_ratio
1338
+ num_blocks = self.num_blocks * block_size_ratio
1339
+ for block_id in range(num_blocks):
1340
+ block_offset = block_id * block_len_per_layer
1341
+ addr = base_addr + block_offset
1342
+ # (addr, len, device id)
1343
+ blocks_data.append((addr, kv_block_len, self.device_id))
1344
+
1345
+ if self.kv_topo.is_kv_layout_blocks_first:
1346
+ # Separate and interleave K/V regions to maintain the same
1347
+ # descs ordering. This is needed for selecting contiguous heads
1348
+ # when split across TP ranks.
1349
+ for block_id in range(num_blocks):
1350
+ block_offset = block_id * block_len_per_layer
1351
+ addr = base_addr + block_offset
1352
+ # Register addresses for V cache (K registered first).
1353
+ v_addr = addr + kv_block_len
1354
+ blocks_data.append((v_addr, kv_block_len, self.device_id))
1355
+ logger.debug(
1356
+ "Created %s blocks for src engine %s and rank %s on device id %s",
1357
+ len(blocks_data),
1358
+ self.engine_id,
1359
+ self.tp_rank,
1360
+ self.device_id,
1361
+ )
1362
+
1363
+ descs = self.nixl_wrapper.get_xfer_descs(blocks_data, self.nixl_memory_type)
1364
+ # NIXL_INIT_AGENT to be used for preparations of local descs.
1365
+ return self.nixl_wrapper.prep_xfer_dlist("NIXL_INIT_AGENT", descs)
1366
+
1367
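For illustration, a minimal sketch (hypothetical sizes) of the re-descriptoring above when a remote prefill worker uses a smaller block size: the same local memory is covered with block_size_ratio times as many descriptors, each block_size_ratio times shorter.

local_block_size, remote_block_size = 16, 4
block_size_ratio = local_block_size // remote_block_size  # 4
num_blocks, block_len = 1000, 32768                       # local descriptor view
remote_view_num_blocks = num_blocks * block_size_ratio    # 4000 descriptors
remote_view_block_len = block_len // block_size_ratio     # 8192 bytes each
assert num_blocks * block_len == remote_view_num_blocks * remote_view_block_len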
+ def add_remote_agent(
1368
+ self,
1369
+ nixl_agent_meta: NixlAgentMetadata,
1370
+ remote_tp_rank: int = 0,
1371
+ remote_tp_size: int = 1,
1372
+ ) -> str:
1373
+ """
1374
+ Add the remote NIXL agent and prepare the descriptors for reading cache
1375
+ blocks from remote.
1376
+
1377
+ In particular, handle both homogeneous and heterogeneous TP. The former
1378
+ requires local rank_i to read from remote rank_i.
1379
+ The latter, assuming D.world_size > P.world_size, requires that two or
1380
+ more local TP worker share the xfer from a single TP worker.
1381
+
1382
+ Here's an example (non-MLA case):
1383
+
1384
+ rank_offset p_remote_tp_rank
1385
+ (kv split no)
1386
+ --------------------------------
1387
+ 0 0 Worker0 ---- 1st half of KV ----> Worker0 [ KV Cache ]
1388
+ /
1389
+ 1 0 Worker1 ---- 2nd half of KV -----/
1390
+
1391
+ 0 1 Worker2 ---- 1st half of KV ----> Worker1 [ KV Cache ]
1392
+ /
1393
+ 1 1 Worker3 ---- 2nd half of KV -----/
1394
+
1395
+
1396
+ Decoder TP workers Prefix TP workers
1397
+ (world_size=4) (world_size=2)
1398
+ tp_ratio = 4 // 2 = 2
1399
+
1400
+ Considering the KV Caches, if P-Worker_i has cache size [2, num_blocksP, kv_heads, block_size, head_dim]
1401
+ then D-Worker_j has [2, num_blocksD, kv_heads//tp_ratio, block_size, head_dim]. Mind the "HND" layout format.
1402
+ Assuming num_blocksD >= num_blocksP, D-Worker0 reads from P-Worker0 by preparing the kv_heads//tp_ratio
1403
+ first heads from all the slots of all the blocks. D-Worker1 will do the same, but reading the second split
1404
+ along the kv_heads dimension, and so forth until "tp_ratio" D TP workers have pulled from P-Worker0.
1405
+
1406
+ Note that the above will also hold true for the homogeneous TP case, where tp_ratio evaluates to 1.
1407
+
1408
+ Regarding MLA case, the cache is replicated across TP workers so the rank_offset will just always be 0
1409
+ so that the whole cache is shared by "tp_ratio" D TP workers.
1410
+ """ # noqa: E501
1411
+ engine_id = nixl_agent_meta.engine_id
1412
+ # TODO re-evaluate refreshing for scaling/recovery
1413
+ if remote_tp_rank in self._remote_agents.get(engine_id, {}):
1414
+ logger.debug(
1415
+ "Remote agent with engine_id %s and rank"
1416
+ "%s already exchanged metadata, skip handshake.",
1417
+ engine_id,
1418
+ remote_tp_rank,
1419
+ )
1420
+ return self._remote_agents[engine_id][remote_tp_rank]
1421
+
1422
+ ### Register remote agent metadata
1423
+ if engine_id not in self._tp_size:
1424
+ self._tp_size[engine_id] = remote_tp_size
1425
+ if engine_id not in self._block_size:
1426
+ self._block_size[engine_id] = nixl_agent_meta.block_size
1427
+
1428
+ remote_agent_name = self.nixl_wrapper.add_remote_agent(
1429
+ nixl_agent_meta.agent_metadata
1430
+ )
1431
+
1432
+ # Handle tp_size>num_kv_heads: replicate KV cache.
1433
+ replicates_kv_cache = self.kv_topo.replicates_kv_cache(engine_id)
1434
+
1435
+ # Create dst descs and xfer side handles. TP workers have same #blocks
1436
+ # so we only register once per engine_id.
1437
+ # Example:
1438
+ # block_size_ratio > 1:
1439
+ # remote: | 0| 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|
1440
+ # local origin:| 0| 1| 8| 12|
1441
+ # local mapped:| 0| 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|
1442
+ block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(engine_id)
1443
+
1444
+ if engine_id not in self.dst_num_blocks:
1445
+ self.dst_num_blocks[engine_id] = nixl_agent_meta.num_blocks
1446
+
1447
+ # Keep track of remote agent kv caches base addresses.
1448
+ self.kv_caches_base_addr[engine_id] = nixl_agent_meta.kv_caches_base_addr
1449
+
1450
+ self._validate_remote_agent_handshake(nixl_agent_meta, remote_tp_size)
1451
+
1452
+ # Number of D TP workers reading from a single P TP worker. This is
1453
+ # 1 when P and D `--tensor-parallel-size` match.
1454
+ tp_ratio = self.kv_topo.tp_ratio_from_engine_id(engine_id)
1455
+
1456
+ ### Register remote agent memory regions
1457
+ blocks_data = []
1458
+ # With homogeneous TP, D pulls the whole kv cache from corresponding
1459
+ # rank. With heterogeneous TP, prepare the descriptors by splitting the
1460
+ # P KV cache along kv_head dim, of D worker's kv_head size (D>P).
1461
+ # Eg. PTP1 DTP2 => P0 KV:[block0-KV_0 | block0-KV_1..].
1462
+
1463
+ # Register all remote blocks, but only the corresponding kv heads.
1464
+ for i, base_addr in enumerate(nixl_agent_meta.kv_caches_base_addr):
1465
+ kv_block_len = self.get_backend_aware_kv_block_len(layer_idx=i)
1466
+ remote_kv_block_len = kv_block_len // block_size_ratio
1467
+ if block_size_ratio > 1:
1468
+ # using remote kv_block_len as transfer unit
1469
+ kv_block_len = remote_kv_block_len
1470
+ rank_offset = (
1471
+ self.tp_rank % tp_ratio * remote_kv_block_len
1472
+ if not replicates_kv_cache
1473
+ else 0
1474
+ )
1475
+ for block_id in range(nixl_agent_meta.num_blocks):
1476
+ block_offset = block_id * nixl_agent_meta.block_lens[i]
1477
+ # For each block, grab the heads chunk belonging to rank_i
1478
+ # of size remote_nheads // tp_ratio, which correspond to
1479
+ # self.block_len == remote_block_len//tp_ratio bytes.
1480
+ addr = base_addr + block_offset + rank_offset
1481
+ # (addr, len, device id)
1482
+ blocks_data.append((addr, kv_block_len, nixl_agent_meta.device_id))
1483
+
1484
+ if self.kv_topo.is_kv_layout_blocks_first:
1485
+ # With FlashInfer index V separately to allow head splitting.
1486
+ for block_id in range(nixl_agent_meta.num_blocks):
1487
+ block_offset = block_id * nixl_agent_meta.block_lens[i]
1488
+ addr = base_addr + block_offset + rank_offset
1489
+ v_addr = addr + nixl_agent_meta.block_lens[i] // 2
1490
+ blocks_data.append(
1491
+ (v_addr, kv_block_len, nixl_agent_meta.device_id)
1492
+ )
1493
+
1494
+ logger.debug(
1495
+ "Created %s blocks for dst engine %s with remote rank %s and local rank %s",
1496
+ len(blocks_data),
1497
+ engine_id,
1498
+ remote_tp_rank,
1499
+ self.tp_rank,
1500
+ )
1501
+
1502
+ # Register with NIXL.
1503
+ descs = self.nixl_wrapper.get_xfer_descs(blocks_data, self.nixl_memory_type)
1504
+ self.dst_xfer_side_handles[engine_id] = self.nixl_wrapper.prep_xfer_dlist(
1505
+ remote_agent_name, descs
1506
+ )
1507
+
1508
+ if block_size_ratio > 1:
1509
+ # when prefill with smaller block_size, we need to init a
1510
+ # new handler with same block_len to match
1511
+ self.src_xfer_side_handles[nixl_agent_meta.block_size] = (
1512
+ self.register_local_xfer_handler(nixl_agent_meta.block_size)
1513
+ )
1514
+
1515
+ return remote_agent_name
1516
+
1517
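A minimal sketch (hypothetical TP sizes and block length, no MLA or replication) of the offset math this method applies: with D tp_size 4 and P tp_size 2, tp_ratio is 2, so two D workers share one P worker and each reads its own head chunk of every remote block.

local_tp_size, remote_tp_size = 4, 2
tp_ratio = local_tp_size // remote_tp_size         # D workers per P worker
remote_block_len = 65536                           # bytes of one remote K (or V) block
local_kv_block_len = remote_block_len // tp_ratio  # D holds kv_heads // tp_ratio heads
for tp_rank in range(local_tp_size):
    p_remote_rank = tp_rank // tp_ratio            # which P worker this D rank reads from
    rank_offset = tp_rank % tp_ratio * local_kv_block_len
    # D ranks 0/2 read bytes [0, 32768), D ranks 1/3 read bytes [32768, 65536).
    print(tp_rank, p_remote_rank, rank_offset)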
+ def _validate_remote_agent_handshake(
1518
+ self, nixl_agent_meta: NixlAgentMetadata, remote_tp_size: int
1519
+ ):
1520
+ """
1521
+ Validate the remote agent handshake metadata ensuring the
1522
+ invariants hold true.
1523
+ """
1524
+ remote_engine_id = nixl_agent_meta.engine_id
1525
+
1526
+ assert self._tp_size[remote_engine_id] == remote_tp_size
1527
+ # TODO We may eventually want to skip enforcing the same attn backend.
1528
+ assert nixl_agent_meta.attn_backend_name == self.backend_name
1529
+
1530
+ tp_ratio = self.kv_topo.tp_ratio_from_engine_id(remote_engine_id)
1531
+ block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(
1532
+ remote_engine_id
1533
+ )
1534
+ assert tp_ratio > 0, "Decode TP cannot be smaller than prefill TP"
1535
+ assert not self._use_pallas or tp_ratio == 1, (
1536
+ "TPU (pallas_v1) DOES NOT support heterogeneous TP yet."
1537
+ )
1538
+ kv_cache_layout = (
1539
+ self.kv_cache_layout
1540
+ if not self.use_host_buffer
1541
+ else self.host_buffer_kv_cache_layout
1542
+ )
1543
+ if not self.use_mla and nixl_agent_meta.kv_cache_layout != kv_cache_layout:
1544
+ if (
1545
+ self.kv_transfer_config.enable_permute_local_kv
1546
+ and nixl_agent_meta.kv_cache_layout == "HND"
1547
+ ):
1548
+ logger.info(
1549
+ "Remote is HND and local is NHD, enabled additional permute "
1550
+ "on local device KV."
1551
+ )
1552
+ self.enable_permute_local_kv = True
1553
+ else:
1554
+ raise RuntimeError(
1555
+ "Heterogeneous TP expects same kv_cache_layout. "
1556
+ "Or enable experimental feature to use HND to NHD support by "
1557
+ "setting 'enable_permute_local_kv'=True in --kv-transfer-config."
1558
+ )
1559
+
1560
+ # Block len can only vary across layers when using MLA.
1561
+ remote_block_len = nixl_agent_meta.block_lens[0]
1562
+ if self.use_mla or self.kv_topo.is_kv_replicated(remote_engine_id):
1563
+ # With replicated KV cache, only the number of blocks can differ.
1564
+ for i in range(len(self.block_len_per_layer)):
1565
+ assert (
1566
+ self.block_len_per_layer[i] // block_size_ratio
1567
+ == nixl_agent_meta.block_lens[i]
1568
+ ), "KV cache sizes must match between P and D when replicated"
1569
+ else:
1570
+ # When MLA is not used, this is a list of the same block length
1571
+ for block_len in nixl_agent_meta.block_lens:
1572
+ assert block_len == remote_block_len, (
1573
+ "All remote layers must have the same block size"
1574
+ )
1575
+
1576
+ assert (
1577
+ remote_block_len
1578
+ == (self.block_len_per_layer[0] * tp_ratio) // block_size_ratio
1579
+ ), (
1580
+ "Remote P worker KV layer cache must be of shape [2, N, "
1581
+ "local_kv_heads*tp_ratio, block_size, head_dim] and same dtype."
1582
+ )
1583
+
1584
+ # TP workers have same #blocks.
1585
+ assert self.dst_num_blocks[remote_engine_id] == nixl_agent_meta.num_blocks
1586
+
1587
+ assert len(nixl_agent_meta.kv_caches_base_addr) == len(self.block_len_per_layer)
1588
+
1589
+ def sync_recved_kv_to_device(self, req_id: str, meta: ReqMeta):
1590
+ """copy recved kv from host buffer to device."""
1591
+ assert self.use_host_buffer
1592
+ assert self.copy_blocks is not None
1593
+
1594
+ local_block_ids = meta.local_physical_block_ids
1595
+ self.copy_blocks(
1596
+ self.host_xfer_buffers,
1597
+ self.device_kv_caches,
1598
+ local_block_ids,
1599
+ local_block_ids,
1600
+ "h2d",
1601
+ )
1602
+ if logger.isEnabledFor(logging.DEBUG):
1603
+ logger.debug(
1604
+ "synced recved kv of request[%s] to device kv buffer,"
1605
+ "local_block_ids: %s. ",
1606
+ req_id,
1607
+ ",".join(map(str, local_block_ids)),
1608
+ )
1609
+
1610
+ def save_kv_to_host(self, metadata: NixlConnectorMetadata):
1611
+ """copy kv from device to host buffer."""
1612
+ assert self.use_host_buffer
1613
+ assert self.copy_blocks is not None
1614
+
1615
+ for req_id, meta in metadata.reqs_to_save.items():
1616
+ meta.local_physical_block_ids = self._logical_to_kernel_block_ids(
1617
+ meta.local_block_ids
1618
+ )
1619
+ if logger.isEnabledFor(logging.DEBUG):
1620
+ logger.debug(
1621
+ "save_load_kv for request[%s] to host xfer buffer."
1622
+ "local_block_ids: %s. ",
1623
+ req_id,
1624
+ ",".join(map(str, meta.local_physical_block_ids)),
1625
+ )
1626
+ # blocking
1627
+ self.copy_blocks(
1628
+ self.device_kv_caches,
1629
+ self.host_xfer_buffers,
1630
+ meta.local_physical_block_ids,
1631
+ meta.local_physical_block_ids,
1632
+ "d2h",
1633
+ )
1634
+
1635
+ def permute_device_kv(self, block_ids: list[int]):
1636
+ """Transforms the layout of received KV cache blocks to the local format.
1637
+
1638
+ This method corrects layout mismatches from direct memory copies by
1639
+ permuting the tensor dimensions.
1640
+
1641
+ - **Source Layout:** `[num_blocks, n_kv_head, block_size, head_dim]`
1642
+ - **Target Layout:** `[num_blocks, block_size, n_kv_head, head_dim]`
1643
+
1644
+ Args:
1645
+ block_ids: A list of block IDs to update and permute.
1646
+
1647
+ Implementation:
1648
+ - x = blocks_to_update.reshape(src_shape) # view local kv with sender layout
1649
+ - permuted_blocks = x.permute(*inv_order) # transpose n_kv_heads, block_size
1650
+ - cache.index_copy_(0, indices, permuted_blocks) # copy permuted kv back
1651
+
1652
+ """
1653
+ split_k_and_v = self.kv_topo.split_k_and_v
1654
+ inv_order = [0, 2, 1, 3]
1655
+ sample_cache = list(self.device_kv_caches.values())[0][0]
1656
+ target_shape = list(sample_cache.shape)
1657
+ target_shape[0] = -1
1658
+ src_shape = tuple(target_shape[i] for i in inv_order)
1659
+ indices = torch.tensor(block_ids, device=sample_cache.device)
1660
+
1661
+ for _, cache_or_caches in self.device_kv_caches.items():
1662
+ cache_list = cache_or_caches if split_k_and_v else [cache_or_caches]
1663
+ for cache in cache_list:
1664
+ blocks_to_update = cache.index_select(0, indices)
1665
+ permuted_blocks = blocks_to_update.reshape(src_shape).permute(
1666
+ *inv_order
1667
+ )
1668
+ cache.index_copy_(0, indices, permuted_blocks)
1669
+
1670
+ def blocksize_post_process(self, block_ids_per_ratio: dict[float, list[list[int]]]):
1671
+ def _process_local_gt_remote(blocks_to_update, block_size_ratio):
1672
+ n_kv_heads, block_size, head_size = blocks_to_update.shape[1:]
1673
+ remote_block_size = block_size // block_size_ratio
1674
+ n_blocks = block_size_ratio
1675
+ # actual permute is to convert
1676
+ # for local blocksize > remote blocksize
1677
+ # ex: local blocksize = 16 tokens, remote blocksize = 4 tokens
1678
+ # local block[0] = remote block[0, 1, 2, 3]
1679
+ # remote is |h0-b0|h1-b0|h2-b0|h3-b0|h0-b1|h1-b1|h2-b1|h3-b1|...
1680
+ # local is |h0-b0..................|h1-b0..................|...
1681
+ # permute is to:
1682
+ # 1. view => view remote as n_blocks * remote_shape(H,remoteN,D)
1683
+ # 2. permute => (H, nblocks, remoteN, D)
1684
+ # 3. flatten => (H, localN, D)
1685
+ permuted_blocks = (
1686
+ blocks_to_update.reshape(
1687
+ -1, n_blocks, n_kv_heads, remote_block_size, head_size
1688
+ )
1689
+ .permute(0, 2, 1, 3, 4)
1690
+ .flatten(2, 3)
1691
+ )
1692
+ return permuted_blocks
1693
+
1694
+ if len(self.device_kv_caches) == 0:
1695
+ return
1696
+ split_k_and_v = not (
1697
+ self.use_mla or self._use_pallas or self.kv_topo.is_kv_layout_blocks_first
1698
+ )
1699
+ sample_cache = list(self.device_kv_caches.values())[0][0]
1700
+ for block_size_ratio, block_ids_list in block_ids_per_ratio.items():
1701
+ assert block_size_ratio > 1, "Only nP < nD supported currently."
1702
+ block_ids_list = [[item for sublist in block_ids_list for item in sublist]]
1703
+
1704
+ for block_ids in block_ids_list:
1705
+ indices = torch.tensor(block_ids, device=sample_cache.device)
1706
+
1707
+ for _, cache_or_caches in self.device_kv_caches.items():
1708
+ cache_list = cache_or_caches if split_k_and_v else [cache_or_caches]
1709
+ for cache in cache_list:
1710
+ blocks_to_update = cache.index_select(0, indices)
1711
+ # Because kv_cache always uses the original NHD layout as its
1712
+ # virtual shape, while its stride can be either HND or NHD at
1713
+ # initialization,
1714
+ # we first need to get the physical view of the tensor.
1715
+ permuted_blocks = _process_local_gt_remote(
1716
+ blocks_to_update.permute(0, 2, 1, 3), block_size_ratio
1717
+ ).permute(0, 2, 1, 3)
1718
+ cache.index_copy_(0, indices, permuted_blocks)
1719
+
1720
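A minimal sketch (hypothetical sizes) of the view/permute/flatten performed by _process_local_gt_remote above: a local HND block that was filled with `ratio` back-to-back remote-sized HND blocks is re-ordered so each head's tokens become contiguous again.

import torch

ratio, n_kv_heads, local_block_size, head_dim = 4, 2, 16, 8
remote_block_size = local_block_size // ratio
blocks = torch.randn(3, n_kv_heads, local_block_size, head_dim)  # 3 local HND blocks
fixed = (
    blocks.reshape(-1, ratio, n_kv_heads, remote_block_size, head_dim)  # remote layout view
    .permute(0, 2, 1, 3, 4)  # (blocks, H, ratio, remote_block_size, D)
    .flatten(2, 3)           # (blocks, H, local_block_size, D)
)
assert fixed.shape == blocks.shape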
+ def get_finished(self) -> tuple[set[str], set[str]]:
1721
+ """
1722
+ Get requests that are done sending or recving on this specific worker.
1723
+ The scheduler process (via the MultiprocExecutor) will use this output
1724
+ to track which workers are done.
1725
+ """
1726
+ done_sending = self._get_new_notifs()
1727
+ done_recving = self._pop_done_transfers(self._recving_transfers)
1728
+
1729
+ # add requests that skipped transfer to done_recving
1730
+ done_recving.update(self._failed_recv_reqs)
1731
+ self._failed_recv_reqs.clear()
1732
+
1733
+ if len(done_sending) > 0 or len(done_recving) > 0:
1734
+ logger.debug(
1735
+ "Rank %s, get_finished: %s requests done sending "
1736
+ "and %s requests done recving",
1737
+ self.tp_rank,
1738
+ len(done_sending),
1739
+ len(done_recving),
1740
+ )
1741
+
1742
+ block_ids_to_permute = []
1743
+ block_ids_for_blocksize_post_process = defaultdict(list)
1744
+ for req_id in done_recving:
1745
+ # clean up metadata for completed requests
1746
+ meta = self._recving_metadata.pop(req_id, None)
1747
+ assert meta is not None, f"{req_id} not found in recving_metadata list"
1748
+ if self.use_host_buffer:
1749
+ self.sync_recved_kv_to_device(req_id, meta)
1750
+ if self.enable_permute_local_kv:
1751
+ block_ids_to_permute += meta.local_physical_block_ids
1752
+
1753
+ # post processing for heteroblocksize
1754
+ block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(
1755
+ meta.remote_engine_id
1756
+ )
1757
+ if (
1758
+ not self.use_mla
1759
+ and block_size_ratio > 1
1760
+ and self.kv_cache_layout == "HND"
1761
+ ):
1762
+ block_ids_for_blocksize_post_process[block_size_ratio].append(
1763
+ meta.local_block_ids
1764
+ )
1765
+ self.blocksize_post_process(block_ids_for_blocksize_post_process)
1766
+ if len(block_ids_to_permute) > 0:
1767
+ self.permute_device_kv(block_ids_to_permute)
1768
+
1769
+ # Handle timeout to avoid stranding blocks on remote.
1770
+ now = time.perf_counter()
1771
+ while self._reqs_to_send:
1772
+ req_id, expires = next(iter(self._reqs_to_send.items()))
1773
+ # Sorted dict, oldest requests are put first so we can exit early.
1774
+ if now < expires:
1775
+ break
1776
+ count = self.consumer_notification_counts_by_req.pop(req_id, 0)
1777
+ logger.warning(
1778
+ "Releasing expired KV blocks for request %s which were "
1779
+ "retrieved by %d decode worker(s) within %d seconds.",
1780
+ req_id,
1781
+ count,
1782
+ envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT,
1783
+ )
1784
+ self._reqs_to_process.remove(req_id)
1785
+ del self._reqs_to_send[req_id]
1786
+ done_sending.add(req_id)
1787
+
1788
+ return done_sending, done_recving
1789
+
1790
+ def _get_new_notifs(self) -> set[str]:
1791
+ """
1792
+ Get req_ids which got a remote xfer message. When multiple consumers
1793
+ are reading from the same producer (heterogeneous TP scenario), wait
1794
+ for all consumers to be done pulling.
1795
+ """
1796
+ notified_req_ids: set[str] = set()
1797
+ for notifs in self.nixl_wrapper.get_new_notifs().values():
1798
+ for notif in notifs:
1799
+ req_id, tp_ratio = notif.decode("utf-8").rsplit(":", 1)
1800
+ if (
1801
+ req_id not in self._reqs_to_send
1802
+ and req_id not in self._reqs_to_process
1803
+ ):
1804
+ logger.error(
1805
+ "Potentially invalid KV blocks for "
1806
+ "unrecognized request %s were retrieved by "
1807
+ "a decode worker. They may have expired.",
1808
+ req_id,
1809
+ )
1810
+ continue
1811
+
1812
+ self.consumer_notification_counts_by_req[req_id] += 1
1813
+ # Wait all consumers (D) to be done reading before freeing.
1814
+ if self.consumer_notification_counts_by_req[req_id] == int(tp_ratio):
1815
+ notified_req_ids.add(req_id)
1816
+ del self.consumer_notification_counts_by_req[req_id]
1817
+ self._reqs_to_process.remove(req_id)
1818
+ self._reqs_to_send.pop(req_id, None)
1819
+ return notified_req_ids
1820
+
1821
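The notification payload consumed here is simply "<request_id>:<tp_ratio>"; a minimal sketch (hypothetical request id) of how it is parsed and counted:

notif = b"req-123:2"  # sent by a D worker when its read completes
req_id, tp_ratio = notif.decode("utf-8").rsplit(":", 1)
assert (req_id, int(tp_ratio)) == ("req-123", 2)
# The P worker frees the request's blocks only after int(tp_ratio) such notifications.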
+ def _pop_done_transfers(
1822
+ self, transfers: dict[str, list[tuple[int, float]]]
1823
+ ) -> set[str]:
1824
+ """
1825
+ Pop completed xfers by checking for DONE state.
1826
+ Args:
1827
+ transfers: dict of req_id -> list[running_xfer]
1828
+ Returns:
1829
+ set of req_ids that have all done xfers
1830
+ """
1831
+ done_req_ids: set[str] = set()
1832
+ for req_id, handles in list(transfers.items()):
1833
+ in_progress = False
1834
+ for handle, xfer_start_time in handles:
1835
+ try:
1836
+ xfer_state = self.nixl_wrapper.check_xfer_state(handle)
1837
+ if xfer_state == "DONE":
1838
+ # Get telemetry from NIXL
1839
+ res = self.nixl_wrapper.get_xfer_telemetry(handle)
1840
+ self.xfer_stats.record_transfer(res)
1841
+ self.nixl_wrapper.release_xfer_handle(handle)
1842
+ elif xfer_state == "PROC":
1843
+ in_progress = True
1844
+ continue
1845
+ else:
1846
+ logger.error(
1847
+ "NIXL transfer failed for request %s with state "
1848
+ "%s. Marking blocks as invalid.",
1849
+ req_id,
1850
+ xfer_state,
1851
+ )
1852
+ self._handle_failed_transfer(req_id, handle)
1853
+ in_progress = False
1854
+ except Exception:
1855
+ logger.exception(
1856
+ "NIXL transfer exception for request %s. "
1857
+ "Marking blocks as invalid.",
1858
+ req_id,
1859
+ )
1860
+ self._handle_failed_transfer(req_id, handle)
1861
+ in_progress = False
1862
+
1863
+ if not in_progress:
1864
+ done_req_ids.add(req_id)
1865
+ del transfers[req_id]
1866
+ return done_req_ids
1867
+
1868
+ def _handle_failed_transfer(self, req_id: str, handle: int):
1869
+ """
1870
+ Handle a failed transfer by marking all (logical) blocks as invalid and
1871
+ recording the failure.
1872
+
1873
+ Args:
1874
+ req_id: The request ID.
1875
+ handle: The transfer handle.
1876
+ """
1877
+ if meta := self._recving_metadata.pop(req_id, None):
1878
+ self._invalid_block_ids.update(meta.local_block_ids)
1879
+ self._recving_metadata.pop(req_id, None)
1880
+ self.nixl_wrapper.release_xfer_handle(handle)
1881
+ self.xfer_stats.record_failed_transfer()
1882
+
1883
+ def start_load_kv(self, metadata: NixlConnectorMetadata):
1884
+ """
1885
+ Start loading by triggering non-blocking nixl_xfer.
1886
+ We check for these transfers to complete in each step().
1887
+ """
1888
+ for req_id, meta in metadata.reqs_to_recv.items():
1889
+ meta.local_physical_block_ids = self._logical_to_kernel_block_ids(
1890
+ meta.local_block_ids
1891
+ )
1892
+ meta.remote_block_ids = self._logical_to_kernel_block_ids(
1893
+ meta.remote_block_ids
1894
+ )
1895
+ remote_engine_id = meta.remote_engine_id
1896
+ logger.debug(
1897
+ "start_load_kv for request %s from remote engine %s. "
1898
+ "Num local_block_ids: %s. Num remote_block_ids: %s. ",
1899
+ req_id,
1900
+ remote_engine_id,
1901
+ len(meta.local_physical_block_ids),
1902
+ len(meta.remote_block_ids),
1903
+ )
1904
+ # always store metadata for failure recovery
1905
+ self._recving_metadata[req_id] = meta
1906
+ if remote_engine_id not in self._remote_agents:
1907
+ # Initiate handshake with remote engine to exchange metadata.
1908
+ with self._handshake_lock:
1909
+ if remote_engine_id not in self._remote_agents:
1910
+ self._background_nixl_handshake(req_id, remote_engine_id, meta)
1911
+ continue
1912
+
1913
+ # Handshake already completed, start async read xfer.
1914
+ self._read_blocks_for_req(req_id, meta)
1915
+
1916
+ # Start transfers for requests whose handshakes have now finished.
1917
+ while not self._ready_requests.empty():
1918
+ self._read_blocks_for_req(*self._ready_requests.get_nowait())
1919
+
1920
+ # Keep around the requests that have been part of a batch. This is
1921
+ # needed because async scheduling widens the gap between the
1922
+ # moment at which a request's expiration is set (P side) and the moment
1923
+ # at which its blocks are read from D. As P can now more easily lag behind D
1924
+ # while processing the next batch, we make sure to only set an
1925
+ # expiration for requests that have not been read from D yet.
1926
+ for req_id in metadata.reqs_in_batch:
1927
+ self._reqs_to_process.add(req_id)
1928
+
1929
+ # Remove all requests that are not to be processed (eg aborted).
1930
+ for req_id in metadata.reqs_not_processed:
1931
+ self._reqs_to_process.discard(req_id)
1932
+ # We should never get an abort after setting an expiry timer
1933
+ assert req_id not in self._reqs_to_send
1934
+
1935
+ # Add to requests that are waiting to be read and track expiration.
1936
+ for req_id, expiration_time in metadata.reqs_to_send.items():
1937
+ if req_id in self._reqs_to_process:
1938
+ self._reqs_to_send[req_id] = expiration_time
1939
+
1940
+ def _read_blocks_for_req(self, req_id: str, meta: ReqMeta):
1941
+ logger.debug(
1942
+ "Remote agent %s available, calling _read_blocks for req %s",
1943
+ meta.remote_engine_id,
1944
+ req_id,
1945
+ )
1946
+ self._read_blocks(
1947
+ request_id=req_id,
1948
+ dst_engine_id=meta.remote_engine_id,
1949
+ local_block_ids=meta.local_physical_block_ids,
1950
+ remote_block_ids=meta.remote_block_ids,
1951
+ )
1952
+
1953
+ def _read_blocks(
1954
+ self,
1955
+ local_block_ids: list[int],
1956
+ remote_block_ids: list[int],
1957
+ dst_engine_id: str,
1958
+ request_id: str,
1959
+ ):
1960
+ block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(dst_engine_id)
1961
+ if block_size_ratio > 1:
1962
+ local_block_ids = self.get_mapped_blocks(
1963
+ np.asarray(local_block_ids), block_size_ratio
1964
+ )
1965
+ if len(local_block_ids) > len(remote_block_ids):
1966
+ # NOTE:
1967
+ # get_mapped_blocks always expands block_ids by a factor of block_size_ratio.
1968
+ # ex:
1969
+ # prefill block_ids with block_size as 4:
1970
+ # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
1971
+ # Local decode block_ids with block_size as 16: [1, 2, 3]
1972
+ # expand decode block_ids with get_mapped_blocks from [1, 2, 3] to
1973
+ # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
1974
+ # Then we clip local to align with prefill
1975
+ # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] to
1976
+ # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
1977
+ local_block_ids = local_block_ids[: len(remote_block_ids)]
1978
+ # NOTE(rob): having the staging blocks be on the READER side is
1979
+ # not going to work well, since we will have to rearrange tensors
1980
+ # after we detect the txn is complete (which means we cannot make the
1981
+ # read txn async easily). If we want to make "READ" happen cleanly,
1982
+ # then we will need to have the staging blocks on the remote side.
1983
+
1984
+ # NOTE(rob): according to nvidia the staging blocks are used to
1985
+ # saturate IB with heterogeneous TP sizes. We should remove the staging
1986
+ # blocks until we are ready.
1987
+
1988
+ # Number of D TP workers that will read from dst P. Propagate tp_ratio
1989
+ # on notification so that dst worker can wait before freeing blocks.
1990
+ tp_ratio = self.kv_topo.tp_ratio_from_engine_id(dst_engine_id)
1991
+ notif_id = f"{request_id}:{tp_ratio}".encode()
1992
+
1993
+ # Full prefix cache hit: do not need to read remote blocks,
1994
+ # just notify P worker that we have the blocks we need.
1995
+ num_local_blocks = len(local_block_ids)
1996
+ if num_local_blocks == 0:
1997
+ remote_rank = self.kv_topo.get_target_remote_rank_from_engine_id(
1998
+ dst_engine_id
1999
+ )
2000
+ agent_name = self._remote_agents[dst_engine_id][remote_rank]
2001
+ try:
2002
+ self.nixl_wrapper.send_notif(agent_name, notif_msg=notif_id)
2003
+ except Exception:
2004
+ logger.exception(
2005
+ "NIXL send_notif failed for request %s: "
2006
+ "P worker blocks will be freed after timeout. "
2007
+ "This may indicate network issues.",
2008
+ request_id,
2009
+ )
2010
+ self.xfer_stats.record_failed_notification()
2011
+ return
2012
+
2013
+ # Partial prefix cache hit: just read uncomputed blocks.
2014
+ num_remote_blocks = len(remote_block_ids)
2015
+ assert num_local_blocks <= num_remote_blocks
2016
+ if num_local_blocks < num_remote_blocks:
2017
+ remote_block_ids = remote_block_ids[-num_local_blocks:]
2018
+
2019
+ # Get side handles.
2020
+ remote_block_size = self.kv_topo.remote_block_size[dst_engine_id]
2021
+ local_xfer_side_handle = self.src_xfer_side_handles.get(
2022
+ remote_block_size, self.src_xfer_side_handle
2023
+ )
2024
+ remote_xfer_side_handle = self.dst_xfer_side_handles[dst_engine_id]
2025
+
2026
+ # NOTE (nicolo) With homogeneous TP, each TP worker loads KV from
2027
+ # corresponding rank. With heterogeneous TP, fixing D>P, the D tp
2028
+ # workers will issue xfers to parts of the P worker remote kv caches.
2029
+
2030
+ # Get descs ids.
2031
+ local_block_descs_ids: np.ndarray
2032
+ remote_block_descs_ids: np.ndarray
2033
+
2034
+ if not self.block_window_per_layer:
2035
+ # Default case: assume global attention
2036
+ remote_block_descs_ids = self._get_block_descs_ids(
2037
+ dst_engine_id,
2038
+ remote_block_ids,
2039
+ )
2040
+ local_block_descs_ids = self._get_block_descs_ids(
2041
+ self.engine_id,
2042
+ local_block_ids,
2043
+ block_size_ratio=block_size_ratio,
2044
+ )
2045
+ else:
2046
+ # TODO(mgoin): remove this once we have hybrid memory allocator
2047
+ # Optimization for models with local attention (Llama 4)
2048
+ local_descs_list = []
2049
+ remote_descs_list = []
2050
+ for layer_idx, block_window in enumerate(self.block_window_per_layer):
2051
+ # For each layer:
2052
+ if block_window is None:
2053
+ # If not chunked, we just use the
2054
+ # full block lists (global attention)
2055
+ layer_local_block_ids = local_block_ids
2056
+ layer_remote_block_ids = remote_block_ids
2057
+ else:
2058
+ # If chunked, get the last block_window blocks
2059
+ layer_local_block_ids = local_block_ids[-block_window:]
2060
+ layer_remote_block_ids = remote_block_ids[-block_window:]
2061
+
2062
+ # Get descs ids for the layer.
2063
+ layer_local_desc_ids = self._get_block_descs_ids(
2064
+ self.engine_id,
2065
+ layer_local_block_ids,
2066
+ layer_idx,
2067
+ block_size_ratio=block_size_ratio,
2068
+ )
2069
+ layer_remote_desc_ids = self._get_block_descs_ids(
2070
+ dst_engine_id,
2071
+ layer_remote_block_ids,
2072
+ layer_idx,
2073
+ )
2074
+
2075
+ local_descs_list.append(layer_local_desc_ids)
2076
+ remote_descs_list.append(layer_remote_desc_ids)
2077
+
2078
+ local_block_descs_ids = np.concatenate(local_descs_list)
2079
+ remote_block_descs_ids = np.concatenate(remote_descs_list)
2080
+
2081
+ assert len(local_block_descs_ids) == len(remote_block_descs_ids)
2082
+
2083
+ # Prepare transfer with Nixl.
2084
+ handle = None
2085
+ try:
2086
+ handle = self.nixl_wrapper.make_prepped_xfer(
2087
+ "READ",
2088
+ local_xfer_side_handle,
2089
+ local_block_descs_ids,
2090
+ remote_xfer_side_handle,
2091
+ remote_block_descs_ids,
2092
+ notif_msg=notif_id,
2093
+ )
2094
+
2095
+ # Begin async xfer.
2096
+ self.nixl_wrapper.transfer(handle)
2097
+
2098
+ # Use handle to check completion in future step().
2099
+ self._recving_transfers[request_id].append((handle, time.perf_counter()))
2100
+ except Exception:
2101
+ logger.exception(
2102
+ "NIXL transfer setup/initiation failed for request %s. "
2103
+ "Marking blocks as invalid.",
2104
+ request_id,
2105
+ )
2106
+ # mark all (logical) blocks for this request as invalid
2107
+ if meta := self._recving_metadata.get(request_id):
2108
+ self._invalid_block_ids.update(meta.local_block_ids)
2109
+ self.xfer_stats.record_failed_transfer()
2110
+ if handle is not None:
2111
+ self.nixl_wrapper.release_xfer_handle(handle)
2112
+ self._failed_recv_reqs.add(request_id)
2113
+
2114
+ def get_mapped_blocks(self, block_ids, block_size_ratio):
2115
+ """
2116
+ Calculates the new set of block IDs by mapping every element
2117
+ in the (potentially sparse) input array.
2118
+ Example: block_ids=[0, 2], block_size_ratio=2
2119
+ get_mapped_blocks 0 1 [2 3] 4 5
2120
+ # remote is |h0-b0|h1-b0||h0-b1|h1-b1||h0-b2|h1-b2||
2121
+ # local is |h0-b0......||h1-b0......||h2-b0........
2122
+ local_block_ids 0 [1] 2
2123
+ """
2124
+ if block_ids.size == 0:
2125
+ return np.array([], dtype=np.int64)
2126
+
2127
+ start_ids = block_ids * block_size_ratio
2128
+ offsets = np.arange(block_size_ratio)
2129
+ mapped_2d = start_ids[:, None] + offsets[None, :]
2130
+
2131
+ return mapped_2d.flatten().astype(np.int64)
2132
+
2133
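A minimal runnable sketch of get_mapped_blocks for block_ids=[0, 2] and block_size_ratio=2, matching the docstring example above:

import numpy as np

block_ids, block_size_ratio = np.array([0, 2]), 2
start_ids = block_ids * block_size_ratio
mapped = start_ids[:, None] + np.arange(block_size_ratio)[None, :]
print(mapped.flatten())  # [0 1 4 5]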
+ def _get_block_descs_ids(
2134
+ self,
2135
+ engine_id: str,
2136
+ block_ids: list[int],
2137
+ layer_idx: int | None = None,
2138
+ block_size_ratio: float | None = None,
2139
+ ) -> np.ndarray:
2140
+ """
2141
+ Get the descs ids for a set of block ids.
2142
+ If layer_idx is provided, we use the region_ids for the given layer.
2143
+ Otherwise, we use all regions.
2144
+ """
2145
+ if layer_idx is None:
2146
+ region_ids = np.arange(self.num_regions)
2147
+ else:
2148
+ assert layer_idx < self.num_layers
2149
+ if self.num_layers < self.num_regions:
2150
+ # If we have more regions than layers, we assume that
2151
+ # the regions are organized as [K0, V0, K1, V1, ...]
2152
+ # and we select K_i and V_i
2153
+ assert 2 * self.num_layers == self.num_regions
2154
+ region_ids = np.arange(2 * layer_idx, 2 * layer_idx + 2)
2155
+ else:
2156
+ # Otherwise, we assume we have MLA and select i-th layer
2157
+ assert self.num_layers == self.num_regions
2158
+ region_ids = np.arange(layer_idx, layer_idx + 1)
2159
+
2160
+ num_blocks = self.dst_num_blocks[engine_id]
2161
+ if block_size_ratio is not None:
2162
+ num_blocks = int(num_blocks * block_size_ratio)
2163
+
2164
+ # Compute the desc ids for each block.
2165
+ region_ids = region_ids[:, None]
2166
+ block_ids = np.array(block_ids)[None, :]
2167
+ descs_ids = region_ids * num_blocks + block_ids
2168
+ return descs_ids.flatten()
2169
+
2170
+ def _logical_to_kernel_block_ids(self, block_ids: list[int]) -> list[int]:
2171
+ """
2172
+ Convert logical block ids to kernel physical block ids.
2173
+ This is required when the logical block size (the one set by the user)
2174
+ does not match the one required by the attn backend.
2175
+ """
2176
+ if self._physical_blocks_per_logical_kv_block == 1:
2177
+ # Noop when physical and logical block sizes are the same
2178
+ return block_ids
2179
+ block_ids_np = np.array(block_ids)
2180
+ block_arange = np.arange(0, self._physical_blocks_per_logical_kv_block).reshape(
2181
+ 1, -1
2182
+ )
2183
+ return BlockTable.map_to_kernel_blocks(
2184
+ block_ids_np, self._physical_blocks_per_logical_kv_block, block_arange
2185
+ ).tolist()
2186
+
2187
+ def get_backend_aware_kv_block_len(self, layer_idx: int):
2188
+ """
2189
+ Get the block length for one K/V element (K and V have the same size).
2190
+
2191
+ For FA and other backends, this is equal to the length of the whole
2192
+ block, as K and V are in separate regions.
2193
+ For FlashInfer, this is half the length of the whole block, as K and V
2194
+ share the same region.
2195
+ """
2196
+ if self.kv_topo.is_kv_layout_blocks_first:
2197
+ # For indexing only half (either just the K or V part).
2198
+ block_len = self.block_len_per_layer[layer_idx] // 2
2199
+ else:
2200
+ block_len = self.block_len_per_layer[layer_idx]
2201
+ return block_len
2202
+
2203
+ def get_kv_connector_stats(self) -> KVConnectorStats | None:
2204
+ """
2205
+ Get the KV transfer stats for the connector.
2206
+ """
2207
+ # Clear stats for next iteration
2208
+ if not self.xfer_stats.is_empty():
2209
+ return self.xfer_stats.clone_and_reset()
2210
+ return None
2211
+
2212
+ def get_block_ids_with_load_errors(self) -> set[int]:
2213
+ """
2214
+ Return and clear the set of block IDs that failed to load.
2215
+
2216
+ This is called by the scheduler to identify blocks that need
2217
+ to be retried after a NIXL transfer failure.
2218
+ """
2219
+ result = self._invalid_block_ids
2220
+ self._invalid_block_ids = set()
2221
+ return result
2222
+
2223
+ def __del__(self):
2224
+ self.shutdown()
2225
+
2226
+ def shutdown(self):
2227
+ """Shutdown the connector worker."""
2228
+ self._handshake_initiation_executor.shutdown(wait=False)
2229
+ for handles in self._recving_transfers.values():
2230
+ for handle, _ in handles:
2231
+ self.nixl_wrapper.release_xfer_handle(handle)
2232
+ self._recving_transfers.clear()
2233
+ if self.src_xfer_side_handle:
2234
+ self.nixl_wrapper.release_dlist_handle(self.src_xfer_side_handle)
2235
+ self.src_xfer_side_handle = 0
2236
+ for dst_xfer_side_handle in self.dst_xfer_side_handles.values():
2237
+ self.nixl_wrapper.release_dlist_handle(dst_xfer_side_handle)
2238
+ self.dst_xfer_side_handles.clear()
2239
+ for remote_agents in self._remote_agents.values():
2240
+ for agent_name in remote_agents.values():
2241
+ self.nixl_wrapper.remove_remote_agent(agent_name)
2242
+ self._remote_agents.clear()
2243
+ for desc in self._registered_descs:
2244
+ self.nixl_wrapper.deregister_memory(desc)
2245
+ self._registered_descs.clear()
2246
+
2247
+
2248
+ @contextlib.contextmanager
2249
+ def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]:
2250
+ """Context manager for a ZMQ socket"""
2251
+
2252
+ if socket_type not in (zmq.ROUTER, zmq.REQ):
2253
+ raise ValueError(f"Unexpected socket type: {socket_type}")
2254
+
2255
+ ctx: zmq.Context | None = None
2256
+ try:
2257
+ ctx = zmq.Context() # type: ignore[attr-defined]
2258
+ yield make_zmq_socket(
2259
+ ctx=ctx, path=addr, socket_type=socket_type, bind=socket_type == zmq.ROUTER
2260
+ )
2261
+ finally:
2262
+ if ctx is not None:
2263
+ ctx.destroy(linger=0)
2264
+
2265
+
2266
+ @dataclass
2267
+ class NixlKVConnectorStats(KVConnectorStats):
2268
+ """Container for transfer performance metrics"""
2269
+
2270
+ def __post_init__(self):
2271
+ if not self.data:
2272
+ # Empty container init, no data is passed in.
2273
+ self.reset()
2274
+
2275
+ def reset(self):
2276
+ # Must be serializable
2277
+ self.data: dict[str, list[float]] = {
2278
+ "transfer_duration": [],
2279
+ "post_duration": [],
2280
+ "bytes_transferred": [],
2281
+ "num_descriptors": [],
2282
+ "num_failed_transfers": [],
2283
+ "num_failed_notifications": [],
2284
+ }
2285
+
2286
+ def record_transfer(self, res: nixlXferTelemetry):
2287
+ # Keep metrics units consistent with rest of the code: time us->s
2288
+ self.data["transfer_duration"].append(res.xferDuration / 1e6)
2289
+ self.data["post_duration"].append(res.postDuration / 1e6)
2290
+ self.data["bytes_transferred"].append(res.totalBytes)
2291
+ self.data["num_descriptors"].append(res.descCount)
2292
+
2293
+ def record_failed_transfer(self):
2294
+ """Record a failed NIXL transfer operation."""
2295
+ self.data["num_failed_transfers"].append(1.0)
2296
+
2297
+ def record_failed_notification(self):
2298
+ """Record a failed NIXL notification (send_notif)."""
2299
+ self.data["num_failed_notifications"].append(1.0)
2300
+
2301
+ def clone_and_reset(self) -> "NixlKVConnectorStats":
2302
+ old = copy.copy(self)
2303
+ self.reset()
2304
+ return old
2305
+
2306
+ def is_empty(self) -> bool:
2307
+ return self.num_successful_transfers == 0
2308
+
2309
+ def aggregate(self, other: KVConnectorStats) -> KVConnectorStats:
2310
+ if not other.is_empty():
2311
+ for k, v in other.data.items():
2312
+ accumulator = self.data[k]
2313
+ assert isinstance(accumulator, list)
2314
+ accumulator.extend(v)
2315
+ return self
2316
+
2317
+ def reduce(self) -> dict[str, int | float]:
2318
+ # Compute compact representative stats suitable for CLI logging
2319
+ if self.is_empty():
2320
+ return {
2321
+ "Num successful transfers": 0,
2322
+ "Avg xfer time (ms)": 0,
2323
+ "P90 xfer time (ms)": 0,
2324
+ "Avg post time (ms)": 0,
2325
+ "P90 post time (ms)": 0,
2326
+ "Avg MB per transfer": 0,
2327
+ "Throughput (MB/s)": 0,
2328
+ "Avg number of descriptors": 0,
2329
+ }
2330
+
2331
+ xfer_time = np.asarray(self.data["transfer_duration"])
2332
+ post_time = np.asarray(self.data["post_duration"])
2333
+ # Convert to MB for CLI logging.
2334
+ mb = np.asarray(self.data["bytes_transferred"]) / 2**20
2335
+ descs = np.asarray(self.data["num_descriptors"], dtype=np.uint32)
2336
+ n = len(descs)
2337
+ assert n == self.num_successful_transfers
2338
+
2339
+ total_mb = mb.sum()
2340
+ avg_mb = total_mb / n
2341
+
2342
+ total_time_seconds = xfer_time.sum()
2343
+ throughput_mb_s = total_mb / total_time_seconds
2344
+
2345
+ return {
2346
+ "Num successful transfers": n,
2347
+ "Avg xfer time (ms)": round(xfer_time.mean() * 1e3, 3),
2348
+ "P90 xfer time (ms)": round(np.percentile(xfer_time, 90).item() * 1e3, 3),
2349
+ "Avg post time (ms)": round(post_time.mean() * 1e3, 3),
2350
+ "P90 post time (ms)": round(np.percentile(post_time, 90).item() * 1e3, 3),
2351
+ "Avg MB per transfer": round(avg_mb, 3),
2352
+ "Throughput (MB/s)": round(throughput_mb_s, 3),
2353
+ "Avg number of descriptors": round(descs.mean(), 1),
2354
+ }
2355
+
2356
+ @property
2357
+ def num_successful_transfers(self) -> int:
2358
+ return len(self.data["transfer_duration"])
2359
+
2360
+
2361
+ class NixlPromMetrics(KVConnectorPromMetrics):
2362
+ def __init__(
2363
+ self,
2364
+ vllm_config: VllmConfig,
2365
+ metric_types: dict[type[PromMetric], type[PromMetricT]],
2366
+ labelnames: list[str],
2367
+ per_engine_labelvalues: dict[int, list[object]],
2368
+ ):
2369
+ super().__init__(vllm_config, metric_types, labelnames, per_engine_labelvalues)
2370
+
2371
+ buckets = [
2372
+ 0.001,
2373
+ 0.005,
2374
+ 0.01,
2375
+ 0.025,
2376
+ 0.05,
2377
+ 0.075,
2378
+ 0.1,
2379
+ 0.2,
2380
+ 0.3,
2381
+ 0.5,
2382
+ 0.75,
2383
+ 1.0,
2384
+ 5.0,
2385
+ ]
2386
+ nixl_histogram_xfer_time = self._histogram_cls(
2387
+ name="vllm:nixl_xfer_time_seconds",
2388
+ documentation="Histogram of transfer duration for NIXL KV Cache transfers.",
2389
+ buckets=buckets[1:],
2390
+ labelnames=labelnames,
2391
+ )
2392
+ self.nixl_histogram_xfer_time = self.make_per_engine(nixl_histogram_xfer_time)
2393
+ nixl_histogram_post_time = self._histogram_cls(
2394
+ name="vllm:nixl_post_time_seconds",
2395
+ documentation="Histogram of transfer post time for NIXL KV"
2396
+ " Cache transfers.",
2397
+ buckets=buckets,
2398
+ labelnames=labelnames,
2399
+ )
2400
+ self.nixl_histogram_post_time = self.make_per_engine(nixl_histogram_post_time)
2401
+ # log-uniform bucket bounds from 2 KiB to 8 GiB (2**11 .. 2**33 bytes)
2402
+ buckets = [2 ** (10 + i) for i in range(1, 25, 2)]
2403
+ nixl_histogram_bytes_transferred = self._histogram_cls(
2404
+ name="vllm:nixl_bytes_transferred",
2405
+ documentation="Histogram of bytes transferred per NIXL KV Cache transfers.",
2406
+ buckets=buckets,
2407
+ labelnames=labelnames,
2408
+ )
2409
+ self.nixl_histogram_bytes_transferred = self.make_per_engine(
2410
+ nixl_histogram_bytes_transferred
2411
+ )
2412
+ buckets = [
2413
+ 10,
2414
+ 20,
2415
+ 30,
2416
+ 50,
2417
+ 75,
2418
+ 100,
2419
+ 200,
2420
+ 400,
2421
+ 1000,
2422
+ 2000,
2423
+ 4000,
2424
+ 10000,
2425
+ 20000,
2426
+ 50000,
2427
+ ]
2428
+ nixl_histogram_num_descriptors = self._histogram_cls(
2429
+ name="vllm:nixl_num_descriptors",
2430
+ documentation="Histogram of number of descriptors per NIXL"
2431
+ " KV Cache transfers.",
2432
+ buckets=buckets,
2433
+ labelnames=labelnames,
2434
+ )
2435
+ self.nixl_histogram_num_descriptors = self.make_per_engine(
2436
+ nixl_histogram_num_descriptors
2437
+ )
2438
+ counter_nixl_num_failed_transfers = self._counter_cls(
2439
+ name="vllm:nixl_num_failed_transfers",
2440
+ documentation="Number of failed NIXL KV Cache transfers.",
2441
+ labelnames=labelnames,
2442
+ )
2443
+ self.counter_nixl_num_failed_transfers = self.make_per_engine(
2444
+ counter_nixl_num_failed_transfers
2445
+ )
2446
+ counter_nixl_num_failed_notifications = self._counter_cls(
2447
+ name="vllm:nixl_num_failed_notifications",
2448
+ documentation="Number of failed NIXL KV Cache notifications.",
2449
+ labelnames=labelnames,
2450
+ )
2451
+ self.counter_nixl_num_failed_notifications = self.make_per_engine(
2452
+ counter_nixl_num_failed_notifications
2453
+ )
2454
+
2455
+ def observe(self, transfer_stats_data: dict[str, Any], engine_idx: int = 0):
2456
+ for prom_obj, list_item_key in zip(
2457
+ [
2458
+ self.nixl_histogram_xfer_time,
2459
+ self.nixl_histogram_post_time,
2460
+ self.nixl_histogram_bytes_transferred,
2461
+ self.nixl_histogram_num_descriptors,
2462
+ ],
2463
+ [
2464
+ "transfer_duration",
2465
+ "post_duration",
2466
+ "bytes_transferred",
2467
+ "num_descriptors",
2468
+ ],
2469
+ ):
2470
+ for list_item in transfer_stats_data[list_item_key]:
2471
+ prom_obj[engine_idx].observe(list_item)
2472
+ for counter_obj, counter_item_key in zip(
2473
+ [
2474
+ self.counter_nixl_num_failed_transfers,
2475
+ self.counter_nixl_num_failed_notifications,
2476
+ ],
2477
+ ["num_failed_transfers", "num_failed_notifications"],
2478
+ ):
2479
+ for list_item in transfer_stats_data[counter_item_key]:
2480
+ counter_obj[engine_idx].inc(list_item)