vllm_cpu_avx512bf16-0.14.0-cp313-cp313-manylinux_2_28_x86_64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (1712)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +225 -0
  3. vllm/_aiter_ops.py +1511 -0
  4. vllm/_bc_linter.py +54 -0
  5. vllm/_custom_ops.py +3206 -0
  6. vllm/_ipex_ops.py +445 -0
  7. vllm/_version.py +34 -0
  8. vllm/assets/__init__.py +0 -0
  9. vllm/assets/audio.py +43 -0
  10. vllm/assets/base.py +40 -0
  11. vllm/assets/image.py +62 -0
  12. vllm/assets/video.py +149 -0
  13. vllm/attention/__init__.py +0 -0
  14. vllm/attention/layer.py +913 -0
  15. vllm/attention/utils/__init__.py +0 -0
  16. vllm/attention/utils/kv_sharing_utils.py +33 -0
  17. vllm/attention/utils/kv_transfer_utils.py +60 -0
  18. vllm/beam_search.py +88 -0
  19. vllm/benchmarks/__init__.py +0 -0
  20. vllm/benchmarks/datasets.py +3277 -0
  21. vllm/benchmarks/latency.py +172 -0
  22. vllm/benchmarks/lib/__init__.py +3 -0
  23. vllm/benchmarks/lib/endpoint_request_func.py +777 -0
  24. vllm/benchmarks/lib/ready_checker.py +72 -0
  25. vllm/benchmarks/lib/utils.py +79 -0
  26. vllm/benchmarks/mm_processor.py +363 -0
  27. vllm/benchmarks/serve.py +1761 -0
  28. vllm/benchmarks/startup.py +321 -0
  29. vllm/benchmarks/sweep/__init__.py +0 -0
  30. vllm/benchmarks/sweep/cli.py +41 -0
  31. vllm/benchmarks/sweep/param_sweep.py +159 -0
  32. vllm/benchmarks/sweep/plot.py +675 -0
  33. vllm/benchmarks/sweep/plot_pareto.py +393 -0
  34. vllm/benchmarks/sweep/serve.py +450 -0
  35. vllm/benchmarks/sweep/serve_sla.py +459 -0
  36. vllm/benchmarks/sweep/server.py +114 -0
  37. vllm/benchmarks/sweep/sla_sweep.py +138 -0
  38. vllm/benchmarks/sweep/utils.py +4 -0
  39. vllm/benchmarks/throughput.py +946 -0
  40. vllm/collect_env.py +857 -0
  41. vllm/compilation/__init__.py +0 -0
  42. vllm/compilation/activation_quant_fusion.py +214 -0
  43. vllm/compilation/backends.py +840 -0
  44. vllm/compilation/base_static_graph.py +57 -0
  45. vllm/compilation/caching.py +196 -0
  46. vllm/compilation/collective_fusion.py +1224 -0
  47. vllm/compilation/compiler_interface.py +639 -0
  48. vllm/compilation/counter.py +50 -0
  49. vllm/compilation/cuda_graph.py +309 -0
  50. vllm/compilation/decorators.py +662 -0
  51. vllm/compilation/fix_functionalization.py +266 -0
  52. vllm/compilation/fusion.py +570 -0
  53. vllm/compilation/fusion_attn.py +363 -0
  54. vllm/compilation/fx_utils.py +92 -0
  55. vllm/compilation/inductor_pass.py +145 -0
  56. vllm/compilation/matcher_utils.py +454 -0
  57. vllm/compilation/monitor.py +62 -0
  58. vllm/compilation/noop_elimination.py +130 -0
  59. vllm/compilation/partition_rules.py +75 -0
  60. vllm/compilation/pass_manager.py +164 -0
  61. vllm/compilation/piecewise_backend.py +191 -0
  62. vllm/compilation/post_cleanup.py +21 -0
  63. vllm/compilation/qk_norm_rope_fusion.py +244 -0
  64. vllm/compilation/rocm_aiter_fusion.py +401 -0
  65. vllm/compilation/sequence_parallelism.py +368 -0
  66. vllm/compilation/torch25_custom_graph_pass.py +44 -0
  67. vllm/compilation/vllm_inductor_pass.py +180 -0
  68. vllm/compilation/wrapper.py +329 -0
  69. vllm/config/__init__.py +112 -0
  70. vllm/config/attention.py +114 -0
  71. vllm/config/cache.py +233 -0
  72. vllm/config/compilation.py +1149 -0
  73. vllm/config/device.py +75 -0
  74. vllm/config/ec_transfer.py +110 -0
  75. vllm/config/kv_events.py +56 -0
  76. vllm/config/kv_transfer.py +119 -0
  77. vllm/config/load.py +124 -0
  78. vllm/config/lora.py +102 -0
  79. vllm/config/model.py +2026 -0
  80. vllm/config/model_arch.py +57 -0
  81. vllm/config/multimodal.py +247 -0
  82. vllm/config/observability.py +157 -0
  83. vllm/config/parallel.py +703 -0
  84. vllm/config/pooler.py +188 -0
  85. vllm/config/profiler.py +199 -0
  86. vllm/config/scheduler.py +298 -0
  87. vllm/config/speculative.py +656 -0
  88. vllm/config/speech_to_text.py +39 -0
  89. vllm/config/structured_outputs.py +78 -0
  90. vllm/config/utils.py +374 -0
  91. vllm/config/vllm.py +1487 -0
  92. vllm/connections.py +189 -0
  93. vllm/device_allocator/__init__.py +0 -0
  94. vllm/device_allocator/cumem.py +301 -0
  95. vllm/distributed/__init__.py +6 -0
  96. vllm/distributed/communication_op.py +43 -0
  97. vllm/distributed/device_communicators/__init__.py +0 -0
  98. vllm/distributed/device_communicators/all2all.py +509 -0
  99. vllm/distributed/device_communicators/all_reduce_utils.py +344 -0
  100. vllm/distributed/device_communicators/base_device_communicator.py +303 -0
  101. vllm/distributed/device_communicators/cpu_communicator.py +209 -0
  102. vllm/distributed/device_communicators/cuda_communicator.py +346 -0
  103. vllm/distributed/device_communicators/cuda_wrapper.py +190 -0
  104. vllm/distributed/device_communicators/custom_all_reduce.py +326 -0
  105. vllm/distributed/device_communicators/mnnvl_compat.py +27 -0
  106. vllm/distributed/device_communicators/pynccl.py +386 -0
  107. vllm/distributed/device_communicators/pynccl_allocator.py +191 -0
  108. vllm/distributed/device_communicators/pynccl_wrapper.py +567 -0
  109. vllm/distributed/device_communicators/quick_all_reduce.py +290 -0
  110. vllm/distributed/device_communicators/ray_communicator.py +259 -0
  111. vllm/distributed/device_communicators/shm_broadcast.py +778 -0
  112. vllm/distributed/device_communicators/shm_object_storage.py +697 -0
  113. vllm/distributed/device_communicators/symm_mem.py +156 -0
  114. vllm/distributed/device_communicators/xpu_communicator.py +98 -0
  115. vllm/distributed/ec_transfer/__init__.py +14 -0
  116. vllm/distributed/ec_transfer/ec_connector/__init__.py +0 -0
  117. vllm/distributed/ec_transfer/ec_connector/base.py +247 -0
  118. vllm/distributed/ec_transfer/ec_connector/example_connector.py +201 -0
  119. vllm/distributed/ec_transfer/ec_connector/factory.py +85 -0
  120. vllm/distributed/ec_transfer/ec_transfer_state.py +42 -0
  121. vllm/distributed/eplb/__init__.py +3 -0
  122. vllm/distributed/eplb/async_worker.py +115 -0
  123. vllm/distributed/eplb/eplb_state.py +1192 -0
  124. vllm/distributed/eplb/policy/__init__.py +19 -0
  125. vllm/distributed/eplb/policy/abstract.py +43 -0
  126. vllm/distributed/eplb/policy/default.py +376 -0
  127. vllm/distributed/eplb/rebalance_execute.py +699 -0
  128. vllm/distributed/kv_events.py +505 -0
  129. vllm/distributed/kv_transfer/README.md +29 -0
  130. vllm/distributed/kv_transfer/__init__.py +20 -0
  131. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  132. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  133. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  134. vllm/distributed/kv_transfer/kv_connector/factory.py +203 -0
  135. vllm/distributed/kv_transfer/kv_connector/utils.py +459 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +19 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/base.py +607 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py +419 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py +450 -0
  140. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +344 -0
  141. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py +18 -0
  142. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +395 -0
  143. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py +211 -0
  144. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +1431 -0
  145. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +941 -0
  146. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +186 -0
  147. vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py +916 -0
  148. vllm/distributed/kv_transfer/kv_connector/v1/moriio/__init__.py +0 -0
  149. vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py +321 -0
  150. vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py +1515 -0
  151. vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py +609 -0
  152. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +477 -0
  153. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +2688 -0
  154. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +557 -0
  155. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  156. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +531 -0
  157. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +632 -0
  158. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +273 -0
  159. vllm/distributed/kv_transfer/kv_transfer_state.py +78 -0
  160. vllm/distributed/parallel_state.py +1809 -0
  161. vllm/distributed/utils.py +545 -0
  162. vllm/engine/__init__.py +0 -0
  163. vllm/engine/arg_utils.py +2137 -0
  164. vllm/engine/async_llm_engine.py +6 -0
  165. vllm/engine/llm_engine.py +6 -0
  166. vllm/engine/protocol.py +194 -0
  167. vllm/entrypoints/__init__.py +0 -0
  168. vllm/entrypoints/anthropic/__init__.py +0 -0
  169. vllm/entrypoints/anthropic/protocol.py +162 -0
  170. vllm/entrypoints/anthropic/serving_messages.py +468 -0
  171. vllm/entrypoints/api_server.py +186 -0
  172. vllm/entrypoints/chat_utils.py +1912 -0
  173. vllm/entrypoints/cli/__init__.py +19 -0
  174. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  175. vllm/entrypoints/cli/benchmark/base.py +25 -0
  176. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  177. vllm/entrypoints/cli/benchmark/main.py +57 -0
  178. vllm/entrypoints/cli/benchmark/mm_processor.py +21 -0
  179. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  180. vllm/entrypoints/cli/benchmark/startup.py +21 -0
  181. vllm/entrypoints/cli/benchmark/sweep.py +21 -0
  182. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  183. vllm/entrypoints/cli/collect_env.py +38 -0
  184. vllm/entrypoints/cli/main.py +79 -0
  185. vllm/entrypoints/cli/openai.py +260 -0
  186. vllm/entrypoints/cli/run_batch.py +68 -0
  187. vllm/entrypoints/cli/serve.py +253 -0
  188. vllm/entrypoints/cli/types.py +29 -0
  189. vllm/entrypoints/constants.py +12 -0
  190. vllm/entrypoints/context.py +898 -0
  191. vllm/entrypoints/grpc_server.py +531 -0
  192. vllm/entrypoints/launcher.py +175 -0
  193. vllm/entrypoints/llm.py +1807 -0
  194. vllm/entrypoints/logger.py +86 -0
  195. vllm/entrypoints/openai/__init__.py +0 -0
  196. vllm/entrypoints/openai/api_server.py +1390 -0
  197. vllm/entrypoints/openai/cli_args.py +320 -0
  198. vllm/entrypoints/openai/orca_metrics.py +120 -0
  199. vllm/entrypoints/openai/parser/__init__.py +0 -0
  200. vllm/entrypoints/openai/parser/harmony_utils.py +820 -0
  201. vllm/entrypoints/openai/parser/responses_parser.py +176 -0
  202. vllm/entrypoints/openai/protocol.py +2566 -0
  203. vllm/entrypoints/openai/run_batch.py +635 -0
  204. vllm/entrypoints/openai/serving_chat.py +1897 -0
  205. vllm/entrypoints/openai/serving_chat_stream_harmony.py +101 -0
  206. vllm/entrypoints/openai/serving_completion.py +740 -0
  207. vllm/entrypoints/openai/serving_engine.py +1612 -0
  208. vllm/entrypoints/openai/serving_models.py +309 -0
  209. vllm/entrypoints/openai/serving_responses.py +2552 -0
  210. vllm/entrypoints/openai/serving_transcription.py +168 -0
  211. vllm/entrypoints/openai/speech_to_text.py +711 -0
  212. vllm/entrypoints/openai/utils.py +49 -0
  213. vllm/entrypoints/pooling/__init__.py +16 -0
  214. vllm/entrypoints/pooling/classify/__init__.py +0 -0
  215. vllm/entrypoints/pooling/classify/api_router.py +48 -0
  216. vllm/entrypoints/pooling/classify/protocol.py +181 -0
  217. vllm/entrypoints/pooling/classify/serving.py +233 -0
  218. vllm/entrypoints/pooling/embed/__init__.py +0 -0
  219. vllm/entrypoints/pooling/embed/api_router.py +65 -0
  220. vllm/entrypoints/pooling/embed/conftest.py +28 -0
  221. vllm/entrypoints/pooling/embed/protocol.py +217 -0
  222. vllm/entrypoints/pooling/embed/serving.py +684 -0
  223. vllm/entrypoints/pooling/pooling/__init__.py +0 -0
  224. vllm/entrypoints/pooling/pooling/api_router.py +62 -0
  225. vllm/entrypoints/pooling/pooling/protocol.py +146 -0
  226. vllm/entrypoints/pooling/pooling/serving.py +354 -0
  227. vllm/entrypoints/pooling/score/__init__.py +0 -0
  228. vllm/entrypoints/pooling/score/api_router.py +147 -0
  229. vllm/entrypoints/pooling/score/protocol.py +146 -0
  230. vllm/entrypoints/pooling/score/serving.py +511 -0
  231. vllm/entrypoints/renderer.py +411 -0
  232. vllm/entrypoints/responses_utils.py +218 -0
  233. vllm/entrypoints/sagemaker/__init__.py +4 -0
  234. vllm/entrypoints/sagemaker/routes.py +118 -0
  235. vllm/entrypoints/score_utils.py +271 -0
  236. vllm/entrypoints/serve/__init__.py +94 -0
  237. vllm/entrypoints/serve/cache/__init__.py +0 -0
  238. vllm/entrypoints/serve/cache/api_router.py +61 -0
  239. vllm/entrypoints/serve/disagg/__init__.py +0 -0
  240. vllm/entrypoints/serve/disagg/api_router.py +109 -0
  241. vllm/entrypoints/serve/disagg/protocol.py +90 -0
  242. vllm/entrypoints/serve/disagg/serving.py +285 -0
  243. vllm/entrypoints/serve/elastic_ep/__init__.py +0 -0
  244. vllm/entrypoints/serve/elastic_ep/api_router.py +96 -0
  245. vllm/entrypoints/serve/elastic_ep/middleware.py +49 -0
  246. vllm/entrypoints/serve/instrumentator/__init__.py +0 -0
  247. vllm/entrypoints/serve/instrumentator/health.py +33 -0
  248. vllm/entrypoints/serve/instrumentator/metrics.py +45 -0
  249. vllm/entrypoints/serve/instrumentator/offline_docs.py +50 -0
  250. vllm/entrypoints/serve/instrumentator/server_info.py +56 -0
  251. vllm/entrypoints/serve/instrumentator/static/swagger-ui-bundle.js +2 -0
  252. vllm/entrypoints/serve/instrumentator/static/swagger-ui.css +3 -0
  253. vllm/entrypoints/serve/lora/__init__.py +0 -0
  254. vllm/entrypoints/serve/lora/api_router.py +70 -0
  255. vllm/entrypoints/serve/profile/__init__.py +0 -0
  256. vllm/entrypoints/serve/profile/api_router.py +46 -0
  257. vllm/entrypoints/serve/rlhf/__init__.py +0 -0
  258. vllm/entrypoints/serve/rlhf/api_router.py +102 -0
  259. vllm/entrypoints/serve/rpc/__init__.py +0 -0
  260. vllm/entrypoints/serve/rpc/api_router.py +61 -0
  261. vllm/entrypoints/serve/sleep/__init__.py +0 -0
  262. vllm/entrypoints/serve/sleep/api_router.py +56 -0
  263. vllm/entrypoints/serve/tokenize/__init__.py +0 -0
  264. vllm/entrypoints/serve/tokenize/api_router.py +112 -0
  265. vllm/entrypoints/serve/tokenize/serving.py +204 -0
  266. vllm/entrypoints/ssl.py +78 -0
  267. vllm/entrypoints/tool.py +187 -0
  268. vllm/entrypoints/tool_server.py +234 -0
  269. vllm/entrypoints/utils.py +336 -0
  270. vllm/env_override.py +402 -0
  271. vllm/envs.py +1791 -0
  272. vllm/exceptions.py +36 -0
  273. vllm/forward_context.py +375 -0
  274. vllm/grpc/__init__.py +17 -0
  275. vllm/grpc/compile_protos.py +94 -0
  276. vllm/grpc/vllm_engine.proto +195 -0
  277. vllm/grpc/vllm_engine_pb2.py +77 -0
  278. vllm/grpc/vllm_engine_pb2.pyi +213 -0
  279. vllm/grpc/vllm_engine_pb2_grpc.py +330 -0
  280. vllm/inputs/__init__.py +44 -0
  281. vllm/inputs/data.py +359 -0
  282. vllm/inputs/parse.py +147 -0
  283. vllm/inputs/preprocess.py +716 -0
  284. vllm/logger.py +303 -0
  285. vllm/logging_utils/__init__.py +13 -0
  286. vllm/logging_utils/dump_input.py +83 -0
  287. vllm/logging_utils/formatter.py +127 -0
  288. vllm/logging_utils/lazy.py +20 -0
  289. vllm/logging_utils/log_time.py +34 -0
  290. vllm/logits_process.py +121 -0
  291. vllm/logprobs.py +206 -0
  292. vllm/lora/__init__.py +0 -0
  293. vllm/lora/layers/__init__.py +43 -0
  294. vllm/lora/layers/base.py +66 -0
  295. vllm/lora/layers/base_linear.py +172 -0
  296. vllm/lora/layers/column_parallel_linear.py +577 -0
  297. vllm/lora/layers/fused_moe.py +739 -0
  298. vllm/lora/layers/logits_processor.py +203 -0
  299. vllm/lora/layers/replicated_linear.py +70 -0
  300. vllm/lora/layers/row_parallel_linear.py +176 -0
  301. vllm/lora/layers/utils.py +115 -0
  302. vllm/lora/layers/vocal_parallel_embedding.py +140 -0
  303. vllm/lora/lora_model.py +221 -0
  304. vllm/lora/lora_weights.py +227 -0
  305. vllm/lora/model_manager.py +858 -0
  306. vllm/lora/ops/__init__.py +0 -0
  307. vllm/lora/ops/ipex_ops/__init__.py +6 -0
  308. vllm/lora/ops/ipex_ops/lora_ops.py +57 -0
  309. vllm/lora/ops/torch_ops/__init__.py +20 -0
  310. vllm/lora/ops/torch_ops/lora_ops.py +128 -0
  311. vllm/lora/ops/triton_ops/README_TUNING.md +60 -0
  312. vllm/lora/ops/triton_ops/__init__.py +21 -0
  313. vllm/lora/ops/triton_ops/fused_moe_lora_op.py +677 -0
  314. vllm/lora/ops/triton_ops/kernel_utils.py +340 -0
  315. vllm/lora/ops/triton_ops/lora_expand_op.py +310 -0
  316. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +154 -0
  317. vllm/lora/ops/triton_ops/lora_shrink_op.py +287 -0
  318. vllm/lora/ops/triton_ops/utils.py +313 -0
  319. vllm/lora/peft_helper.py +128 -0
  320. vllm/lora/punica_wrapper/__init__.py +10 -0
  321. vllm/lora/punica_wrapper/punica_base.py +493 -0
  322. vllm/lora/punica_wrapper/punica_cpu.py +351 -0
  323. vllm/lora/punica_wrapper/punica_gpu.py +413 -0
  324. vllm/lora/punica_wrapper/punica_selector.py +21 -0
  325. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  326. vllm/lora/punica_wrapper/utils.py +150 -0
  327. vllm/lora/request.py +60 -0
  328. vllm/lora/resolver.py +88 -0
  329. vllm/lora/utils.py +281 -0
  330. vllm/lora/worker_manager.py +278 -0
  331. vllm/model_executor/__init__.py +9 -0
  332. vllm/model_executor/custom_op.py +203 -0
  333. vllm/model_executor/layers/__init__.py +0 -0
  334. vllm/model_executor/layers/activation.py +628 -0
  335. vllm/model_executor/layers/attention/__init__.py +0 -0
  336. vllm/model_executor/layers/attention/chunked_local_attention.py +130 -0
  337. vllm/model_executor/layers/attention/cross_attention.py +182 -0
  338. vllm/model_executor/layers/attention/encoder_only_attention.py +103 -0
  339. vllm/model_executor/layers/attention/mm_encoder_attention.py +234 -0
  340. vllm/model_executor/layers/attention/static_sink_attention.py +254 -0
  341. vllm/model_executor/layers/attention_layer_base.py +34 -0
  342. vllm/model_executor/layers/batch_invariant.py +1063 -0
  343. vllm/model_executor/layers/conv.py +262 -0
  344. vllm/model_executor/layers/fla/__init__.py +8 -0
  345. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  346. vllm/model_executor/layers/fla/ops/chunk.py +240 -0
  347. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +344 -0
  348. vllm/model_executor/layers/fla/ops/chunk_o.py +183 -0
  349. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +154 -0
  350. vllm/model_executor/layers/fla/ops/cumsum.py +280 -0
  351. vllm/model_executor/layers/fla/ops/fused_recurrent.py +390 -0
  352. vllm/model_executor/layers/fla/ops/index.py +41 -0
  353. vllm/model_executor/layers/fla/ops/kda.py +1351 -0
  354. vllm/model_executor/layers/fla/ops/l2norm.py +146 -0
  355. vllm/model_executor/layers/fla/ops/layernorm_guard.py +396 -0
  356. vllm/model_executor/layers/fla/ops/op.py +60 -0
  357. vllm/model_executor/layers/fla/ops/solve_tril.py +556 -0
  358. vllm/model_executor/layers/fla/ops/utils.py +194 -0
  359. vllm/model_executor/layers/fla/ops/wy_fast.py +158 -0
  360. vllm/model_executor/layers/fused_moe/__init__.py +120 -0
  361. vllm/model_executor/layers/fused_moe/all2all_utils.py +173 -0
  362. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +411 -0
  363. vllm/model_executor/layers/fused_moe/config.py +1111 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json +147 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json +213 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200.json +147 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json +147 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=129,N=704,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json +201 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json +147 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +147 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json +164 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json +147 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=160,N=768,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json +147 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json +147 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json +147 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json +147 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  559. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  560. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  561. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  562. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  563. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  564. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  565. vllm/model_executor/layers/fused_moe/configs/E=64,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  566. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  567. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  568. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  569. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  570. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  571. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  572. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  573. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  574. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  575. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  576. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  577. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  578. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  579. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  580. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  581. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  582. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  583. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  584. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  585. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  586. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  587. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  588. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  589. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  590. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  591. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  592. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  593. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  594. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  595. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  596. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  597. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  598. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  599. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  600. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  601. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  602. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  603. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  604. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  605. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  606. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  607. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  608. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  609. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  610. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  611. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  612. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  613. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  614. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  615. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  616. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  617. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  618. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  619. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  620. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  621. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  622. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  623. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  624. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  625. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  626. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  627. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  628. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  629. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  630. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  631. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  632. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  633. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  634. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  635. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  636. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  637. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  638. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  639. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  640. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  641. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  642. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  643. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  644. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  645. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  646. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  647. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  648. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  649. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  650. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  651. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +444 -0
  652. vllm/model_executor/layers/fused_moe/cutlass_moe.py +1086 -0
  653. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +364 -0
  654. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +427 -0
  655. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +420 -0
  656. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +436 -0
  657. vllm/model_executor/layers/fused_moe/fallback.py +127 -0
  658. vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py +338 -0
  659. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +310 -0
  660. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +371 -0
  661. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +192 -0
  662. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1018 -0
  663. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +824 -0
  664. vllm/model_executor/layers/fused_moe/fused_moe.py +2638 -0
  665. vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +119 -0
  666. vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +117 -0
  667. vllm/model_executor/layers/fused_moe/fused_moe_router.py +40 -0
  668. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +531 -0
  669. vllm/model_executor/layers/fused_moe/layer.py +2169 -0
  670. vllm/model_executor/layers/fused_moe/modular_kernel.py +1251 -0
  671. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +192 -0
  672. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +229 -0
  673. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  674. vllm/model_executor/layers/fused_moe/oracle/__init__.py +2 -0
  675. vllm/model_executor/layers/fused_moe/oracle/fp8.py +358 -0
  676. vllm/model_executor/layers/fused_moe/oracle/nvfp4.py +280 -0
  677. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +362 -0
  678. vllm/model_executor/layers/fused_moe/prepare_finalize.py +87 -0
  679. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +347 -0
  680. vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +324 -0
  681. vllm/model_executor/layers/fused_moe/routing_simulator.py +310 -0
  682. vllm/model_executor/layers/fused_moe/shared_fused_moe.py +96 -0
  683. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +171 -0
  684. vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py +78 -0
  685. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +75 -0
  686. vllm/model_executor/layers/fused_moe/trtllm_moe.py +144 -0
  687. vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +403 -0
  688. vllm/model_executor/layers/fused_moe/utils.py +382 -0
  689. vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py +189 -0
  690. vllm/model_executor/layers/kda.py +442 -0
  691. vllm/model_executor/layers/layernorm.py +451 -0
  692. vllm/model_executor/layers/lightning_attn.py +735 -0
  693. vllm/model_executor/layers/linear.py +1478 -0
  694. vllm/model_executor/layers/logits_processor.py +109 -0
  695. vllm/model_executor/layers/mamba/__init__.py +0 -0
  696. vllm/model_executor/layers/mamba/abstract.py +68 -0
  697. vllm/model_executor/layers/mamba/linear_attn.py +410 -0
  698. vllm/model_executor/layers/mamba/mamba_mixer.py +541 -0
  699. vllm/model_executor/layers/mamba/mamba_mixer2.py +936 -0
  700. vllm/model_executor/layers/mamba/mamba_utils.py +225 -0
  701. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  702. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1240 -0
  703. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +172 -0
  704. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +586 -0
  705. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +211 -0
  706. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +456 -0
  707. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +700 -0
  708. vllm/model_executor/layers/mamba/ops/ssd_combined.py +230 -0
  709. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +157 -0
  710. vllm/model_executor/layers/mamba/short_conv.py +254 -0
  711. vllm/model_executor/layers/mla.py +179 -0
  712. vllm/model_executor/layers/pooler/__init__.py +5 -0
  713. vllm/model_executor/layers/pooler/abstract.py +39 -0
  714. vllm/model_executor/layers/pooler/activations.py +162 -0
  715. vllm/model_executor/layers/pooler/common.py +32 -0
  716. vllm/model_executor/layers/pooler/seqwise/__init__.py +45 -0
  717. vllm/model_executor/layers/pooler/seqwise/heads.py +151 -0
  718. vllm/model_executor/layers/pooler/seqwise/methods.py +93 -0
  719. vllm/model_executor/layers/pooler/seqwise/poolers.py +127 -0
  720. vllm/model_executor/layers/pooler/special.py +128 -0
  721. vllm/model_executor/layers/pooler/tokwise/__init__.py +39 -0
  722. vllm/model_executor/layers/pooler/tokwise/heads.py +133 -0
  723. vllm/model_executor/layers/pooler/tokwise/methods.py +122 -0
  724. vllm/model_executor/layers/pooler/tokwise/poolers.py +127 -0
  725. vllm/model_executor/layers/quantization/__init__.py +195 -0
  726. vllm/model_executor/layers/quantization/auto_round.py +454 -0
  727. vllm/model_executor/layers/quantization/awq.py +277 -0
  728. vllm/model_executor/layers/quantization/awq_marlin.py +795 -0
  729. vllm/model_executor/layers/quantization/awq_triton.py +337 -0
  730. vllm/model_executor/layers/quantization/base_config.py +170 -0
  731. vllm/model_executor/layers/quantization/bitblas.py +502 -0
  732. vllm/model_executor/layers/quantization/bitsandbytes.py +631 -0
  733. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +3 -0
  734. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +982 -0
  735. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2368 -0
  736. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +37 -0
  737. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +392 -0
  738. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  739. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +176 -0
  740. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_mxfp4.py +106 -0
  741. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +124 -0
  742. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +218 -0
  743. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +176 -0
  744. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +153 -0
  745. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +138 -0
  746. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +203 -0
  747. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +125 -0
  748. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +230 -0
  749. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  750. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +260 -0
  751. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +173 -0
  752. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  753. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +64 -0
  754. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  755. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +224 -0
  756. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  757. vllm/model_executor/layers/quantization/cpu_wna16.py +299 -0
  758. vllm/model_executor/layers/quantization/deepspeedfp.py +218 -0
  759. vllm/model_executor/layers/quantization/experts_int8.py +209 -0
  760. vllm/model_executor/layers/quantization/fbgemm_fp8.py +195 -0
  761. vllm/model_executor/layers/quantization/fp8.py +1224 -0
  762. vllm/model_executor/layers/quantization/fp_quant.py +420 -0
  763. vllm/model_executor/layers/quantization/gguf.py +682 -0
  764. vllm/model_executor/layers/quantization/gptq.py +393 -0
  765. vllm/model_executor/layers/quantization/gptq_bitblas.py +482 -0
  766. vllm/model_executor/layers/quantization/gptq_marlin.py +934 -0
  767. vllm/model_executor/layers/quantization/gptq_marlin_24.py +320 -0
  768. vllm/model_executor/layers/quantization/hqq_marlin.py +372 -0
  769. vllm/model_executor/layers/quantization/inc.py +65 -0
  770. vllm/model_executor/layers/quantization/input_quant_fp8.py +212 -0
  771. vllm/model_executor/layers/quantization/ipex_quant.py +403 -0
  772. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  773. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +94 -0
  774. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +113 -0
  775. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  776. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +323 -0
  777. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +98 -0
  778. vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py +126 -0
  779. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +130 -0
  780. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +111 -0
  781. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +168 -0
  782. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +159 -0
  783. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +200 -0
  784. vllm/model_executor/layers/quantization/kernels/mixed_precision/xpu.py +97 -0
  785. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +76 -0
  786. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +77 -0
  787. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +128 -0
  788. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +220 -0
  789. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +147 -0
  790. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +88 -0
  791. vllm/model_executor/layers/quantization/kv_cache.py +153 -0
  792. vllm/model_executor/layers/quantization/modelopt.py +1665 -0
  793. vllm/model_executor/layers/quantization/moe_wna16.py +518 -0
  794. vllm/model_executor/layers/quantization/mxfp4.py +1145 -0
  795. vllm/model_executor/layers/quantization/petit.py +319 -0
  796. vllm/model_executor/layers/quantization/ptpc_fp8.py +140 -0
  797. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  798. vllm/model_executor/layers/quantization/quark/quark.py +570 -0
  799. vllm/model_executor/layers/quantization/quark/quark_moe.py +797 -0
  800. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  801. vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py +343 -0
  802. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  803. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +179 -0
  804. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +139 -0
  805. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  806. vllm/model_executor/layers/quantization/qutlass_utils.py +185 -0
  807. vllm/model_executor/layers/quantization/rtn.py +626 -0
  808. vllm/model_executor/layers/quantization/schema.py +90 -0
  809. vllm/model_executor/layers/quantization/torchao.py +380 -0
  810. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  811. vllm/model_executor/layers/quantization/utils/allspark_utils.py +67 -0
  812. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +229 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  902. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  903. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  904. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  905. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  906. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  907. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  908. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  909. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  910. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  911. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  912. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  913. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  914. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  915. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  916. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  917. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  918. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  919. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  920. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  921. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  922. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  923. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  924. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  925. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  926. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  927. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  928. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  929. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  930. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  931. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  932. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  933. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  934. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  935. vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  936. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  937. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  938. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  939. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  940. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  941. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  942. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  943. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  944. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  945. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  946. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  947. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  948. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  949. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  950. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  951. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  952. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  953. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  954. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  955. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  956. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  957. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  958. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  959. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  960. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  961. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  962. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  963. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  964. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  965. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  966. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  967. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  968. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  969. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  970. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  971. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  972. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  973. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  974. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  975. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  976. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  977. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  978. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  979. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  980. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  981. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  982. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  983. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  984. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  985. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  986. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  987. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  988. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  989. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  990. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  991. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  992. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  993. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  994. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  995. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  996. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  997. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  998. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  999. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1000. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1001. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1002. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1003. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  1004. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1005. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  1006. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1007. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1008. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1009. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1010. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1011. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  1012. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1013. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  1014. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1015. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1016. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1017. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1018. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  1019. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1020. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  1021. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1022. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1023. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1024. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1025. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1026. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1027. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  1028. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +514 -0
  1029. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +370 -0
  1030. vllm/model_executor/layers/quantization/utils/fp8_utils.py +1658 -0
  1031. vllm/model_executor/layers/quantization/utils/gptq_utils.py +158 -0
  1032. vllm/model_executor/layers/quantization/utils/int8_utils.py +477 -0
  1033. vllm/model_executor/layers/quantization/utils/layer_utils.py +41 -0
  1034. vllm/model_executor/layers/quantization/utils/machete_utils.py +56 -0
  1035. vllm/model_executor/layers/quantization/utils/marlin_utils.py +720 -0
  1036. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +565 -0
  1037. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +378 -0
  1038. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +219 -0
  1039. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +467 -0
  1040. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +189 -0
  1041. vllm/model_executor/layers/quantization/utils/mxfp6_utils.py +142 -0
  1042. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +24 -0
  1043. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +142 -0
  1044. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +67 -0
  1045. vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py +51 -0
  1046. vllm/model_executor/layers/quantization/utils/petit_utils.py +124 -0
  1047. vllm/model_executor/layers/quantization/utils/quant_utils.py +767 -0
  1048. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +519 -0
  1049. vllm/model_executor/layers/resampler.py +283 -0
  1050. vllm/model_executor/layers/rotary_embedding/__init__.py +291 -0
  1051. vllm/model_executor/layers/rotary_embedding/base.py +282 -0
  1052. vllm/model_executor/layers/rotary_embedding/common.py +289 -0
  1053. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +184 -0
  1054. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +218 -0
  1055. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +43 -0
  1056. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +68 -0
  1057. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +82 -0
  1058. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  1059. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  1060. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +83 -0
  1061. vllm/model_executor/layers/rotary_embedding/mrope.py +412 -0
  1062. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +47 -0
  1063. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +159 -0
  1064. vllm/model_executor/layers/rotary_embedding/xdrope.py +160 -0
  1065. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +84 -0
  1066. vllm/model_executor/layers/utils.py +251 -0
  1067. vllm/model_executor/layers/vocab_parallel_embedding.py +564 -0
  1068. vllm/model_executor/model_loader/__init__.py +150 -0
  1069. vllm/model_executor/model_loader/base_loader.py +71 -0
  1070. vllm/model_executor/model_loader/bitsandbytes_loader.py +821 -0
  1071. vllm/model_executor/model_loader/default_loader.py +304 -0
  1072. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  1073. vllm/model_executor/model_loader/gguf_loader.py +371 -0
  1074. vllm/model_executor/model_loader/online_quantization.py +275 -0
  1075. vllm/model_executor/model_loader/runai_streamer_loader.py +115 -0
  1076. vllm/model_executor/model_loader/sharded_state_loader.py +214 -0
  1077. vllm/model_executor/model_loader/tensorizer.py +793 -0
  1078. vllm/model_executor/model_loader/tensorizer_loader.py +151 -0
  1079. vllm/model_executor/model_loader/utils.py +299 -0
  1080. vllm/model_executor/model_loader/weight_utils.py +1183 -0
  1081. vllm/model_executor/models/__init__.py +44 -0
  1082. vllm/model_executor/models/adapters.py +592 -0
  1083. vllm/model_executor/models/afmoe.py +697 -0
  1084. vllm/model_executor/models/aimv2.py +248 -0
  1085. vllm/model_executor/models/apertus.py +567 -0
  1086. vllm/model_executor/models/arcee.py +428 -0
  1087. vllm/model_executor/models/arctic.py +633 -0
  1088. vllm/model_executor/models/aria.py +663 -0
  1089. vllm/model_executor/models/audioflamingo3.py +639 -0
  1090. vllm/model_executor/models/aya_vision.py +448 -0
  1091. vllm/model_executor/models/bagel.py +591 -0
  1092. vllm/model_executor/models/baichuan.py +493 -0
  1093. vllm/model_executor/models/bailing_moe.py +643 -0
  1094. vllm/model_executor/models/bamba.py +511 -0
  1095. vllm/model_executor/models/bee.py +157 -0
  1096. vllm/model_executor/models/bert.py +911 -0
  1097. vllm/model_executor/models/bert_with_rope.py +729 -0
  1098. vllm/model_executor/models/blip.py +350 -0
  1099. vllm/model_executor/models/blip2.py +736 -0
  1100. vllm/model_executor/models/bloom.py +390 -0
  1101. vllm/model_executor/models/chameleon.py +1095 -0
  1102. vllm/model_executor/models/chatglm.py +502 -0
  1103. vllm/model_executor/models/clip.py +1045 -0
  1104. vllm/model_executor/models/cohere2_vision.py +470 -0
  1105. vllm/model_executor/models/commandr.py +469 -0
  1106. vllm/model_executor/models/config.py +571 -0
  1107. vllm/model_executor/models/dbrx.py +484 -0
  1108. vllm/model_executor/models/deepencoder.py +679 -0
  1109. vllm/model_executor/models/deepseek_eagle.py +253 -0
  1110. vllm/model_executor/models/deepseek_mtp.py +447 -0
  1111. vllm/model_executor/models/deepseek_ocr.py +601 -0
  1112. vllm/model_executor/models/deepseek_v2.py +1727 -0
  1113. vllm/model_executor/models/deepseek_vl2.py +642 -0
  1114. vllm/model_executor/models/dots1.py +566 -0
  1115. vllm/model_executor/models/dots_ocr.py +830 -0
  1116. vllm/model_executor/models/ernie45.py +53 -0
  1117. vllm/model_executor/models/ernie45_moe.py +755 -0
  1118. vllm/model_executor/models/ernie45_vl.py +1702 -0
  1119. vllm/model_executor/models/ernie45_vl_moe.py +801 -0
  1120. vllm/model_executor/models/ernie_mtp.py +278 -0
  1121. vllm/model_executor/models/exaone.py +524 -0
  1122. vllm/model_executor/models/exaone4.py +518 -0
  1123. vllm/model_executor/models/exaone_moe.py +579 -0
  1124. vllm/model_executor/models/exaone_moe_mtp.py +255 -0
  1125. vllm/model_executor/models/fairseq2_llama.py +154 -0
  1126. vllm/model_executor/models/falcon.py +543 -0
  1127. vllm/model_executor/models/falcon_h1.py +675 -0
  1128. vllm/model_executor/models/flex_olmo.py +155 -0
  1129. vllm/model_executor/models/fuyu.py +371 -0
  1130. vllm/model_executor/models/gemma.py +425 -0
  1131. vllm/model_executor/models/gemma2.py +435 -0
  1132. vllm/model_executor/models/gemma3.py +520 -0
  1133. vllm/model_executor/models/gemma3_mm.py +664 -0
  1134. vllm/model_executor/models/gemma3n.py +1166 -0
  1135. vllm/model_executor/models/gemma3n_audio_utils.py +57 -0
  1136. vllm/model_executor/models/gemma3n_mm.py +820 -0
  1137. vllm/model_executor/models/glm.py +24 -0
  1138. vllm/model_executor/models/glm4.py +295 -0
  1139. vllm/model_executor/models/glm4_1v.py +1823 -0
  1140. vllm/model_executor/models/glm4_moe.py +725 -0
  1141. vllm/model_executor/models/glm4_moe_mtp.py +365 -0
  1142. vllm/model_executor/models/glm4v.py +783 -0
  1143. vllm/model_executor/models/glmasr.py +1154 -0
  1144. vllm/model_executor/models/glmasr_utils.py +188 -0
  1145. vllm/model_executor/models/gpt2.py +385 -0
  1146. vllm/model_executor/models/gpt_bigcode.py +339 -0
  1147. vllm/model_executor/models/gpt_j.py +346 -0
  1148. vllm/model_executor/models/gpt_neox.py +340 -0
  1149. vllm/model_executor/models/gpt_oss.py +745 -0
  1150. vllm/model_executor/models/granite.py +475 -0
  1151. vllm/model_executor/models/granite_speech.py +919 -0
  1152. vllm/model_executor/models/granitemoe.py +561 -0
  1153. vllm/model_executor/models/granitemoehybrid.py +703 -0
  1154. vllm/model_executor/models/granitemoeshared.py +328 -0
  1155. vllm/model_executor/models/gritlm.py +242 -0
  1156. vllm/model_executor/models/grok1.py +803 -0
  1157. vllm/model_executor/models/h2ovl.py +554 -0
  1158. vllm/model_executor/models/hunyuan_v1.py +1042 -0
  1159. vllm/model_executor/models/hunyuan_vision.py +1034 -0
  1160. vllm/model_executor/models/hyperclovax_vision.py +1163 -0
  1161. vllm/model_executor/models/idefics2_vision_model.py +427 -0
  1162. vllm/model_executor/models/idefics3.py +734 -0
  1163. vllm/model_executor/models/interfaces.py +1180 -0
  1164. vllm/model_executor/models/interfaces_base.py +252 -0
  1165. vllm/model_executor/models/intern_vit.py +454 -0
  1166. vllm/model_executor/models/internlm2.py +451 -0
  1167. vllm/model_executor/models/internlm2_ve.py +139 -0
  1168. vllm/model_executor/models/interns1.py +828 -0
  1169. vllm/model_executor/models/interns1_vit.py +433 -0
  1170. vllm/model_executor/models/internvl.py +1436 -0
  1171. vllm/model_executor/models/iquest_loopcoder.py +595 -0
  1172. vllm/model_executor/models/isaac.py +1503 -0
  1173. vllm/model_executor/models/jais.py +397 -0
  1174. vllm/model_executor/models/jais2.py +508 -0
  1175. vllm/model_executor/models/jamba.py +599 -0
  1176. vllm/model_executor/models/jina_vl.py +145 -0
  1177. vllm/model_executor/models/kanana_v.py +756 -0
  1178. vllm/model_executor/models/keye.py +1709 -0
  1179. vllm/model_executor/models/keye_vl1_5.py +726 -0
  1180. vllm/model_executor/models/kimi_linear.py +659 -0
  1181. vllm/model_executor/models/kimi_vl.py +577 -0
  1182. vllm/model_executor/models/lfm2.py +515 -0
  1183. vllm/model_executor/models/lfm2_moe.py +746 -0
  1184. vllm/model_executor/models/lfm2_vl.py +732 -0
  1185. vllm/model_executor/models/lightonocr.py +197 -0
  1186. vllm/model_executor/models/llama.py +724 -0
  1187. vllm/model_executor/models/llama4.py +860 -0
  1188. vllm/model_executor/models/llama4_eagle.py +225 -0
  1189. vllm/model_executor/models/llama_eagle.py +213 -0
  1190. vllm/model_executor/models/llama_eagle3.py +375 -0
  1191. vllm/model_executor/models/llava.py +879 -0
  1192. vllm/model_executor/models/llava_next.py +583 -0
  1193. vllm/model_executor/models/llava_next_video.py +467 -0
  1194. vllm/model_executor/models/llava_onevision.py +922 -0
  1195. vllm/model_executor/models/longcat_flash.py +767 -0
  1196. vllm/model_executor/models/longcat_flash_mtp.py +348 -0
  1197. vllm/model_executor/models/mamba.py +276 -0
  1198. vllm/model_executor/models/mamba2.py +288 -0
  1199. vllm/model_executor/models/medusa.py +179 -0
  1200. vllm/model_executor/models/midashenglm.py +826 -0
  1201. vllm/model_executor/models/mimo.py +188 -0
  1202. vllm/model_executor/models/mimo_mtp.py +294 -0
  1203. vllm/model_executor/models/mimo_v2_flash.py +718 -0
  1204. vllm/model_executor/models/minicpm.py +660 -0
  1205. vllm/model_executor/models/minicpm3.py +233 -0
  1206. vllm/model_executor/models/minicpm_eagle.py +386 -0
  1207. vllm/model_executor/models/minicpmo.py +768 -0
  1208. vllm/model_executor/models/minicpmv.py +1742 -0
  1209. vllm/model_executor/models/minimax_m2.py +552 -0
  1210. vllm/model_executor/models/minimax_text_01.py +1008 -0
  1211. vllm/model_executor/models/minimax_vl_01.py +395 -0
  1212. vllm/model_executor/models/mistral3.py +638 -0
  1213. vllm/model_executor/models/mistral_large_3.py +63 -0
  1214. vllm/model_executor/models/mistral_large_3_eagle.py +137 -0
  1215. vllm/model_executor/models/mixtral.py +599 -0
  1216. vllm/model_executor/models/mllama4.py +1170 -0
  1217. vllm/model_executor/models/mlp_speculator.py +235 -0
  1218. vllm/model_executor/models/modernbert.py +458 -0
  1219. vllm/model_executor/models/module_mapping.py +74 -0
  1220. vllm/model_executor/models/molmo.py +1592 -0
  1221. vllm/model_executor/models/moonvit.py +601 -0
  1222. vllm/model_executor/models/mpt.py +335 -0
  1223. vllm/model_executor/models/nano_nemotron_vl.py +1725 -0
  1224. vllm/model_executor/models/nemotron.py +499 -0
  1225. vllm/model_executor/models/nemotron_h.py +902 -0
  1226. vllm/model_executor/models/nemotron_nas.py +474 -0
  1227. vllm/model_executor/models/nemotron_parse.py +958 -0
  1228. vllm/model_executor/models/nemotron_vl.py +651 -0
  1229. vllm/model_executor/models/nvlm_d.py +216 -0
  1230. vllm/model_executor/models/olmo.py +412 -0
  1231. vllm/model_executor/models/olmo2.py +454 -0
  1232. vllm/model_executor/models/olmoe.py +498 -0
  1233. vllm/model_executor/models/opencua.py +262 -0
  1234. vllm/model_executor/models/openpangu.py +1378 -0
  1235. vllm/model_executor/models/openpangu_mtp.py +265 -0
  1236. vllm/model_executor/models/opt.py +426 -0
  1237. vllm/model_executor/models/orion.py +365 -0
  1238. vllm/model_executor/models/ouro.py +507 -0
  1239. vllm/model_executor/models/ovis.py +557 -0
  1240. vllm/model_executor/models/ovis2_5.py +661 -0
  1241. vllm/model_executor/models/paddleocr_vl.py +1261 -0
  1242. vllm/model_executor/models/paligemma.py +429 -0
  1243. vllm/model_executor/models/persimmon.py +373 -0
  1244. vllm/model_executor/models/phi.py +363 -0
  1245. vllm/model_executor/models/phi3.py +18 -0
  1246. vllm/model_executor/models/phi3v.py +729 -0
  1247. vllm/model_executor/models/phi4mm.py +1250 -0
  1248. vllm/model_executor/models/phi4mm_audio.py +1296 -0
  1249. vllm/model_executor/models/phi4mm_utils.py +1907 -0
  1250. vllm/model_executor/models/phimoe.py +671 -0
  1251. vllm/model_executor/models/pixtral.py +1437 -0
  1252. vllm/model_executor/models/plamo2.py +993 -0
  1253. vllm/model_executor/models/plamo3.py +437 -0
  1254. vllm/model_executor/models/qwen.py +377 -0
  1255. vllm/model_executor/models/qwen2.py +600 -0
  1256. vllm/model_executor/models/qwen2_5_omni_thinker.py +1200 -0
  1257. vllm/model_executor/models/qwen2_5_vl.py +1598 -0
  1258. vllm/model_executor/models/qwen2_audio.py +478 -0
  1259. vllm/model_executor/models/qwen2_moe.py +604 -0
  1260. vllm/model_executor/models/qwen2_rm.py +120 -0
  1261. vllm/model_executor/models/qwen2_vl.py +1588 -0
  1262. vllm/model_executor/models/qwen3.py +331 -0
  1263. vllm/model_executor/models/qwen3_moe.py +752 -0
  1264. vllm/model_executor/models/qwen3_next.py +1410 -0
  1265. vllm/model_executor/models/qwen3_next_mtp.py +293 -0
  1266. vllm/model_executor/models/qwen3_omni_moe_thinker.py +1814 -0
  1267. vllm/model_executor/models/qwen3_vl.py +2120 -0
  1268. vllm/model_executor/models/qwen3_vl_moe.py +474 -0
  1269. vllm/model_executor/models/qwen_vl.py +821 -0
  1270. vllm/model_executor/models/radio.py +573 -0
  1271. vllm/model_executor/models/registry.py +1218 -0
  1272. vllm/model_executor/models/roberta.py +239 -0
  1273. vllm/model_executor/models/rvl.py +107 -0
  1274. vllm/model_executor/models/seed_oss.py +492 -0
  1275. vllm/model_executor/models/siglip.py +1259 -0
  1276. vllm/model_executor/models/siglip2.py +495 -0
  1277. vllm/model_executor/models/siglip2navit.py +660 -0
  1278. vllm/model_executor/models/skyworkr1v.py +951 -0
  1279. vllm/model_executor/models/smolvlm.py +38 -0
  1280. vllm/model_executor/models/solar.py +484 -0
  1281. vllm/model_executor/models/stablelm.py +354 -0
  1282. vllm/model_executor/models/starcoder2.py +365 -0
  1283. vllm/model_executor/models/step3_text.py +554 -0
  1284. vllm/model_executor/models/step3_vl.py +1147 -0
  1285. vllm/model_executor/models/swin.py +500 -0
  1286. vllm/model_executor/models/tarsier.py +624 -0
  1287. vllm/model_executor/models/telechat2.py +153 -0
  1288. vllm/model_executor/models/teleflm.py +78 -0
  1289. vllm/model_executor/models/terratorch.py +318 -0
  1290. vllm/model_executor/models/transformers/__init__.py +127 -0
  1291. vllm/model_executor/models/transformers/base.py +523 -0
  1292. vllm/model_executor/models/transformers/causal.py +65 -0
  1293. vllm/model_executor/models/transformers/legacy.py +90 -0
  1294. vllm/model_executor/models/transformers/moe.py +329 -0
  1295. vllm/model_executor/models/transformers/multimodal.py +441 -0
  1296. vllm/model_executor/models/transformers/pooling.py +102 -0
  1297. vllm/model_executor/models/transformers/utils.py +253 -0
  1298. vllm/model_executor/models/ultravox.py +786 -0
  1299. vllm/model_executor/models/utils.py +832 -0
  1300. vllm/model_executor/models/vision.py +546 -0
  1301. vllm/model_executor/models/voxtral.py +867 -0
  1302. vllm/model_executor/models/voxtral_streaming.py +304 -0
  1303. vllm/model_executor/models/whisper.py +993 -0
  1304. vllm/model_executor/models/whisper_utils.py +299 -0
  1305. vllm/model_executor/models/zamba2.py +986 -0
  1306. vllm/model_executor/parameter.py +642 -0
  1307. vllm/model_executor/utils.py +113 -0
  1308. vllm/model_executor/warmup/__init__.py +0 -0
  1309. vllm/model_executor/warmup/deep_gemm_warmup.py +371 -0
  1310. vllm/model_executor/warmup/kernel_warmup.py +97 -0
  1311. vllm/model_inspection.py +136 -0
  1312. vllm/multimodal/__init__.py +38 -0
  1313. vllm/multimodal/audio.py +287 -0
  1314. vllm/multimodal/base.py +60 -0
  1315. vllm/multimodal/cache.py +829 -0
  1316. vllm/multimodal/evs.py +294 -0
  1317. vllm/multimodal/hasher.py +123 -0
  1318. vllm/multimodal/image.py +155 -0
  1319. vllm/multimodal/inputs.py +1027 -0
  1320. vllm/multimodal/parse.py +674 -0
  1321. vllm/multimodal/processing.py +2469 -0
  1322. vllm/multimodal/profiling.py +351 -0
  1323. vllm/multimodal/registry.py +375 -0
  1324. vllm/multimodal/utils.py +550 -0
  1325. vllm/multimodal/video.py +512 -0
  1326. vllm/outputs.py +347 -0
  1327. vllm/platforms/__init__.py +277 -0
  1328. vllm/platforms/cpu.py +423 -0
  1329. vllm/platforms/cuda.py +618 -0
  1330. vllm/platforms/interface.py +707 -0
  1331. vllm/platforms/rocm.py +586 -0
  1332. vllm/platforms/tpu.py +20 -0
  1333. vllm/platforms/xpu.py +262 -0
  1334. vllm/plugins/__init__.py +81 -0
  1335. vllm/plugins/io_processors/__init__.py +68 -0
  1336. vllm/plugins/io_processors/interface.py +77 -0
  1337. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1338. vllm/plugins/lora_resolvers/filesystem_resolver.py +52 -0
  1339. vllm/pooling_params.py +229 -0
  1340. vllm/profiler/__init__.py +0 -0
  1341. vllm/profiler/layerwise_profile.py +392 -0
  1342. vllm/profiler/utils.py +151 -0
  1343. vllm/profiler/wrapper.py +241 -0
  1344. vllm/py.typed +2 -0
  1345. vllm/ray/__init__.py +0 -0
  1346. vllm/ray/lazy_utils.py +30 -0
  1347. vllm/ray/ray_env.py +79 -0
  1348. vllm/reasoning/__init__.py +96 -0
  1349. vllm/reasoning/abs_reasoning_parsers.py +318 -0
  1350. vllm/reasoning/basic_parsers.py +175 -0
  1351. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1352. vllm/reasoning/deepseek_v3_reasoning_parser.py +69 -0
  1353. vllm/reasoning/ernie45_reasoning_parser.py +165 -0
  1354. vllm/reasoning/glm4_moe_reasoning_parser.py +13 -0
  1355. vllm/reasoning/gptoss_reasoning_parser.py +173 -0
  1356. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1357. vllm/reasoning/holo2_reasoning_parser.py +89 -0
  1358. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +237 -0
  1359. vllm/reasoning/identity_reasoning_parser.py +63 -0
  1360. vllm/reasoning/minimax_m2_reasoning_parser.py +110 -0
  1361. vllm/reasoning/mistral_reasoning_parser.py +154 -0
  1362. vllm/reasoning/olmo3_reasoning_parser.py +302 -0
  1363. vllm/reasoning/qwen3_reasoning_parser.py +67 -0
  1364. vllm/reasoning/seedoss_reasoning_parser.py +27 -0
  1365. vllm/reasoning/step3_reasoning_parser.py +113 -0
  1366. vllm/sampling_params.py +629 -0
  1367. vllm/scalar_type.py +355 -0
  1368. vllm/scripts.py +17 -0
  1369. vllm/sequence.py +64 -0
  1370. vllm/tasks.py +13 -0
  1371. vllm/third_party/__init__.py +0 -0
  1372. vllm/third_party/pynvml.py +6140 -0
  1373. vllm/tokenizers/__init__.py +18 -0
  1374. vllm/tokenizers/deepseek_v32.py +187 -0
  1375. vllm/tokenizers/deepseek_v32_encoding.py +463 -0
  1376. vllm/tokenizers/detokenizer_utils.py +198 -0
  1377. vllm/tokenizers/grok2.py +443 -0
  1378. vllm/tokenizers/hf.py +119 -0
  1379. vllm/tokenizers/mistral.py +543 -0
  1380. vllm/tokenizers/protocol.py +123 -0
  1381. vllm/tokenizers/registry.py +238 -0
  1382. vllm/tool_parsers/__init__.py +158 -0
  1383. vllm/tool_parsers/abstract_tool_parser.py +274 -0
  1384. vllm/tool_parsers/deepseekv31_tool_parser.py +388 -0
  1385. vllm/tool_parsers/deepseekv32_tool_parser.py +591 -0
  1386. vllm/tool_parsers/deepseekv3_tool_parser.py +390 -0
  1387. vllm/tool_parsers/ernie45_tool_parser.py +210 -0
  1388. vllm/tool_parsers/functiongemma_tool_parser.py +321 -0
  1389. vllm/tool_parsers/gigachat3_tool_parser.py +190 -0
  1390. vllm/tool_parsers/glm47_moe_tool_parser.py +23 -0
  1391. vllm/tool_parsers/glm4_moe_tool_parser.py +215 -0
  1392. vllm/tool_parsers/granite_20b_fc_tool_parser.py +273 -0
  1393. vllm/tool_parsers/granite_tool_parser.py +253 -0
  1394. vllm/tool_parsers/hermes_tool_parser.py +495 -0
  1395. vllm/tool_parsers/hunyuan_a13b_tool_parser.py +420 -0
  1396. vllm/tool_parsers/internlm2_tool_parser.py +227 -0
  1397. vllm/tool_parsers/jamba_tool_parser.py +323 -0
  1398. vllm/tool_parsers/kimi_k2_tool_parser.py +598 -0
  1399. vllm/tool_parsers/llama4_pythonic_tool_parser.py +341 -0
  1400. vllm/tool_parsers/llama_tool_parser.py +324 -0
  1401. vllm/tool_parsers/longcat_tool_parser.py +37 -0
  1402. vllm/tool_parsers/minimax_m2_tool_parser.py +776 -0
  1403. vllm/tool_parsers/minimax_tool_parser.py +849 -0
  1404. vllm/tool_parsers/mistral_tool_parser.py +612 -0
  1405. vllm/tool_parsers/olmo3_tool_parser.py +366 -0
  1406. vllm/tool_parsers/openai_tool_parser.py +111 -0
  1407. vllm/tool_parsers/phi4mini_tool_parser.py +120 -0
  1408. vllm/tool_parsers/pythonic_tool_parser.py +332 -0
  1409. vllm/tool_parsers/qwen3coder_tool_parser.py +781 -0
  1410. vllm/tool_parsers/qwen3xml_tool_parser.py +1316 -0
  1411. vllm/tool_parsers/seed_oss_tool_parser.py +744 -0
  1412. vllm/tool_parsers/step3_tool_parser.py +303 -0
  1413. vllm/tool_parsers/utils.py +229 -0
  1414. vllm/tool_parsers/xlam_tool_parser.py +556 -0
  1415. vllm/tracing.py +135 -0
  1416. vllm/transformers_utils/__init__.py +26 -0
  1417. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1418. vllm/transformers_utils/chat_templates/registry.py +73 -0
  1419. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1420. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1421. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1422. vllm/transformers_utils/chat_templates/template_deepseek_ocr.jinja +14 -0
  1423. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1424. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1425. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1426. vllm/transformers_utils/config.py +1169 -0
  1427. vllm/transformers_utils/config_parser_base.py +20 -0
  1428. vllm/transformers_utils/configs/__init__.py +106 -0
  1429. vllm/transformers_utils/configs/afmoe.py +87 -0
  1430. vllm/transformers_utils/configs/arctic.py +216 -0
  1431. vllm/transformers_utils/configs/bagel.py +53 -0
  1432. vllm/transformers_utils/configs/chatglm.py +75 -0
  1433. vllm/transformers_utils/configs/deepseek_vl2.py +126 -0
  1434. vllm/transformers_utils/configs/dotsocr.py +71 -0
  1435. vllm/transformers_utils/configs/eagle.py +90 -0
  1436. vllm/transformers_utils/configs/falcon.py +89 -0
  1437. vllm/transformers_utils/configs/flex_olmo.py +82 -0
  1438. vllm/transformers_utils/configs/hunyuan_vl.py +322 -0
  1439. vllm/transformers_utils/configs/isaac.py +100 -0
  1440. vllm/transformers_utils/configs/jais.py +243 -0
  1441. vllm/transformers_utils/configs/kimi_linear.py +148 -0
  1442. vllm/transformers_utils/configs/kimi_vl.py +38 -0
  1443. vllm/transformers_utils/configs/lfm2_moe.py +163 -0
  1444. vllm/transformers_utils/configs/medusa.py +65 -0
  1445. vllm/transformers_utils/configs/midashenglm.py +103 -0
  1446. vllm/transformers_utils/configs/mistral.py +263 -0
  1447. vllm/transformers_utils/configs/mlp_speculator.py +69 -0
  1448. vllm/transformers_utils/configs/moonvit.py +33 -0
  1449. vllm/transformers_utils/configs/nemotron.py +220 -0
  1450. vllm/transformers_utils/configs/nemotron_h.py +284 -0
  1451. vllm/transformers_utils/configs/olmo3.py +83 -0
  1452. vllm/transformers_utils/configs/ovis.py +182 -0
  1453. vllm/transformers_utils/configs/qwen3_next.py +277 -0
  1454. vllm/transformers_utils/configs/radio.py +98 -0
  1455. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1456. vllm/transformers_utils/configs/speculators/algos.py +38 -0
  1457. vllm/transformers_utils/configs/speculators/base.py +114 -0
  1458. vllm/transformers_utils/configs/step3_vl.py +178 -0
  1459. vllm/transformers_utils/configs/tarsier2.py +24 -0
  1460. vllm/transformers_utils/configs/ultravox.py +120 -0
  1461. vllm/transformers_utils/dynamic_module.py +70 -0
  1462. vllm/transformers_utils/gguf_utils.py +280 -0
  1463. vllm/transformers_utils/model_arch_config_convertor.py +402 -0
  1464. vllm/transformers_utils/processor.py +424 -0
  1465. vllm/transformers_utils/processors/__init__.py +25 -0
  1466. vllm/transformers_utils/processors/bagel.py +78 -0
  1467. vllm/transformers_utils/processors/deepseek_ocr.py +438 -0
  1468. vllm/transformers_utils/processors/deepseek_vl2.py +406 -0
  1469. vllm/transformers_utils/processors/hunyuan_vl.py +233 -0
  1470. vllm/transformers_utils/processors/hunyuan_vl_image.py +477 -0
  1471. vllm/transformers_utils/processors/ovis.py +453 -0
  1472. vllm/transformers_utils/processors/ovis2_5.py +468 -0
  1473. vllm/transformers_utils/repo_utils.py +287 -0
  1474. vllm/transformers_utils/runai_utils.py +102 -0
  1475. vllm/transformers_utils/s3_utils.py +95 -0
  1476. vllm/transformers_utils/tokenizer.py +19 -0
  1477. vllm/transformers_utils/utils.py +112 -0
  1478. vllm/triton_utils/__init__.py +20 -0
  1479. vllm/triton_utils/importing.py +103 -0
  1480. vllm/usage/__init__.py +0 -0
  1481. vllm/usage/usage_lib.py +278 -0
  1482. vllm/utils/__init__.py +36 -0
  1483. vllm/utils/argparse_utils.py +491 -0
  1484. vllm/utils/async_utils.py +310 -0
  1485. vllm/utils/cache.py +214 -0
  1486. vllm/utils/collection_utils.py +112 -0
  1487. vllm/utils/counter.py +45 -0
  1488. vllm/utils/deep_gemm.py +424 -0
  1489. vllm/utils/flashinfer.py +602 -0
  1490. vllm/utils/func_utils.py +236 -0
  1491. vllm/utils/gc_utils.py +151 -0
  1492. vllm/utils/hashing.py +117 -0
  1493. vllm/utils/import_utils.py +438 -0
  1494. vllm/utils/jsontree.py +158 -0
  1495. vllm/utils/math_utils.py +32 -0
  1496. vllm/utils/mem_constants.py +13 -0
  1497. vllm/utils/mem_utils.py +285 -0
  1498. vllm/utils/nccl.py +64 -0
  1499. vllm/utils/network_utils.py +331 -0
  1500. vllm/utils/nvtx_pytorch_hooks.py +286 -0
  1501. vllm/utils/platform_utils.py +59 -0
  1502. vllm/utils/profiling.py +56 -0
  1503. vllm/utils/registry.py +51 -0
  1504. vllm/utils/serial_utils.py +214 -0
  1505. vllm/utils/system_utils.py +296 -0
  1506. vllm/utils/tensor_schema.py +255 -0
  1507. vllm/utils/torch_utils.py +781 -0
  1508. vllm/v1/__init__.py +0 -0
  1509. vllm/v1/attention/__init__.py +0 -0
  1510. vllm/v1/attention/backend.py +736 -0
  1511. vllm/v1/attention/backends/__init__.py +0 -0
  1512. vllm/v1/attention/backends/cpu_attn.py +501 -0
  1513. vllm/v1/attention/backends/fa_utils.py +126 -0
  1514. vllm/v1/attention/backends/flash_attn.py +1092 -0
  1515. vllm/v1/attention/backends/flash_attn_diffkv.py +277 -0
  1516. vllm/v1/attention/backends/flashinfer.py +1713 -0
  1517. vllm/v1/attention/backends/flex_attention.py +1024 -0
  1518. vllm/v1/attention/backends/gdn_attn.py +382 -0
  1519. vllm/v1/attention/backends/linear_attn.py +77 -0
  1520. vllm/v1/attention/backends/mamba1_attn.py +28 -0
  1521. vllm/v1/attention/backends/mamba2_attn.py +256 -0
  1522. vllm/v1/attention/backends/mamba_attn.py +313 -0
  1523. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1524. vllm/v1/attention/backends/mla/aiter_triton_mla.py +66 -0
  1525. vllm/v1/attention/backends/mla/common.py +2156 -0
  1526. vllm/v1/attention/backends/mla/cutlass_mla.py +278 -0
  1527. vllm/v1/attention/backends/mla/flashattn_mla.py +348 -0
  1528. vllm/v1/attention/backends/mla/flashinfer_mla.py +175 -0
  1529. vllm/v1/attention/backends/mla/flashmla.py +321 -0
  1530. vllm/v1/attention/backends/mla/flashmla_sparse.py +1021 -0
  1531. vllm/v1/attention/backends/mla/indexer.py +345 -0
  1532. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +284 -0
  1533. vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py +321 -0
  1534. vllm/v1/attention/backends/mla/triton_mla.py +171 -0
  1535. vllm/v1/attention/backends/registry.py +258 -0
  1536. vllm/v1/attention/backends/rocm_aiter_fa.py +1000 -0
  1537. vllm/v1/attention/backends/rocm_aiter_unified_attn.py +206 -0
  1538. vllm/v1/attention/backends/rocm_attn.py +405 -0
  1539. vllm/v1/attention/backends/short_conv_attn.py +26 -0
  1540. vllm/v1/attention/backends/tree_attn.py +430 -0
  1541. vllm/v1/attention/backends/triton_attn.py +578 -0
  1542. vllm/v1/attention/backends/utils.py +978 -0
  1543. vllm/v1/attention/ops/__init__.py +0 -0
  1544. vllm/v1/attention/ops/chunked_prefill_paged_decode.py +459 -0
  1545. vllm/v1/attention/ops/common.py +469 -0
  1546. vllm/v1/attention/ops/flashmla.py +254 -0
  1547. vllm/v1/attention/ops/merge_attn_states.py +47 -0
  1548. vllm/v1/attention/ops/paged_attn.py +51 -0
  1549. vllm/v1/attention/ops/pallas_kv_cache_update.py +130 -0
  1550. vllm/v1/attention/ops/prefix_prefill.py +862 -0
  1551. vllm/v1/attention/ops/rocm_aiter_mla_sparse.py +210 -0
  1552. vllm/v1/attention/ops/triton_decode_attention.py +709 -0
  1553. vllm/v1/attention/ops/triton_merge_attn_states.py +116 -0
  1554. vllm/v1/attention/ops/triton_prefill_attention.py +272 -0
  1555. vllm/v1/attention/ops/triton_reshape_and_cache_flash.py +395 -0
  1556. vllm/v1/attention/ops/triton_unified_attention.py +1088 -0
  1557. vllm/v1/attention/ops/vit_attn_wrappers.py +185 -0
  1558. vllm/v1/attention/selector.py +145 -0
  1559. vllm/v1/core/__init__.py +0 -0
  1560. vllm/v1/core/block_pool.py +489 -0
  1561. vllm/v1/core/encoder_cache_manager.py +402 -0
  1562. vllm/v1/core/kv_cache_coordinator.py +560 -0
  1563. vllm/v1/core/kv_cache_manager.py +485 -0
  1564. vllm/v1/core/kv_cache_metrics.py +96 -0
  1565. vllm/v1/core/kv_cache_utils.py +1642 -0
  1566. vllm/v1/core/sched/__init__.py +0 -0
  1567. vllm/v1/core/sched/async_scheduler.py +66 -0
  1568. vllm/v1/core/sched/interface.py +205 -0
  1569. vllm/v1/core/sched/output.py +261 -0
  1570. vllm/v1/core/sched/request_queue.py +208 -0
  1571. vllm/v1/core/sched/scheduler.py +1936 -0
  1572. vllm/v1/core/sched/utils.py +64 -0
  1573. vllm/v1/core/single_type_kv_cache_manager.py +926 -0
  1574. vllm/v1/cudagraph_dispatcher.py +183 -0
  1575. vllm/v1/engine/__init__.py +224 -0
  1576. vllm/v1/engine/async_llm.py +874 -0
  1577. vllm/v1/engine/coordinator.py +396 -0
  1578. vllm/v1/engine/core.py +1614 -0
  1579. vllm/v1/engine/core_client.py +1422 -0
  1580. vllm/v1/engine/detokenizer.py +351 -0
  1581. vllm/v1/engine/exceptions.py +18 -0
  1582. vllm/v1/engine/input_processor.py +713 -0
  1583. vllm/v1/engine/llm_engine.py +415 -0
  1584. vllm/v1/engine/logprobs.py +245 -0
  1585. vllm/v1/engine/output_processor.py +715 -0
  1586. vllm/v1/engine/parallel_sampling.py +150 -0
  1587. vllm/v1/engine/utils.py +1086 -0
  1588. vllm/v1/executor/__init__.py +6 -0
  1589. vllm/v1/executor/abstract.py +352 -0
  1590. vllm/v1/executor/multiproc_executor.py +888 -0
  1591. vllm/v1/executor/ray_distributed_executor.py +8 -0
  1592. vllm/v1/executor/ray_executor.py +623 -0
  1593. vllm/v1/executor/ray_utils.py +468 -0
  1594. vllm/v1/executor/uniproc_executor.py +186 -0
  1595. vllm/v1/kv_cache_interface.py +485 -0
  1596. vllm/v1/kv_offload/__init__.py +0 -0
  1597. vllm/v1/kv_offload/abstract.py +161 -0
  1598. vllm/v1/kv_offload/arc_manager.py +237 -0
  1599. vllm/v1/kv_offload/backend.py +97 -0
  1600. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1601. vllm/v1/kv_offload/backends/cpu.py +62 -0
  1602. vllm/v1/kv_offload/cpu.py +109 -0
  1603. vllm/v1/kv_offload/factory.py +58 -0
  1604. vllm/v1/kv_offload/lru_manager.py +139 -0
  1605. vllm/v1/kv_offload/mediums.py +39 -0
  1606. vllm/v1/kv_offload/spec.py +70 -0
  1607. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1608. vllm/v1/kv_offload/worker/cpu_gpu.py +287 -0
  1609. vllm/v1/kv_offload/worker/worker.py +163 -0
  1610. vllm/v1/metrics/__init__.py +0 -0
  1611. vllm/v1/metrics/loggers.py +1320 -0
  1612. vllm/v1/metrics/perf.py +1244 -0
  1613. vllm/v1/metrics/prometheus.py +82 -0
  1614. vllm/v1/metrics/ray_wrappers.py +194 -0
  1615. vllm/v1/metrics/reader.py +257 -0
  1616. vllm/v1/metrics/stats.py +440 -0
  1617. vllm/v1/outputs.py +242 -0
  1618. vllm/v1/pool/__init__.py +0 -0
  1619. vllm/v1/pool/metadata.py +124 -0
  1620. vllm/v1/request.py +281 -0
  1621. vllm/v1/sample/__init__.py +0 -0
  1622. vllm/v1/sample/logits_processor/__init__.py +352 -0
  1623. vllm/v1/sample/logits_processor/builtin.py +278 -0
  1624. vllm/v1/sample/logits_processor/interface.py +106 -0
  1625. vllm/v1/sample/logits_processor/state.py +165 -0
  1626. vllm/v1/sample/metadata.py +44 -0
  1627. vllm/v1/sample/ops/__init__.py +0 -0
  1628. vllm/v1/sample/ops/bad_words.py +57 -0
  1629. vllm/v1/sample/ops/logprobs.py +25 -0
  1630. vllm/v1/sample/ops/penalties.py +57 -0
  1631. vllm/v1/sample/ops/topk_topp_sampler.py +388 -0
  1632. vllm/v1/sample/rejection_sampler.py +822 -0
  1633. vllm/v1/sample/sampler.py +319 -0
  1634. vllm/v1/sample/tpu/__init__.py +0 -0
  1635. vllm/v1/sample/tpu/metadata.py +120 -0
  1636. vllm/v1/sample/tpu/sampler.py +215 -0
  1637. vllm/v1/serial_utils.py +514 -0
  1638. vllm/v1/spec_decode/__init__.py +0 -0
  1639. vllm/v1/spec_decode/eagle.py +1346 -0
  1640. vllm/v1/spec_decode/medusa.py +73 -0
  1641. vllm/v1/spec_decode/metadata.py +66 -0
  1642. vllm/v1/spec_decode/metrics.py +225 -0
  1643. vllm/v1/spec_decode/ngram_proposer.py +281 -0
  1644. vllm/v1/spec_decode/suffix_decoding.py +95 -0
  1645. vllm/v1/spec_decode/utils.py +109 -0
  1646. vllm/v1/structured_output/__init__.py +337 -0
  1647. vllm/v1/structured_output/backend_guidance.py +291 -0
  1648. vllm/v1/structured_output/backend_lm_format_enforcer.py +177 -0
  1649. vllm/v1/structured_output/backend_outlines.py +324 -0
  1650. vllm/v1/structured_output/backend_types.py +136 -0
  1651. vllm/v1/structured_output/backend_xgrammar.py +378 -0
  1652. vllm/v1/structured_output/request.py +91 -0
  1653. vllm/v1/structured_output/utils.py +457 -0
  1654. vllm/v1/utils.py +466 -0
  1655. vllm/v1/worker/__init__.py +0 -0
  1656. vllm/v1/worker/block_table.py +343 -0
  1657. vllm/v1/worker/cp_utils.py +42 -0
  1658. vllm/v1/worker/cpu_model_runner.py +122 -0
  1659. vllm/v1/worker/cpu_worker.py +192 -0
  1660. vllm/v1/worker/dp_utils.py +240 -0
  1661. vllm/v1/worker/ec_connector_model_runner_mixin.py +85 -0
  1662. vllm/v1/worker/gpu/README.md +4 -0
  1663. vllm/v1/worker/gpu/__init__.py +0 -0
  1664. vllm/v1/worker/gpu/async_utils.py +98 -0
  1665. vllm/v1/worker/gpu/attn_utils.py +183 -0
  1666. vllm/v1/worker/gpu/block_table.py +222 -0
  1667. vllm/v1/worker/gpu/buffer_utils.py +224 -0
  1668. vllm/v1/worker/gpu/cudagraph_utils.py +264 -0
  1669. vllm/v1/worker/gpu/dp_utils.py +31 -0
  1670. vllm/v1/worker/gpu/input_batch.py +526 -0
  1671. vllm/v1/worker/gpu/metrics/__init__.py +0 -0
  1672. vllm/v1/worker/gpu/metrics/logits.py +42 -0
  1673. vllm/v1/worker/gpu/mm/__init__.py +0 -0
  1674. vllm/v1/worker/gpu/mm/mrope_utils.py +127 -0
  1675. vllm/v1/worker/gpu/model_runner.py +1005 -0
  1676. vllm/v1/worker/gpu/sample/__init__.py +0 -0
  1677. vllm/v1/worker/gpu/sample/gumbel.py +106 -0
  1678. vllm/v1/worker/gpu/sample/logit_bias.py +270 -0
  1679. vllm/v1/worker/gpu/sample/logprob.py +167 -0
  1680. vllm/v1/worker/gpu/sample/metadata.py +79 -0
  1681. vllm/v1/worker/gpu/sample/min_p.py +58 -0
  1682. vllm/v1/worker/gpu/sample/output.py +14 -0
  1683. vllm/v1/worker/gpu/sample/penalties.py +155 -0
  1684. vllm/v1/worker/gpu/sample/sampler.py +88 -0
  1685. vllm/v1/worker/gpu/spec_decode/__init__.py +18 -0
  1686. vllm/v1/worker/gpu/spec_decode/eagle.py +566 -0
  1687. vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py +115 -0
  1688. vllm/v1/worker/gpu/spec_decode/rejection_sample.py +71 -0
  1689. vllm/v1/worker/gpu/states.py +282 -0
  1690. vllm/v1/worker/gpu/structured_outputs.py +100 -0
  1691. vllm/v1/worker/gpu_input_batch.py +1030 -0
  1692. vllm/v1/worker/gpu_model_runner.py +5761 -0
  1693. vllm/v1/worker/gpu_ubatch_wrapper.py +475 -0
  1694. vllm/v1/worker/gpu_worker.py +968 -0
  1695. vllm/v1/worker/kv_connector_model_runner_mixin.py +300 -0
  1696. vllm/v1/worker/lora_model_runner_mixin.py +225 -0
  1697. vllm/v1/worker/tpu_input_batch.py +574 -0
  1698. vllm/v1/worker/tpu_worker.py +18 -0
  1699. vllm/v1/worker/ubatch_utils.py +112 -0
  1700. vllm/v1/worker/ubatching.py +242 -0
  1701. vllm/v1/worker/utils.py +400 -0
  1702. vllm/v1/worker/worker_base.py +372 -0
  1703. vllm/v1/worker/workspace.py +253 -0
  1704. vllm/v1/worker/xpu_model_runner.py +48 -0
  1705. vllm/v1/worker/xpu_worker.py +174 -0
  1706. vllm/version.py +39 -0
  1707. vllm/vllm_flash_attn/.gitkeep +0 -0
  1708. vllm_cpu_avx512bf16-0.14.0.dist-info/METADATA +348 -0
  1709. vllm_cpu_avx512bf16-0.14.0.dist-info/RECORD +1712 -0
  1710. vllm_cpu_avx512bf16-0.14.0.dist-info/WHEEL +5 -0
  1711. vllm_cpu_avx512bf16-0.14.0.dist-info/entry_points.txt +5 -0
  1712. vllm_cpu_avx512bf16-0.14.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2688 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+ import contextlib
+ import copy
+ import logging
+ import math
+ import queue
+ import threading
+ import time
+ import uuid
+ from collections import defaultdict
+ from collections.abc import Iterator
+ from concurrent.futures import Future, ThreadPoolExecutor
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, Any, Optional
+
+ import msgspec
+ import numpy as np
+ import torch
+ import zmq
+
+ from vllm import envs
+ from vllm.config import VllmConfig
+ from vllm.distributed.kv_transfer.kv_connector.utils import (
+     EngineId,
+     TpKVTopology,
+     get_current_attn_backend,
+     kv_postprocess_blksize_and_layout_on_receive,
+     kv_postprocess_blksize_on_receive,
+     kv_postprocess_layout_on_receive,
+     yield_req_data,
+ )
+ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
+     CopyBlocksOp,
+     KVConnectorBase_V1,
+     KVConnectorHandshakeMetadata,
+     KVConnectorMetadata,
+     KVConnectorRole,
+ )
+ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
+     KVConnectorPromMetrics,
+     KVConnectorStats,
+     PromMetric,
+     PromMetricT,
+ )
+ from vllm.distributed.parallel_state import (
+     get_tensor_model_parallel_rank,
+     get_tensor_model_parallel_world_size,
+     get_tp_group,
+ )
+ from vllm.forward_context import ForwardContext
+ from vllm.logger import init_logger
+ from vllm.platforms import current_platform
+ from vllm.utils.network_utils import make_zmq_path, make_zmq_socket
+ from vllm.v1.attention.backend import AttentionMetadata
+ from vllm.v1.attention.backends.utils import get_kv_cache_layout
+ from vllm.v1.core.sched.output import SchedulerOutput
+ from vllm.v1.worker.block_table import BlockTable
+
+ if TYPE_CHECKING:
+     from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+     from vllm.v1.kv_cache_interface import KVCacheConfig
+     from vllm.v1.request import Request
+
+ TransferHandle = int
+ ReqId = str
+
+ #
+ # NIXL Connector Version
+ #
+ # Increment this version whenever there is an incompatible change to:
+ # - NixlAgentMetadata schema
+ # - kv_transfer_params schema or semantics
+ # - NIXL transfer protocol or wire format
+ # - KV cache memory layout or block organization
+ # - Any other change that breaks P/D interoperability
+ #
+ # Version History:
+ # 1: Initial version with compatibility checking
+ # 2: Add remote_request_id to kv_transfer_params
+ #
+ NIXL_CONNECTOR_VERSION: int = 2
+
+ GET_META_MSG = b"get_meta_msg"
+
+ logger = init_logger(__name__)
+
+ # Lazy import nixl_wrapper to avoid loading nixl_bindings if nixl is not used
+ try:
+     if not current_platform.is_rocm():
+         from nixl._api import nixl_agent as NixlWrapper
+         from nixl._bindings import nixlXferTelemetry
+     else:
+         from rixl._api import nixl_agent as NixlWrapper
+         from rixl._bindings import nixlXferTelemetry
+
+     logger.info("NIXL is available")
+ except ImportError:
+     logger.warning("NIXL is not available")
+     NixlWrapper = None
+     nixlXferTelemetry = None
+
+
+ try:
+     if not current_platform.is_rocm():
+         from nixl._api import nixl_agent_config
+     else:
+         from rixl._api import nixl_agent_config
+ except ImportError:
+     nixl_agent_config = None
+     logger.warning("NIXL agent config is not available")
+
+ # Supported platforms and types of kv transfer buffer.
+ # {device: tuple of supported kv buffer types}
+ _NIXL_SUPPORTED_DEVICE = {
+     "cuda": (
+         "cuda",
+         "cpu",
+     ),
+     "tpu": ("cpu",),
+     "xpu": ("cpu",),
+     "cpu": ("cpu",),
+ }
+ # Support for out-of-tree (OOT) platforms, which provide their own mapping
+ # via current_platform.
+ _NIXL_SUPPORTED_DEVICE.update(current_platform.get_nixl_supported_devices())
+
+
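For instance, under this table a CUDA worker may stage KV transfers through either device memory or a CPU buffer, while TPU, XPU, and CPU workers must use a CPU buffer. A minimal check mirroring the validation the worker constructor performs further down (the device/buffer values here are hypothetical):

# Hypothetical values; in the connector they come from current_platform
# and kv_transfer_config.kv_buffer_device.
device_type, kv_buffer_device = "cuda", "cpu"
if kv_buffer_device not in _NIXL_SUPPORTED_DEVICE.get(device_type, ()):
    raise RuntimeError(
        f"{device_type} with {kv_buffer_device} kv_buffer is not supported."
    )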
+ @dataclass
+ class NixlAgentMetadata:
+     engine_id: str
+     agent_metadata: bytes
+     kv_caches_base_addr: list[int]
+     device_id: int
+     num_blocks: int
+     block_lens: list[int]
+     kv_cache_layout: str
+     block_size: int
+
+
+ @dataclass
+ class NixlHandshakePayload(KVConnectorHandshakeMetadata):
+     """
+     Wrapper for NIXL handshake sent over the wire.
+
+     Enables two-phase decoding for graceful compatibility checking:
+     1. Decode NixlHandshakePayload to get compatibility_hash
+     2. Compute local hash and compare
+     3. Only if hashes match, decode agent_metadata_bytes
+
+     This prevents decoder errors when the NixlAgentMetadata schema is
+     incompatible, allowing graceful failure with a clear error message.
+     """
+
+     compatibility_hash: str
+     agent_metadata_bytes: bytes  # NixlAgentMetadata encoded
+
+
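The two-phase decoding described in the docstring can be sketched as below. This is an illustrative receiver-side helper, not code from this package; it assumes the payload was msgpack-encoded (as done in set_xfer_handshake_metadata further down) and that msgspec decodes into the dataclasses defined here.

import msgspec

def decode_handshake_sketch(payload_bytes: bytes, local_hash: str) -> NixlAgentMetadata:
    # Phase 1: decode only the outer wrapper to read compatibility_hash.
    payload = msgspec.msgpack.decode(payload_bytes, type=NixlHandshakePayload)
    if payload.compatibility_hash != local_hash:
        raise RuntimeError(
            "NIXL handshake rejected: incompatible peer "
            f"(local={local_hash}, remote={payload.compatibility_hash})"
        )
    # Phase 2: hashes match, so the inner schema is safe to decode.
    return msgspec.msgpack.decode(payload.agent_metadata_bytes, type=NixlAgentMetadata)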
+ def compute_nixl_compatibility_hash(
+     vllm_config: VllmConfig, attn_backend_name: str
+ ) -> str:
+     """
+     Compute compatibility hash for NIXL KV transfer.
+
+     Hash only the factors that affect whether two NIXL instances can
+     successfully transfer KV cache data.
+
+     Factors included:
+     - vLLM version and NIXL connector version
+     - Model architecture (name, dtype, KV heads, layers)
+     - KV cache format (dtype, sliding window)
+     - Attention backend
+
+     Note: Factors like tensor_parallel_size, block_size, and kv_cache_layout
+     are validated at runtime in _validate_remote_agent_handshake and are not
+     included in this hash to support heterogeneous deployments.
+
+     Note: the set of factors is likely to evolve significantly over
+     time to be more or less permissive.
+
+     Returns:
+         SHA-256 hex digest
+     """
+     from vllm import __version__ as vllm_version
+     from vllm.config.utils import hash_factors
+
+     model_config = vllm_config.model_config
+     cache_config = vllm_config.cache_config
+
+     factors = {
+         # Version compatibility
+         "vllm_version": vllm_version,
+         "nixl_connector_version": NIXL_CONNECTOR_VERSION,
+         # Model architecture - affects KV cache shape
+         "model": model_config.model,
+         "dtype": str(model_config.dtype),
+         "num_kv_heads": model_config.get_total_num_kv_heads(),
+         "head_size": model_config.get_head_size(),
+         "num_hidden_layers": model_config.get_total_num_hidden_layers(),
+         # Attention backend and KV cache dtype affect memory layout
+         "attn_backend_name": attn_backend_name,
+         "cache_dtype": str(cache_config.cache_dtype),
+     }
+
+     compat_hash = hash_factors(factors)
+     logger.debug(
+         "NIXL compatibility hash: %s (model=%s, dtype=%s, num_kv_heads=%d, "
+         "cache_dtype=%s, attn_backend=%s)",
+         compat_hash,
+         factors["model"],
+         factors["dtype"],
+         factors["num_kv_heads"],
+         factors["cache_dtype"],
+         attn_backend_name,
+     )
+     return compat_hash
+
+
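hash_factors is imported from vllm.config.utils and its body is not part of this diff; a plausible stand-in that produces the stable SHA-256 hex digest promised by the docstring would be:

import hashlib
import json

def hash_factors_sketch(factors: dict) -> str:
    # Hypothetical stand-in for vllm.config.utils.hash_factors: serialize
    # with sorted keys so the digest is stable across processes, then hash.
    canonical = json.dumps(factors, sort_keys=True).encode("utf-8")
    return hashlib.sha256(canonical).hexdigest()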
+ @dataclass
+ class RemoteMeta:
+     block_ids: list[int]
+     host: str
+     port: int
+     engine_id: str
+     request_id: str
+
+
+ @dataclass
+ class ReqMeta:
+     local_block_ids: list[int]
+     # To be used when logical block size does not match the kernel block size
+     local_physical_block_ids: list[int]
+     tp_size: int
+     remote: RemoteMeta | None = None
+
+
+ class NixlConnectorMetadata(KVConnectorMetadata):
+     def __init__(self):
+         self.reqs_to_recv: dict[ReqId, ReqMeta] = {}
+         self.reqs_to_save: dict[ReqId, ReqMeta] = {}
+         self.reqs_to_send: dict[ReqId, float] = {}
+         self.reqs_in_batch: set[ReqId] = set()
+         self.reqs_not_processed: set[ReqId] = set()
+
+     def _add_new_req(
+         self,
+         local_block_ids: list[int],
+         kv_transfer_params: dict[str, Any],
+     ) -> ReqMeta:
+         return ReqMeta(
+             local_block_ids=local_block_ids,
+             local_physical_block_ids=local_block_ids,
+             # P workers don't need to receive tp_size from proxy here.
+             tp_size=kv_transfer_params.get("tp_size", 1),
+         )
+
+     def add_new_req_to_save(
+         self,
+         request_id: ReqId,
+         local_block_ids: list[int],
+         kv_transfer_params: dict[str, Any],
+     ):
+         self.reqs_to_save[request_id] = self._add_new_req(
+             local_block_ids, kv_transfer_params
+         )
+
+     def add_new_req_to_recv(
+         self,
+         request_id: ReqId,
+         local_block_ids: list[int],
+         kv_transfer_params: dict[str, Any],
+     ):
+         req = self._add_new_req(local_block_ids, kv_transfer_params)
+         req.remote = RemoteMeta(
+             block_ids=kv_transfer_params["remote_block_ids"],
+             engine_id=kv_transfer_params["remote_engine_id"],
+             request_id=kv_transfer_params["remote_request_id"],
+             host=kv_transfer_params["remote_host"],
+             port=kv_transfer_params["remote_port"],
+         )
+         self.reqs_to_recv[request_id] = req
+
+
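As a usage sketch, the scheduler side fills this metadata from a request's kv_transfer_params; the dict keys below are exactly the ones add_new_req_to_recv requires, while the IDs, addresses, and block numbers are illustrative:

meta = NixlConnectorMetadata()
meta.add_new_req_to_recv(
    request_id="req-0",
    local_block_ids=[0, 1, 2],
    kv_transfer_params={
        "remote_block_ids": [7, 8, 9],
        "remote_engine_id": "prefill-engine",
        "remote_request_id": "req-0-prefill",
        "remote_host": "10.0.0.1",
        "remote_port": 5600,
        "tp_size": 1,
    },
)
assert meta.reqs_to_recv["req-0"].remote is not None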
+ class NixlConnector(KVConnectorBase_V1):
+     def __init__(
+         self,
+         vllm_config: VllmConfig,
+         role: KVConnectorRole,
+         kv_cache_config: Optional["KVCacheConfig"] = None,
+     ):
+         super().__init__(vllm_config, role, kv_cache_config)
+
+         assert vllm_config.kv_transfer_config is not None
+         assert vllm_config.kv_transfer_config.engine_id is not None
+         self.engine_id: EngineId = vllm_config.kv_transfer_config.engine_id
+
+         if role == KVConnectorRole.SCHEDULER:
+             self.connector_scheduler: NixlConnectorScheduler | None = (
+                 NixlConnectorScheduler(vllm_config, self.engine_id)
+             )
+             self.connector_worker: NixlConnectorWorker | None = None
+         elif role == KVConnectorRole.WORKER:
+             self.connector_scheduler = None
+             self.connector_worker = NixlConnectorWorker(vllm_config, self.engine_id)
+
+     ############################################################
+     # Class Methods
+     ############################################################
+     @classmethod
+     def get_required_kvcache_layout(cls, vllm_config: VllmConfig):
+         if vllm_config.model_config is None:
+             logger.warning_once(
+                 "Unable to detect current vLLM config. "
+                 "Falling back to the default kv cache layout."
+             )
+             return None
+         use_mla = vllm_config.model_config.use_mla
+         if use_mla:
+             # Return None for MLA: the layout should not matter in that
+             # case, so we fall back to the default behavior.
+             return None
+         logger.info_once(
+             "NixlConnector setting KV cache layout to HND for better xfer performance."
+         )
+         return "HND"
+
+     ############################################################
+     # Scheduler Side Methods
+     ############################################################
+
+     def get_num_new_matched_tokens(
+         self, request: "Request", num_computed_tokens: int
+     ) -> tuple[int | None, bool]:
+         assert self.connector_scheduler is not None
+         return self.connector_scheduler.get_num_new_matched_tokens(
+             request, num_computed_tokens
+         )
+
+     def update_state_after_alloc(
+         self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
+     ):
+         assert self.connector_scheduler is not None
+         return self.connector_scheduler.update_state_after_alloc(
+             request, blocks, num_external_tokens
+         )
+
+     def build_connector_meta(
+         self,
+         scheduler_output: SchedulerOutput,
+     ) -> KVConnectorMetadata:
+         assert self.connector_scheduler is not None
+         return self.connector_scheduler.build_connector_meta(scheduler_output)
+
+     def request_finished(
+         self,
+         request: "Request",
+         block_ids: list[int],
+     ) -> tuple[bool, dict[str, Any] | None]:
+         assert self.connector_scheduler is not None
+         return self.connector_scheduler.request_finished(request, block_ids)
+
+     def set_xfer_handshake_metadata(
+         self, metadata: dict[int, KVConnectorHandshakeMetadata]
+     ) -> None:
+         """
+         Set the KV connector handshake metadata for this connector.
+
+         Args:
+             metadata (dict): the handshake metadata to set.
+         """
+         assert self.connector_scheduler is not None
+         self.connector_scheduler.set_xfer_handshake_metadata(metadata)
+
+     ############################################################
+     # Worker Side Methods
+     ############################################################
+     def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
+         assert self.connector_worker is not None
+         self.connector_worker.register_kv_caches(kv_caches)
+
+     def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp):
+         assert self.connector_worker is not None
+         self.connector_worker.set_host_xfer_buffer_ops(copy_operation)
+
+     def get_finished(self, finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
+         """Get the finished recving and sending requests."""
+         assert self.connector_worker is not None
+         return self.connector_worker.get_finished()
+
+     def get_block_ids_with_load_errors(self) -> set[int]:
+         """Get block IDs that failed to load via NIXL."""
+         assert self.connector_worker is not None
+         return self.connector_worker.get_block_ids_with_load_errors()
+
+     def get_kv_connector_stats(self) -> KVConnectorStats | None:
+         if self.connector_worker is None:
+             return None
+         return self.connector_worker.get_kv_connector_stats()
+
+     @classmethod
+     def build_kv_connector_stats(
+         cls, data: dict[str, Any] | None = None
+     ) -> KVConnectorStats | None:
+         return (
+             NixlKVConnectorStats(data=data)
+             if data is not None
+             else NixlKVConnectorStats()
+         )
+
+     @classmethod
+     def build_prom_metrics(
+         cls,
+         vllm_config: VllmConfig,
+         metric_types: dict[type[PromMetric], type[PromMetricT]],
+         labelnames: list[str],
+         per_engine_labelvalues: dict[int, list[object]],
+     ) -> KVConnectorPromMetrics:
+         return NixlPromMetrics(
+             vllm_config, metric_types, labelnames, per_engine_labelvalues
+         )
+
+     def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None:
+         assert self.connector_worker is not None
+         assert isinstance(self._connector_metadata, NixlConnectorMetadata)
+         self.connector_worker.start_load_kv(self._connector_metadata)
+
+     def wait_for_layer_load(self, layer_name: str) -> None:
+         """NixlConnector does not do layerwise saving."""
+         pass
+
+     def save_kv_layer(
+         self,
+         layer_name: str,
+         kv_layer: torch.Tensor,
+         attn_metadata: AttentionMetadata,
+         **kwargs,
+     ) -> None:
+         """NixlConnector does not save explicitly."""
+         pass
+
+     def wait_for_save(self):
+         assert self.connector_worker is not None
+         assert isinstance(self._connector_metadata, NixlConnectorMetadata)
+         if self.connector_worker.use_host_buffer and self.connector_worker.copy_blocks:
+             self.connector_worker.save_kv_to_host(self._connector_metadata)
+
+     def shutdown(self):
+         if self.connector_worker is not None:
+             self.connector_worker.shutdown()
+         if self.connector_scheduler is not None:
+             self.connector_scheduler.shutdown()
+
+     def get_handshake_metadata(self) -> KVConnectorHandshakeMetadata | None:
+         """
+         Get the KVConnector handshake metadata for this connector.
+         This metadata is used for out-of-band connector handshake
+         between P/D workers.
+
+         Returns:
+             KVConnectorHandshakeMetadata: the handshake metadata.
+             None if no handshake metadata is available.
+         """
+         assert self.connector_worker is not None
+         return self.connector_worker.xfer_handshake_metadata
+
+
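Construction is role-based: the scheduler process builds only the scheduler half and each worker process builds only the worker half. A hypothetical instantiation, assuming vllm_config is a fully populated VllmConfig with kv_transfer_config and engine_id set (the worker half additionally requires NIXL and the distributed TP state to be initialized):

scheduler_side = NixlConnector(vllm_config, KVConnectorRole.SCHEDULER)
worker_side = NixlConnector(vllm_config, KVConnectorRole.WORKER)

assert scheduler_side.connector_worker is None
assert worker_side.connector_scheduler is None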
+ class NixlConnectorScheduler:
+     """Implementation of Scheduler side methods"""
+
+     def __init__(self, vllm_config: VllmConfig, engine_id: str):
+         self.vllm_config = vllm_config
+         self.block_size = vllm_config.cache_config.block_size
+         self.engine_id: EngineId = engine_id
+         self.side_channel_host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
+         self.side_channel_port = (
+             envs.VLLM_NIXL_SIDE_CHANNEL_PORT
+             + vllm_config.parallel_config.data_parallel_index
+         )
+         assert vllm_config.kv_transfer_config is not None
+         if current_platform.device_type == "cpu":
+             self.use_host_buffer = False
+         else:
+             self.use_host_buffer = (
+                 vllm_config.kv_transfer_config.kv_buffer_device == "cpu"
+             )
+
+         logger.info("Initializing NIXL Scheduler %s", engine_id)
+
+         # Background thread for handling new handshake requests.
+         self._nixl_handshake_listener_t: threading.Thread | None = None
+         self._encoded_xfer_handshake_metadata: dict[int, Any] = {}
+         self._stop_event = threading.Event()
+
+         # Requests that need to start recv/send.
+         # New requests are added by update_state_after_alloc in
+         # the scheduler. Used to make metadata passed to Worker.
+         self._reqs_need_recv: dict[ReqId, tuple[Request, list[int]]] = {}
+         self._reqs_need_save: dict[ReqId, Request] = {}
+         # Reqs to send and their expiration time
+         self._reqs_need_send: dict[ReqId, float] = {}
+         self._reqs_in_batch: set[ReqId] = set()
+         # Reqs to remove from the processed set because they will not be
+         # sent after remote prefill, or were aborted.
+         self._reqs_not_processed: set[ReqId] = set()
+
+     def shutdown(self):
+         self._stop_event.set()
+         if self._nixl_handshake_listener_t is not None:
+             self._nixl_handshake_listener_t.join()
+             self._nixl_handshake_listener_t = None
+
+     def set_xfer_handshake_metadata(
+         self, metadata: dict[int, KVConnectorHandshakeMetadata]
+     ) -> None:
+         """
+         Set the KV connector handshake metadata for this connector.
+
+         Args:
+             metadata (dict): the handshake metadata to set.
+         """
+         encoded_data: dict[int, bytes] = {}
+         encoder = msgspec.msgpack.Encoder()
+         for tp_rank, rank_metadata in metadata.items():
+             if not isinstance(rank_metadata, NixlHandshakePayload):
+                 raise ValueError(
+                     "NixlConnectorScheduler expects NixlHandshakePayload for "
+                     "handshake metadata."
+                 )
+             encoded_data[tp_rank] = encoder.encode(rank_metadata)
+             logger.debug(
+                 "Tp rank %d: encoded NixlHandshakePayload size: %s bytes",
+                 tp_rank,
+                 str(len(encoded_data[tp_rank])),
+             )
+         self._encoded_xfer_handshake_metadata = encoded_data
+
+         # Only start the listener when we have metadata to serve.
+         if self._nixl_handshake_listener_t is None:
+             ready_event = threading.Event()
+             self._nixl_handshake_listener_t = threading.Thread(
+                 target=self._nixl_handshake_listener,
+                 args=(
+                     encoded_data,
+                     ready_event,
+                     self._stop_event,
+                     self.side_channel_port,
+                 ),
+                 daemon=True,
+                 name="nixl_handshake_listener",
+             )
+             self._nixl_handshake_listener_t.start()
+             ready_event.wait()  # Wait for listener ZMQ socket to be ready.
+
+     @staticmethod
+     def _nixl_handshake_listener(
+         encoded_data: dict[int, Any],
+         ready_event: threading.Event,
+         stop_event: threading.Event,
+         port: int,
+     ):
+         """Background thread for getting new NIXL handshakes."""
+         # NOTE(rob): this is a simple implementation. We will move
+         # to a better approach via HTTP endpoint soon.
+
+         # Listen for new requests for metadata.
+         host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
+         path = make_zmq_path("tcp", host, port)
+         logger.debug("Starting to listen on path: %s", path)
+         with zmq_ctx(zmq.ROUTER, path) as sock:
+             sock.setsockopt(zmq.RCVTIMEO, 1000)
+             ready_event.set()
+             while True:
+                 try:
+                     identity, _, msg = sock.recv_multipart()
+                 except zmq.Again:
+                     if stop_event.is_set():
+                         break
+                     continue
+                 # Decode the message which contains (GET_META_MSG, rank)
+                 msg, target_tp_rank = msgspec.msgpack.decode(msg)
+                 logger.debug(
+                     "Received message for tp rank %s",
+                     target_tp_rank,
+                 )
+                 if msg != GET_META_MSG:
+                     logger.warning("Connection listener got unexpected message %s", msg)
+                 sock.send_multipart((identity, b"", encoded_data[target_tp_rank]))
+
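The requesting side of this exchange appears later in the file; a minimal client counterpart to the listener above would msgpack-encode (GET_META_MSG, tp_rank) and read back the encoded NixlHandshakePayload bytes. A sketch, not the connector's actual client code (a REQ socket pairs naturally with the ROUTER above):

import msgspec
import zmq

def fetch_handshake_payload_sketch(host: str, port: int, tp_rank: int) -> bytes:
    # Ask the listener for the encoded payload of one remote TP rank.
    with zmq.Context() as ctx, ctx.socket(zmq.REQ) as sock:
        sock.connect(f"tcp://{host}:{port}")
        sock.send(msgspec.msgpack.encode((GET_META_MSG, tp_rank)))
        # Raw NixlHandshakePayload bytes; decode with the two-phase scheme.
        return sock.recv()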
+     def get_num_new_matched_tokens(
+         self, request: "Request", num_computed_tokens: int
+     ) -> tuple[int, bool]:
+         """
+         For remote prefill, pull all prompt blocks from remote
+         asynchronously relative to engine execution.
+
+         Args:
+             request (Request): the request object.
+             num_computed_tokens (int): the number of locally
+                 computed tokens for this request
+         Returns:
+             * the number of tokens that can be loaded from the
+               external KV cache beyond what is already computed.
+             * true if the external KV cache tokens will be loaded
+               asynchronously (between scheduler steps).
+         """
+
+         params = request.kv_transfer_params
+         logger.debug(
+             "NIXLConnector get_num_new_matched_tokens: "
+             "num_computed_tokens=%s, kv_transfer_params=%s",
+             num_computed_tokens,
+             params,
+         )
+
+         if params is not None and params.get("do_remote_prefill"):
+             # Remote prefill: get all prompt blocks from remote.
+             token_ids = request.prompt_token_ids or []
+             count = len(token_ids) - num_computed_tokens
+             if count > 0:
+                 return count, True
+
+         # No remote prefill for this request.
+         return 0, False
+
+     def update_state_after_alloc(
+         self, request: "Request", blocks: "KVCacheBlocks", num_external_tokens: int
+     ):
+         params = request.kv_transfer_params
+         logger.debug(
+             "NIXLConnector update_state_after_alloc: "
+             "num_external_tokens=%s, kv_transfer_params=%s",
+             num_external_tokens,
+             params,
+         )
+
+         if not params:
+             return
+
+         if params.get("do_remote_decode"):
+             self._reqs_in_batch.add(request.request_id)
+         if self.use_host_buffer and params.get("do_remote_decode"):
+             # NOTE: when the accelerator is not directly supported by NIXL,
+             # prefilled blocks need to be saved to host memory before transfer.
+             self._reqs_need_save[request.request_id] = request
+         elif params.get("do_remote_prefill"):
+             if params.get("remote_block_ids"):
+                 if all(
+                     p in params
+                     for p in (
+                         "remote_engine_id",
+                         "remote_request_id",
+                         "remote_host",
+                         "remote_port",
+                     )
+                 ):
+                     # If there are remote blocks but num_external_tokens == 0,
+                     # we have a full prefix cache hit on the D worker. We need
+                     # to call send_notif in _read_blocks to free the memory on
+                     # the P.
+                     local_block_ids = (
+                         blocks.get_unhashed_block_ids()
+                         if num_external_tokens > 0
+                         else []
+                     )
+                     # Get unhashed blocks to pull from remote.
+                     self._reqs_need_recv[request.request_id] = (
+                         request,
+                         local_block_ids,
+                     )
+
+                 else:
+                     logger.warning(
+                         "Got invalid KVTransferParams: %s. This "
+                         "request will not utilize KVTransfer",
+                         params,
+                     )
+             else:
+                 assert num_external_tokens == 0
+             # Only trigger 1 KV transfer per request.
+             params["do_remote_prefill"] = False
+
+     def build_connector_meta(
+         self,
+         scheduler_output: SchedulerOutput,
+     ) -> KVConnectorMetadata:
+         meta = NixlConnectorMetadata()
+
+         # Loop through scheduled reqs and convert to ReqMeta.
+         for req_id, (req, block_ids) in self._reqs_need_recv.items():
+             assert req.kv_transfer_params is not None
+             meta.add_new_req_to_recv(
+                 request_id=req_id,
+                 local_block_ids=block_ids,
+                 kv_transfer_params=req.kv_transfer_params,
+             )
+
+         # NOTE: For the prefill side, there is a chance that an early-added
+         # request is a chunked prefill, so we need to check whether new
+         # blocks were added.
+         for req_id, new_block_id_groups, _ in yield_req_data(scheduler_output):
+             req_to_save = self._reqs_need_save.get(req_id)
+             if req_to_save is None or new_block_id_groups is None:
+                 continue
+             req = req_to_save
+
+             assert req.kv_transfer_params is not None
+             meta.add_new_req_to_save(
+                 request_id=req_id,
+                 local_block_ids=new_block_id_groups[0],
+                 kv_transfer_params=req.kv_transfer_params,
+             )
+             assert scheduler_output.num_scheduled_tokens is not None
+             num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id]
+             is_partial = (
+                 req.num_computed_tokens + num_scheduled_tokens
+             ) < req.num_prompt_tokens
+             if not is_partial:
+                 # For non-partial prefills, once new req_meta is scheduled, it
+                 # can be removed from _reqs_need_save.
+                 # For the partial prefill case, we retain the request in
+                 # _reqs_need_save until all blocks are scheduled with req_meta.
+                 # Therefore, only pop if `not is_partial`.
+                 self._reqs_need_save.pop(req_id)
+
+         meta.reqs_to_send = self._reqs_need_send
+         meta.reqs_in_batch = self._reqs_in_batch
+         meta.reqs_not_processed = self._reqs_not_processed
+
+         # Clear the lists once workers start the transfers
+         self._reqs_need_recv.clear()
+         self._reqs_in_batch = set()
+         self._reqs_not_processed = set()
+         self._reqs_need_send = {}
+
+         return meta
+
+     def request_finished(
+         self,
+         request: "Request",
+         block_ids: list[int],
+     ) -> tuple[bool, dict[str, Any] | None]:
+         """
+         Once a request is finished, determine whether request blocks
+         should be freed now or will be sent asynchronously and freed later.
+         """
+         from vllm.v1.request import RequestStatus
+
+         params = request.kv_transfer_params
+         logger.debug(
+             "NIXLConnector request_finished(%s), request_status=%s, "
+             "kv_transfer_params=%s",
+             request.request_id,
+             request.status,
+             params,
+         )
+         if not params:
+             return False, None
+
+         if params.get("do_remote_prefill"):
+             # If do_remote_prefill is still True when the request is finished,
+             # update_state_after_alloc must not have been called (the request
+             # must have been aborted before it was scheduled).
+             # To avoid stranding the prefill blocks in the prefill instance,
+             # we must add empty block_ids to _reqs_need_recv so that our
+             # worker side will notify and free blocks in the prefill instance.
+             self._reqs_need_recv[request.request_id] = (request, [])
+             params["do_remote_prefill"] = False
+             return False, None
+
+         if not params.get("do_remote_decode"):
+             return False, None
+         if request.status != RequestStatus.FINISHED_LENGTH_CAPPED:
+             # Also include the case of a P/D prefill request with immediate
+             # block free (e.g. abort). Stop tracking this request.
+             self._reqs_not_processed.add(request.request_id)
+             # Clear _reqs_need_save if a request is aborted as partial prefill.
+             self._reqs_need_save.pop(request.request_id, None)
+             return False, None
+
+         # TODO: check whether block_ids can actually ever be empty. If not,
+         # we could remove the conditional below.
+         delay_free_blocks = len(block_ids) > 0
+
+         if delay_free_blocks:
+             # Prefill request on remote. It will be read from D upon completion
+             logger.debug(
+                 "NIXLConnector request_finished(%s) waiting for %d seconds "
+                 "for remote decode to fetch blocks",
+                 request.request_id,
+                 envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT,
+             )
+             self._reqs_need_send[request.request_id] = (
+                 time.perf_counter() + envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT
+             )
+
+         return delay_free_blocks, dict(
+             do_remote_prefill=True,
+             do_remote_decode=False,
+             remote_block_ids=block_ids,
+             remote_engine_id=self.engine_id,
+             remote_request_id=request.request_id,
+             remote_host=self.side_channel_host,
+             remote_port=self.side_channel_port,
+             tp_size=self.vllm_config.parallel_config.tensor_parallel_size,
+         )
+
+
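For a prefill-side request that finished normally (FINISHED_LENGTH_CAPPED) with do_remote_decode set, the method returns (True, params): the blocks stay alive until the decode side pulls them or the timeout expires. The returned params look like the following, with illustrative values:

# Illustrative return value of request_finished on the prefill instance:
delay_free_blocks = True
kv_transfer_params = {
    "do_remote_prefill": True,   # tells the decode side to pull these blocks
    "do_remote_decode": False,
    "remote_block_ids": [4, 5, 6],
    "remote_engine_id": "prefill-engine",
    "remote_request_id": "req-0",
    "remote_host": "10.0.0.1",   # side channel host
    "remote_port": 5601,         # side channel port (base + DP index)
    "tp_size": 2,
}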
+ class NixlConnectorWorker:
+     """Implementation of Worker side methods"""
+
+     def __init__(self, vllm_config: VllmConfig, engine_id: str):
+         if NixlWrapper is None:
+             logger.error("NIXL is not available")
+             raise RuntimeError("NIXL is not available")
+         logger.info("Initializing NIXL wrapper")
+         logger.info("Initializing NIXL worker %s", engine_id)
+
+         # Config.
+         self.vllm_config = vllm_config
+         self.block_size = vllm_config.cache_config.block_size
+
+         if vllm_config.kv_transfer_config is None:
+             raise ValueError("kv_transfer_config must be set for NixlConnector")
+         self.kv_transfer_config = vllm_config.kv_transfer_config
+
+         self.nixl_backends = vllm_config.kv_transfer_config.get_from_extra_config(
+             "backends", ["UCX"]
+         )
+
+         # Agent.
+         non_ucx_backends = [b for b in self.nixl_backends if b != "UCX"]
+         # Configure NIXL num_threads to avoid UAR exhaustion on Mellanox NICs.
+         # Each UCX thread allocates UARs (doorbell pages) via DevX, and
+         # excessive NIXL UAR usage can exhaust NIC UAR space. This can cause
+         # components like NVSHMEM (used by DeepEP kernels) to fail during RDMA
+         # initialization with "mlx5dv_devx_alloc_uar" errors.
+         # Ref: https://network.nvidia.com/files/doc-2020/ethernet-adapters-programming-manual.pdf#page=63
+         num_threads = vllm_config.kv_transfer_config.get_from_extra_config(
+             "num_threads", 4
+         )
+         if nixl_agent_config is None:
+             config = None
+         else:
+             # Enable telemetry by default for NIXL 0.7.1 and above.
+             config = (
+                 nixl_agent_config(backends=self.nixl_backends, capture_telemetry=True)
+                 if len(non_ucx_backends) > 0
+                 else nixl_agent_config(num_threads=num_threads, capture_telemetry=True)
+             )
+
+         self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), config)
+         # Map of engine_id -> {rank0: agent_name0, rank1: agent_name1..}.
+         self._remote_agents: dict[EngineId, dict[int, str]] = defaultdict(dict)
+
+         # Metadata.
+         self.engine_id: EngineId = engine_id
+         self.tp_rank = get_tensor_model_parallel_rank()
+         self.world_size = get_tensor_model_parallel_world_size()
+         self.tp_group = get_tp_group()
+         self.num_blocks = 0
+         self.enable_permute_local_kv = False
+
+         # KV Caches and nixl tracking data.
+         self.device_type = current_platform.device_type
+         self.kv_buffer_device: str = vllm_config.kv_transfer_config.kv_buffer_device
+         if self.device_type not in _NIXL_SUPPORTED_DEVICE:
+             raise RuntimeError(f"{self.device_type} is not supported.")
+         elif self.kv_buffer_device not in _NIXL_SUPPORTED_DEVICE[self.device_type]:
+             raise RuntimeError(
+                 f"{self.device_type} with {self.kv_buffer_device} kv_buffer "
+                 "is not supported."
+             )
+         self.device_kv_caches: dict[str, torch.Tensor] = {}
+
+         # CPU KV buffer for transfers, used when device memory cannot be
+         # registered with NIXL.
+         self.host_xfer_buffers: dict[str, torch.Tensor] = {}
+         if self.device_type == "cpu":
+             self.use_host_buffer = False
+         else:
+             self.use_host_buffer = self.kv_buffer_device == "cpu"
+
+         # Support for out-of-tree platforms that can't register the NIXL
+         # memory type based on kv_buffer_device.
+         nixl_memory_type = current_platform.get_nixl_memory_type()
+         if nixl_memory_type is None:
+             if self.kv_buffer_device == "cuda":
+                 nixl_memory_type = "VRAM"
+             elif self.kv_buffer_device == "cpu":
+                 nixl_memory_type = "DRAM"
+         if nixl_memory_type is None:
+             raise RuntimeError(
+                 f"{self.device_type} with {self.kv_buffer_device} kv_buffer "
+                 "is not supported."
+             )
+         self.nixl_memory_type = nixl_memory_type
+
+         # Note: host xfer buffer ops when use_host_buffer is True
+         self.copy_blocks: CopyBlocksOp | None = None
+
+         self.device_id: int = 0
+         # Map of engine_id -> kv_caches_base_addr. In the TP case, the
+         # current rank may pull from multiple remote TP workers.
+         # EngineId, dict[int, list[int]] -> engine_id, tp_rank, base_addr_for_layer
+         self.kv_caches_base_addr = defaultdict[EngineId, dict[int, list[int]]](dict)
904
+
+        # Number of NIXL regions. Currently one region per cache
+        # (so 1 per layer for MLA, otherwise 2 per layer)
+        self.num_regions = 0
+        self.num_layers = 0
+
+        # nixl_prepped_dlist_handle.
+        self.src_xfer_handles_by_block_size: dict[int, int] = {}
+        # Populated dynamically during handshake based on remote configuration.
+        # Keep track of regions at different tp_ratio values. tp_ratio->handles
+        self.src_xfer_handles_by_tp_ratio: dict[int, list[int]] = {}
+        # Map of engine_id -> {tp_rank: nixl_prepped_dlist_handle (int)}.
+        self.dst_xfer_side_handles = defaultdict[EngineId, dict[int, int]](dict)
+
+        # Map of engine_id -> num_blocks. All ranks in the same deployment will
+        # have the same number of blocks.
+        self.dst_num_blocks: dict[EngineId, int] = {}
+        self._registered_descs: list[Any] = []
+
+        # In progress transfers.
+        # [req_id -> list[handle]]
+        self._recving_metadata: dict[ReqId, ReqMeta] = {}
+        self._recving_transfers = defaultdict[ReqId, list[TransferHandle]](list)
+        # Track the expiration time of requests that are waiting to be sent.
+        self._reqs_to_send: dict[ReqId, float] = {}
+        # Set of requests that have been part of a batch, regardless of status.
+        self._reqs_to_process: set[ReqId] = set()
+
+        # invalid blocks from failed NIXL operations
+        self._invalid_block_ids: set[int] = set()
+        # requests that skipped transfer (handshake or transfer failures)
+        self._failed_recv_reqs: set[ReqId] = set()
+
+        # Handshake metadata of this worker for NIXL transfers.
+        self.xfer_handshake_metadata: NixlHandshakePayload | None = None
+        # Background thread for initializing new NIXL handshakes.
+        self._handshake_initiation_executor = ThreadPoolExecutor(
+            # NIXL is not guaranteed to be thread-safe, limit 1 worker.
+            max_workers=1,
+            thread_name_prefix="vllm-nixl-handshake-initiator",
+        )
+        self._ready_requests = queue.Queue[tuple[ReqId, ReqMeta]]()
+        self._handshake_futures: dict[EngineId, Future[dict[int, str]]] = {}
+        # Protects _handshake_futures and _remote_agents.
+        self._handshake_lock = threading.RLock()
+
+        self.block_size = vllm_config.cache_config.block_size
+        self.model_config = vllm_config.model_config
+        self.cache_config = vllm_config.cache_config
+
+        # TODO(mgoin): remove this once we have hybrid memory allocator
+        # Optimization for models with local attention (Llama 4)
+        # List of block window sizes for each layer for local attention
+        self.block_window_per_layer: list[int | None] = []
+        self.use_mla = self.model_config.use_mla
+
+        # Get the attention backend from the first layer
+        # NOTE (NickLucche) models with multiple backends are not supported yet
+        backend = get_current_attn_backend(vllm_config)
+
+        self.backend_name = backend.get_name()
+        self.kv_cache_layout = get_kv_cache_layout()
+        self.host_buffer_kv_cache_layout = self.kv_cache_layout
+        logger.debug("Detected attention backend %s", self.backend_name)
+        logger.debug("Detected kv cache layout %s", self.kv_cache_layout)
+
+        self.compat_hash = compute_nixl_compatibility_hash(
+            self.vllm_config, self.backend_name
+        )
+        self.enforce_compat_hash = self.kv_transfer_config.get_from_extra_config(
+            "enforce_handshake_compat", True
+        )
+
+        self._tp_size: dict[EngineId, int] = {self.engine_id: self.world_size}
+        self._block_size: dict[EngineId, int] = {self.engine_id: self.block_size}
+        # With heterogeneous TP, P must wait for all assigned D TP workers to
+        # finish reading before safely freeing the blocks.
+        self.consumer_notification_counts_by_req = defaultdict[ReqId, int](int)
+        self.xfer_stats = NixlKVConnectorStats()
+
+        self.kv_topo = TpKVTopology(
+            tp_rank=self.tp_rank,
+            engine_id=self.engine_id,
+            remote_tp_size=self._tp_size,  # shared state
+            remote_block_size=self._block_size,  # shared state
+            is_mla=self.use_mla,
+            total_num_kv_heads=self.model_config.get_total_num_kv_heads(),
+            attn_backend=backend,
+        )
+        self._physical_blocks_per_logical_kv_block = 1
+
+    def _nixl_handshake(
+        self,
+        host: str,
+        port: int,
+        remote_tp_size: int,
+        expected_engine_id: str,
+    ) -> dict[int, str]:
+        """Do a NIXL handshake with a remote instance."""
+        # When target instance TP > local TP, we need to perform multiple
+        # handshakes. Do it in a single background job for simplicity.
+        # Regardless, only handshake with the remote TP rank(s) that current
+        # local rank will read from. Note that with homogeneous TP,
+        # this happens to be the same single rank_i.
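+        # Illustrative example (not from the source): with local TP=2 and
+        # remote TP=4, get_target_remote_ranks would assign two remote ranks
+        # per local rank, e.g. rank 0 -> [0, 1] and rank 1 -> [2, 3], so this
+        # loop below performs two handshakes per local worker.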
+        p_remote_ranks = self.kv_topo.get_target_remote_ranks(remote_tp_size)
+        remote_rank_to_agent_name = {}
+        path = make_zmq_path("tcp", host, port)
+
+        with zmq_ctx(zmq.REQ, path) as sock:
+            for remote_rank in p_remote_ranks:
+                logger.debug(
+                    "Querying metadata on path: %s at remote tp rank %s",
+                    path,
+                    remote_rank,
+                )
+
+                start_time = time.perf_counter()
+                # Send query for the request.
+                msg = msgspec.msgpack.encode((GET_META_MSG, remote_rank))
+                # Set receive timeout to 5 seconds to avoid hanging on dead server
+                sock.setsockopt(zmq.RCVTIMEO, 5000)  # milliseconds
+                sock.send(msg)
+                handshake_bytes = sock.recv()
+
+                # Decode handshake payload to get compatibility hash
+                handshake_decoder = msgspec.msgpack.Decoder(NixlHandshakePayload)
+                try:
+                    handshake_payload = handshake_decoder.decode(handshake_bytes)
+                except (msgspec.DecodeError, msgspec.ValidationError) as e:
+                    raise RuntimeError(
+ f"Failed to decode NixlHandshakePayload. This likely indicates "
1035
+ f"an incompatibility between connector version. Error: {e}"
1036
+                    ) from e
+
+                got_metadata_time = time.perf_counter()
+                logger.debug(
+                    "NIXL handshake: get metadata took: %s",
+                    got_metadata_time - start_time,
+                )
+
+                # Check compatibility hash BEFORE decoding agent metadata
+                if (
+                    self.enforce_compat_hash
+                    and handshake_payload.compatibility_hash != self.compat_hash
+                ):
+                    raise RuntimeError(
+                        f"NIXL compatibility hash mismatch. "
+                        f"Local: {self.compat_hash}, "
+                        f"Remote: {handshake_payload.compatibility_hash}. "
+                        f"Prefill and decode instances have incompatible "
+                        f"configurations. This may be due to: different vLLM versions,"
+                        f" models, dtypes, KV cache layouts, attention backends, etc. "
+ f"Both instances must use identical configurations."
1057
+ f"Disable this check using "
1058
+ f'--kv-transfer-config \'{{"kv_connector_extra_config": '
1059
+ f'{{"enforce_handshake_compat": false}}}}\''
1060
+                    )
+
+                logger.info(
+                    "NIXL compatibility check passed (hash: %s)",
+                    handshake_payload.compatibility_hash,
+                )
+
+                # Decode agent metadata
+                metadata_decoder = msgspec.msgpack.Decoder(NixlAgentMetadata)
+                try:
+                    metadata = metadata_decoder.decode(
+                        handshake_payload.agent_metadata_bytes
+                    )
+                except (msgspec.DecodeError, msgspec.ValidationError) as e:
+                    # This should not happen if hash matched
+                    raise RuntimeError(
+                        f"Failed to decode NixlAgentMetadata. Error: {e}"
+                    ) from e
+
+                # Ensure engine id matches.
+                if metadata.engine_id != expected_engine_id:
+                    raise RuntimeError(
+ f"Remote NIXL agent engine ID mismatch. "
1083
+ f"Expected {expected_engine_id},"
1084
+ f"received {metadata.engine_id}."
1085
+                    )
+                setup_agent_time = time.perf_counter()
+
+                # Register Remote agent.
+                remote_agent_name = self.add_remote_agent(
+                    metadata, remote_rank, remote_tp_size
+                )
+                logger.debug(
+                    "NIXL handshake: add agent took: %s",
+                    setup_agent_time - got_metadata_time,
+                )
+                remote_rank_to_agent_name[remote_rank] = remote_agent_name
+        return remote_rank_to_agent_name
+
+    def initialize_host_xfer_buffer(self, kv_caches: dict[str, torch.Tensor]) -> None:
+        """
+        Initialize transfer buffer in CPU mem for accelerators
+        NOT directly supported by NIXL (e.g., tpu)
+        """
+        xfer_buffers: dict[str, torch.Tensor] = {}
+        inv_order = [0, 1, 3, 2, 4]
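+        # A sketch of the non-MLA case: an NHD cache of shape
+        # (2, num_blocks, block_size, num_kv_heads, head_dim) viewed through
+        # inv_order becomes (2, num_blocks, num_kv_heads, block_size,
+        # head_dim), i.e. the HND layout.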
+        try:
+            for layer_name, kv_cache in kv_caches.items():
+                kv_shape = kv_cache.shape
+                kv_dtype = kv_cache.dtype
+                permute_shape = False
+                if (
+                    self.kv_cache_layout == "NHD"
+                    and self.vllm_config.kv_transfer_config is not None
+                    and self.vllm_config.kv_transfer_config.enable_permute_local_kv
+                ):
+                    logger.info_once(
+                        "'enable_permute_local_kv' flag is enabled while "
+                        "device KV Layout is NHD. Init host buffer with"
+                        " HND to better support Decode/Prefill TP_ratio > 1."
+                    )
+                    # Since NHD will not support Decode/Prefill TP_ratio > 1,
+                    # we can leverage host_buffer for permute
+                    self.host_buffer_kv_cache_layout = "HND"
+                    kv_shape = (
+                        tuple(kv_shape[i] for i in inv_order)
+                        if not self.use_mla
+                        else kv_shape
+                    )
+                    permute_shape = not self.use_mla
+
+                xfer_buffers[layer_name] = torch.empty(
+                    kv_shape, dtype=kv_dtype, device="cpu"
+                )
+                if permute_shape:
+                    xfer_buffers[layer_name] = xfer_buffers[layer_name].permute(
+                        inv_order
+                    )
+        except MemoryError as e:
+            logger.error("NIXLConnectorWorker gets %s.", e)
+            raise
+
+        self.host_xfer_buffers = xfer_buffers
+
+    def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp):
+        """Assign copy (d2h, h2d) operations when host buffer is used."""
+        # Set a no-op if the host buffer is not cpu.
+        if self.kv_buffer_device != "cpu":
+            return
+        # Set a no-op if self.device_type is 'cpu'.
+        if self.device_type == "cpu":
+            return
+        assert self.use_host_buffer
+        self.copy_blocks = copy_operation
+
+    def _log_failure(
+        self,
+        failure_type: str,
+        req_id: str | None,
+        msg: str = "",
+        error: Exception | None = None,
+        meta: ReqMeta | None = None,
+        **extra_context,
+    ):
+        """Log transfer failure with structured context for easier debugging."""
+        context: dict[str, Any] = {
+            "failure_type": failure_type,
+            "request_id": req_id,
+            "engine_id": self.engine_id,
+        }
+        if meta is None and req_id is not None:
+            # Try to get metadata from in progress transfers when not provided
+            meta = self._recving_metadata.get(req_id)
+
+        if meta and meta.remote:
+            context.update(
+                {
+                    "remote_engine_id": meta.remote.engine_id,
+                    "remote_request_id": meta.remote.request_id,
+                    "remote_host": meta.remote.host,
+                    "remote_port": meta.remote.port,
+                    "num_local_blocks": len(meta.local_block_ids),
+                    "num_remote_blocks": len(meta.remote.block_ids),
+                    "local_block_ids_sample": meta.local_block_ids[:10],
+                }
+            )
+
+        context.update(extra_context)
+        if msg:
+            failure_type = f"{failure_type}. {msg}"
+
+        logger.error(
+            "NIXL transfer failure: %s | Context: %s",
+            failure_type,
+            context,
+            exc_info=error is not None,
+            stacklevel=2,
+        )
+
+    def _background_nixl_handshake(
+        self, req_id: str, remote_engine_id: EngineId, meta: ReqMeta
+    ):
+        # Do NIXL handshake in background and add to _ready_requests when done.
+        fut = self._handshake_futures.get(remote_engine_id)
+        if fut is None:
+            assert meta.remote is not None
+            fut = self._handshake_initiation_executor.submit(
+                self._nixl_handshake,
+                meta.remote.host,
+                meta.remote.port,
+                meta.tp_size,
+                remote_engine_id,
+            )
+            self._handshake_futures[remote_engine_id] = fut
+
+            def done_callback(f: Future[dict[int, str]], eid=remote_engine_id):
+                with self._handshake_lock:
+                    del self._handshake_futures[eid]
+                    try:
+                        self._remote_agents[eid] = f.result()
+                    except Exception as e:
+                        self._log_failure(
+                            failure_type="handshake_setup_failed",
+                            req_id=None,
+                            error=e,
+                            remote_engine_id=eid,
+                        )
+
+            fut.add_done_callback(done_callback)
+
+        # check handshake success before proceeding with request
+        def request_ready(f: Future[Any], entry=(req_id, meta)):
+            try:
+                # check if handshake succeeded
+                f.result()
+                self._ready_requests.put(entry)
+            except Exception as e:
+                # handshake failed - mark blocks as invalid
+                self._log_failure(
+                    failure_type="handshake_failed",
+                    req_id=req_id,
+                    error=e,
+                    meta=meta,
+                )
+                if req_meta := self._recving_metadata.get(req_id):
+                    self._invalid_block_ids.update(req_meta.local_block_ids)
+                self._failed_recv_reqs.add(req_id)
+
+        fut.add_done_callback(request_ready)
+
+    def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
+        """Register the KV Cache data in nixl."""
+
+        if self.use_host_buffer:
+            self.initialize_host_xfer_buffer(kv_caches=kv_caches)
+            assert len(self.host_xfer_buffers) == len(kv_caches), (
+                f"host_buffer: {len(self.host_xfer_buffers)}, "
+                f"kv_caches: {len(kv_caches)}"
+            )
+            xfer_buffers = self.host_xfer_buffers
+        else:
+            xfer_buffers = kv_caches
+            assert not self.host_xfer_buffers, (
+                "host_xfer_buffer should not be initialized when "
+                f"kv_buffer_device is {self.kv_buffer_device}"
+            )
+
+        logger.info(
+            "Registering KV_Caches. use_mla: %s, kv_buffer_device: %s, "
+            "use_host_buffer: %s",
+            self.use_mla,
+            self.kv_buffer_device,
+            self.use_host_buffer,
+        )
+
+        caches_data = []
+        # With hybrid allocator, layers can share a kv cache tensor
+        seen_base_addresses = []
+
+        # Note(tms): I modified this from the original region setup code.
+        # K and V are now in different regions. Advantage is that we can
+        # elegantly support MLA and any cases where the K and V tensors
+        # are non-contiguous (it's not locally guaranteed that they will be)
+        # Disadvantage is that the encoded NixlAgentMetadata is now larger
+        # (roughly 8KB vs 5KB).
+        # Conversely for FlashInfer, K and V are registered in the same region
+        # to better exploit the memory layout (ie num_blocks is the first dim).
+        split_k_and_v = self.kv_topo.split_k_and_v
+        tensor_size_bytes = None
+
+        # TODO (NickLucche): Get kernel_block_size in a cleaner way
+        # NHD default "view" for non-MLA cache
+        if self.device_type == "cpu":
+            block_size_position = -2
+        else:
+            block_size_position = -2 if self.use_mla else -3
+
+        # Enable different block lengths for different layers when MLA is used.
+        self.block_len_per_layer = list[int]()
+        self.slot_size_per_layer = list[int]()  # HD bytes in kv terms
+        for layer_name, cache_or_caches in xfer_buffers.items():
+            cache_list = cache_or_caches if split_k_and_v else [cache_or_caches]
+
+            for cache in cache_list:
+                base_addr = cache.data_ptr()
+                if base_addr in seen_base_addresses:
+                    continue
+
+                kernel_block_size = cache.shape[block_size_position]
+
+                if self.block_size != kernel_block_size:
+                    logger.info_once(
+                        "User-specified logical block size (%s) does not match"
+                        " physical kernel block size (%s). Using the latter.",
+                        self.block_size,
+                        kernel_block_size,
+                    )
+                    self._physical_blocks_per_logical_kv_block = (
+                        self.block_size // kernel_block_size
+                    )
+                    self.block_size = kernel_block_size
+                    self._block_size[self.engine_id] = kernel_block_size
+
+                seen_base_addresses.append(base_addr)
+                curr_tensor_size_bytes = cache.numel() * cache.element_size()
+
+                if tensor_size_bytes is None:
+                    tensor_size_bytes = curr_tensor_size_bytes
+                    self.num_blocks = cache.shape[0]
+
+                assert cache.shape[0] == self.num_blocks, (
+                    "All kv cache tensors must have the same number of blocks"
+                )
+
+                self.block_len_per_layer.append(
+                    curr_tensor_size_bytes // self.num_blocks
+                )
+                self.slot_size_per_layer.append(
+                    self.block_len_per_layer[-1] // self.block_size
+                )
+
+                if not self.use_mla:
+                    # Different kv cache shape is not supported by HeteroTP
+                    assert tensor_size_bytes == curr_tensor_size_bytes, (
+                        "All kv cache tensors must have the same size"
+                    )
+                # Need to make sure the device ID is non-negative for NIXL;
+                # Torch uses -1 to indicate CPU tensors.
+                self.device_id = max(cache.get_device(), 0)
+                caches_data.append(
+                    (base_addr, curr_tensor_size_bytes, self.device_id, "")
+                )
+
+        logger.debug(
+            "Different block lengths collected: %s", set(self.block_len_per_layer)
+        )
+        assert len(self.block_len_per_layer) == len(seen_base_addresses)
+        assert self.num_blocks != 0
+
+        self.kv_caches_base_addr[self.engine_id][self.tp_rank] = seen_base_addresses
+        self.num_regions = len(caches_data)
+        self.num_layers = len(xfer_buffers.keys())
+
+        descs = self.nixl_wrapper.get_reg_descs(caches_data, self.nixl_memory_type)
+        logger.debug("Registering descs: %s", caches_data)
+        self.nixl_wrapper.register_memory(descs, backends=self.nixl_backends)
+        logger.debug("Done registering descs")
+        self._registered_descs.append(descs)
+
+        self.device_kv_caches = kv_caches
+        self.dst_num_blocks[self.engine_id] = self.num_blocks
+        if self.kv_topo.is_kv_layout_blocks_first:
+            for i in range(len(self.slot_size_per_layer)):
+                assert self.slot_size_per_layer[i] % 2 == 0
+                self.slot_size_per_layer[i] //= 2
+
+            # NOTE (NickLucche) When FlashInfer is used, memory is registered
+            # with joint KV for each block. This minimizes the overhead in
+            # registerMem allowing faster descs queries. In order to be able to
+            # split on kv_heads dim as required by heterogeneous TP, one must
+            # be able to index K/V separately. Hence we double the number
+            # of 'virtual' regions here and halve `block_len` below.
+            self.num_regions *= 2
+
+        # Register local/src descr for NIXL xfer.
+        self.seen_base_addresses = seen_base_addresses
+        self.src_xfer_handles_by_block_size[self.block_size], self.src_blocks_data = (
+            self.register_local_xfer_handler(self.block_size)
+        )
+
+        # TODO(mgoin): Hybrid memory allocator is currently disabled for
+        # models with local attention (Llama 4). Can remove this once enabled.
+        if self.model_config.hf_config.model_type == "llama4":
+            from transformers import Llama4TextConfig
+
+            assert isinstance(self.model_config.hf_text_config, Llama4TextConfig)
+            llama4_config = self.model_config.hf_text_config
+            no_rope_layers = llama4_config.no_rope_layers
+            chunk_size = llama4_config.attention_chunk_size
+            chunk_block_size = math.ceil(chunk_size / self.block_size)
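+            # Illustrative numbers (not from the source): an
+            # attention_chunk_size of 8192 with block_size=16 yields a window
+            # of 512 blocks for the local-attention (RoPE) layers.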
+            for layer_idx in range(self.num_layers):
+                # no_rope_layers[layer_idx] == 0 means NoPE (global)
+                # Any other value means RoPE (local chunked)
+                is_local_attention = no_rope_layers[layer_idx] != 0
+                block_window = chunk_block_size if is_local_attention else None
+                self.block_window_per_layer.append(block_window)
+            logger.debug(
+                "Llama 4 block window per layer mapping: %s",
+                self.block_window_per_layer,
+            )
+            assert len(self.block_window_per_layer) == self.num_layers
+
+        # After KV Caches registered, listen for new connections.
+        agent_metadata = NixlAgentMetadata(
+            engine_id=self.engine_id,
+            agent_metadata=self.nixl_wrapper.get_agent_metadata(),
+            device_id=self.device_id,
+            kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id][self.tp_rank],
+            num_blocks=self.num_blocks,
+            block_lens=self.block_len_per_layer,
+            kv_cache_layout=self.kv_cache_layout
+            if not self.use_host_buffer
+            else self.host_buffer_kv_cache_layout,
+            block_size=self.block_size,
+        )
+        # Wrap metadata in payload with hash for defensive decoding
+        encoder = msgspec.msgpack.Encoder()
+        self.xfer_handshake_metadata = NixlHandshakePayload(
+            compatibility_hash=self.compat_hash,
+            agent_metadata_bytes=encoder.encode(agent_metadata),
+        )
+
+    def register_local_xfer_handler(
+        self,
+        block_size: int,
+    ) -> tuple[int, list[tuple[int, int, int]]]:
+        """
+        Register a local xfer handler using either the local block_size or
+        the remote block_size.
+
+        When the local block_size is the same as the remote block_size, we
+        use the local block_size to register the local_xfer_handler during
+        init.
+
+        When the remote block size is smaller than the local block size, we
+        need to register another local_xfer_handler using the remote block
+        length to ensure data copy correctness.
+        """
+        block_size_ratio = self.block_size // block_size
+        blocks_data = []
+        for i, base_addr in enumerate(self.seen_base_addresses):
+            # The new block_len uses the (smaller) prefill block_len, and
+            # num_blocks is scaled up by the same ratio.
+            kv_block_len = (
+                self.get_backend_aware_kv_block_len(layer_idx=i) // block_size_ratio
+            )
+            block_len_per_layer = self.block_len_per_layer[i] // block_size_ratio
+            num_blocks = self.num_blocks * block_size_ratio
+            for block_id in range(num_blocks):
+                block_offset = block_id * block_len_per_layer
+                addr = base_addr + block_offset
+                # (addr, len, device id)
+                blocks_data.append((addr, kv_block_len, self.device_id))
+
+            if self.kv_topo.is_kv_layout_blocks_first:
+                # Separate and interleave K/V regions to maintain the same
+                # descs ordering. This is needed for selecting contiguous heads
+                # when split across TP ranks.
+                for block_id in range(num_blocks):
+                    block_offset = block_id * block_len_per_layer
+                    addr = base_addr + block_offset
+                    # Register addresses for V cache (K registered first).
+                    v_addr = addr + kv_block_len
+                    blocks_data.append((v_addr, kv_block_len, self.device_id))
+        logger.debug(
+            "Created %s blocks for src engine %s and rank %s on device id %s",
+            len(blocks_data),
+            self.engine_id,
+            self.tp_rank,
+            self.device_id,
+        )
+
+        descs = self.nixl_wrapper.get_xfer_descs(blocks_data, self.nixl_memory_type)
+        # NIXL_INIT_AGENT to be used for preparations of local descs.
+        return self.nixl_wrapper.prep_xfer_dlist("NIXL_INIT_AGENT", descs), blocks_data
+
+    def add_remote_agent(
+        self,
+        nixl_agent_meta: NixlAgentMetadata,
+        remote_tp_rank: int = 0,
+        remote_tp_size: int = 1,
+    ) -> str:
+ """
1492
+ Add the remote NIXL agent and prepare the descriptors for reading cache
1493
+ blocks from remote.
1494
+
1495
+ In particular, handle both homogeneous and heterogeneous TP. The former
1496
+ requires local rank_i to read from remote rank_i.
1497
+ The latter, in the case of D.world_size < P.world_size, requires that a
1498
+ local (D) TP worker reads from multiple remote (P) TP workers.
1499
+ Conversely, assuming D.world_size > P.world_size, two or more local TP
1500
+ workers will read from a single remote TP worker.
1501
+
1502
+ Here's an example for the last case described above (non-MLA):
1503
+
1504
+ rank_offset p_remote_tp_rank
1505
+ (kv split no)
1506
+ --------------------------------
1507
+ 0 0 Worker0 ---- 1st half of KV ----> Worker0 [ KV Cache ]
1508
+ /
1509
+ 1 0 Worker1 ---- 2nd half of KV -----/
1510
+
1511
+ 0 1 Worker2 ---- 1st half of KV ----> Worker1 [ KV Cache ]
1512
+ /
1513
+ 1 1 Worker3 ---- 2nd half of KV -----/
1514
+
1515
+
1516
+ Decoder TP workers Prefix TP workers
1517
+ (world_size=4) (world_size=2)
1518
+ tp_ratio = 4 // 2 = 2
1519
+
1520
+ Considering the KV Caches, if P-Worker_i has cache size [2, num_blocksP, kv_heads, block_size, head_dim]
1521
+ then D-Worker_j has [2, num_blocksD, kv_heads//tp_ratio, block_size, head_dim]. Mind the "HND" layout format.
1522
+ Assuming num_blocksD >= num_blocksP, D-Worker0 reads from P-Worker0 by preparing the kv_heads//tp_ratio
1523
+ first heads from all the slots of all the blocks. D-Worker1 will do the same, but reading the second split
1524
+ along the kv_heads dimension, and so forth until "tp_ratio" D TP workers have pulled from P-Worker0.
1525
+
1526
+ Note that the above will also hold true for the homogeneous TP case, where tp_ratio evaluates to 1.
1527
+
1528
+ Regarding MLA case, the cache is replicated across TP workers so the rank_offset will just always be 0
1529
+ so that the whole cache is shared by "tp_ratio" D TP workers.
1530
+ """ # noqa: E501
+        engine_id = nixl_agent_meta.engine_id
+        # TODO re-evaluate refreshing for scaling/recovery
+        if remote_tp_rank in self._remote_agents.get(engine_id, {}):
+            logger.debug(
+ "Remote agent with engine_id %s and rank"
1536
+ "%s already exchanged metadata, skip handshake.",
1537
+                engine_id,
+                remote_tp_rank,
+            )
+            return self._remote_agents[engine_id][remote_tp_rank]
+
+        ### Register remote agent metadata
+        if engine_id not in self._tp_size:
+            self._tp_size[engine_id] = remote_tp_size
+        if engine_id not in self._block_size:
+            self._block_size[engine_id] = nixl_agent_meta.block_size
+
+        remote_agent_name = self.nixl_wrapper.add_remote_agent(
+            nixl_agent_meta.agent_metadata
+        )
+
+        # Create dst descs and xfer side handles. TP workers have same #blocks
+        # so we only register once per engine_id.
+        # Example:
+        # block_size_ratio > 1:
+        #   remote:       | 0| 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|
+        #   local origin: | 0| 1| 8| 12|
+        #   local mapped: | 0| 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|
+        block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(engine_id)
+
+        if engine_id not in self.dst_num_blocks:
+            self.dst_num_blocks[engine_id] = nixl_agent_meta.num_blocks
+
+        # Keep track of remote agent kv caches base addresses.
+        self.kv_caches_base_addr[engine_id][remote_tp_rank] = (
+            nixl_agent_meta.kv_caches_base_addr
+        )
+        self._validate_remote_agent_handshake(nixl_agent_meta, remote_tp_size)
+
+        # This is 1 when P and D `--tensor-parallel-size` match. Otherwise,
+        # this is the ratio between the two sizes.
+        tp_ratio = self.kv_topo.tp_ratio_from_engine_id(engine_id)
+
+        # Handle tp_size>num_kv_heads: replicate KV cache.
+        indexes_into_remote = (
+            not self.kv_topo.replicates_kv_cache(engine_id) and tp_ratio > 0
+        )
+
+        logger.debug(
+            "Registering remote agent (%s, rank %s) memory regions with tp_ratio %s",
+            engine_id,
+            remote_tp_rank,
+            tp_ratio,
+        )
+
+        ### (Optional) Register local agent memory regions. MLA is not split.
+        if (
+            tp_ratio < 0
+            and not self.use_mla
+            and tp_ratio not in self.src_xfer_handles_by_tp_ratio
+        ):
+            # Remote tp_size > local tp_size: read from multiple remote ranks.
+            # Logically "split" own regions into |tp_ratio| chunks. Mind that
+            # we only do this once per remote tp_size (replica-friendly).
+            self.src_xfer_handles_by_tp_ratio[tp_ratio] = []
+            for i in range(-tp_ratio):
+                blocks_data = []
+                for memory_region in self.src_blocks_data:
+                    addr, local_block_len, own_tp_rank = memory_region
+                    # Computing block len layer by layer allows for different
+                    # block sizes to be used.
+                    remote_block_len = local_block_len // (-tp_ratio)
+                    addr = addr + i * remote_block_len
+                    blocks_data.append((addr, remote_block_len, own_tp_rank))
+                descs = self.nixl_wrapper.get_xfer_descs(
+                    blocks_data, self.nixl_memory_type
+                )
+                handle = self.nixl_wrapper.prep_xfer_dlist("NIXL_INIT_AGENT", descs)
+                self.src_xfer_handles_by_tp_ratio[tp_ratio].append(handle)
+
+        ### Register remote agent memory regions
+        blocks_data = []
+        # With homogeneous TP, D pulls the whole kv cache from corresponding
+        # rank. With heterogeneous TP, prepare the descriptors by splitting the
+        # P KV cache along kv_head dim, of D worker's kv_head size (D>P).
+        # Eg. PTP1 DTP2 => P0 KV:[block0-KV_0 | block0-KV_1..].
+
+        # Register all remote blocks, but only the corresponding kv heads.
+        for i, base_addr in enumerate(nixl_agent_meta.kv_caches_base_addr):
+            # Read our whole local region size from remote.
+            local_block_len = self.get_backend_aware_kv_block_len(layer_idx=i)
+            remote_kv_block_len = local_block_len // block_size_ratio
+            if block_size_ratio > 1:
+                # using remote kv_block_len as transfer unit
+                local_block_len = remote_kv_block_len
+
+            if tp_ratio < 0 and not self.use_mla:
+                # Remote tp is bigger: read a chunk of local region from remote
+                local_block_len = local_block_len // (-tp_ratio)
+            rank_offset = (
+                self.tp_rank % tp_ratio * remote_kv_block_len
+                if indexes_into_remote
+                else 0
+            )
+            for block_id in range(nixl_agent_meta.num_blocks):
+                block_offset = block_id * nixl_agent_meta.block_lens[i]
+                # For each block, grab the heads chunk belonging to rank_i
+                # of size remote_nheads // tp_ratio, which correspond to
+                # self.block_len == remote_block_len//tp_ratio bytes.
+                addr = base_addr + block_offset + rank_offset
+                # (addr, len, device id)
+                blocks_data.append((addr, local_block_len, nixl_agent_meta.device_id))
+
+            if self.kv_topo.is_kv_layout_blocks_first:
+                # With FlashInfer index V separately to allow head splitting.
+                for block_id in range(nixl_agent_meta.num_blocks):
+                    block_offset = block_id * nixl_agent_meta.block_lens[i]
+                    addr = base_addr + block_offset + rank_offset
+                    v_addr = addr + nixl_agent_meta.block_lens[i] // 2
+                    blocks_data.append(
+                        (v_addr, local_block_len, nixl_agent_meta.device_id)
+                    )
+
+        logger.debug(
+            "Created %s blocks for dst engine %s with remote rank %s and local rank %s",
+            len(blocks_data),
+            engine_id,
+            remote_tp_rank,
+            self.tp_rank,
+        )
+
+        # Register with NIXL.
+        descs = self.nixl_wrapper.get_xfer_descs(blocks_data, self.nixl_memory_type)
+        self.dst_xfer_side_handles[engine_id][remote_tp_rank] = (
+            self.nixl_wrapper.prep_xfer_dlist(remote_agent_name, descs)
+        )
+
+        if block_size_ratio > 1:
+            # When prefill uses a smaller block_size, we need to init a
+            # new handler with the same block_len to match.
+            self.src_xfer_handles_by_block_size[nixl_agent_meta.block_size] = (
+                self.register_local_xfer_handler(nixl_agent_meta.block_size)[0]
+            )
+
+        return remote_agent_name
+
+    def _validate_remote_agent_handshake(
+        self, nixl_agent_meta: NixlAgentMetadata, remote_tp_size: int
+    ):
+        """
+        Validate the remote agent handshake metadata ensuring the
+        invariants hold true.
+        """
+        remote_engine_id = nixl_agent_meta.engine_id
+
+        assert self._tp_size[remote_engine_id] == remote_tp_size
+
+        tp_ratio = self.kv_topo.tp_ratio_from_engine_id(remote_engine_id)
+        block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(
+            remote_engine_id
+        )
+        # Num kv_heads > tp_size and P TP > D TP case, not supported
+        assert not (tp_ratio < 0 and self.kv_topo.is_kv_replicated(remote_engine_id))
+
+        kv_cache_layout = (
+            self.kv_cache_layout
+            if not self.use_host_buffer
+            else self.host_buffer_kv_cache_layout
+        )
+        if not self.use_mla and nixl_agent_meta.kv_cache_layout != kv_cache_layout:
+            if (
+                self.kv_transfer_config.enable_permute_local_kv
+                and nixl_agent_meta.kv_cache_layout == "HND"
+            ):
+                logger.info(
+ "Remote is HND and local is NHD, enabled additional permute "
1707
+ "on local device KV."
1708
+                )
+                self.enable_permute_local_kv = True
+            else:
+                raise RuntimeError(
+ "Heterogeneous TP expects same kv_cache_layout. "
1713
+ "Or enable experimental feature to use HND to NHD support by "
1714
+ "setting 'enable_permute_local_kv'=True in --kv-transfer-config."
1715
+                )
+
+        # Block len can only vary across layers when using MLA.
+        remote_block_len = nixl_agent_meta.block_lens[0]
+        if self.use_mla or self.kv_topo.is_kv_replicated(remote_engine_id):
+            # With replicated KV cache, only the number of blocks can differ.
+            for i in range(len(self.block_len_per_layer)):
+                assert (
+                    self.block_len_per_layer[i] // block_size_ratio
+                    == nixl_agent_meta.block_lens[i]
+                ), "KV cache sizes must match between P and D when replicated"
+        else:
+            # When MLA is not used, this is a list of the same block length
+            for block_len in nixl_agent_meta.block_lens:
+                assert block_len == remote_block_len, (
+                    "All remote layers must have the same block size"
+                )
+
+        if tp_ratio > 0:
+            # Remote tp is smaller: remote block_len size is bigger
+            assert (
+                remote_block_len
+                == (self.block_len_per_layer[0] * tp_ratio) // block_size_ratio
+            ), (
+                "Remote P worker KV layer cache must be of shape [2, N, "
+                "local_kv_heads*tp_ratio, page_size, head_dim] and same dtype."
+            )  # noqa: E501
+        else:
+            assert block_size_ratio == 1, (
+                "Different local/remote block sizes are not supported when"
+                " P TP > D TP."
+            )
+            # Remote tp is bigger: remote block_len size is smaller
+            assert remote_block_len == self.block_len_per_layer[0] // (-tp_ratio), (
+                "Remote P worker KV layer cache must be of shape [2, N, "
+                "local_kv_heads/tp_ratio, page_size, head_dim] and same dtype."
+            )  # noqa: E501
+
+        # TP workers that handshake with the same remote have the same #blocks.
+        assert self.dst_num_blocks[remote_engine_id] == nixl_agent_meta.num_blocks
+        # Same number of regions/~layers.
+        assert len(nixl_agent_meta.kv_caches_base_addr) == len(self.block_len_per_layer)
+
+    def sync_recved_kv_to_device(self, req_id: str, meta: ReqMeta):
+        """copy recved kv from host buffer to device."""
+        assert self.use_host_buffer
+        assert self.copy_blocks is not None
+
+        local_block_ids = meta.local_physical_block_ids
+        self.copy_blocks(
+            self.host_xfer_buffers,
+            self.device_kv_caches,
+            local_block_ids,
+            local_block_ids,
+            "h2d",
+        )
+        if logger.isEnabledFor(logging.DEBUG):
+            logger.debug(
+ "synced recved kv of request[%s] to device kv buffer,"
1774
+ "local_block_ids: %s. ",
1775
+                req_id,
+                ",".join(map(str, local_block_ids)),
+            )
+
+    def save_kv_to_host(self, metadata: NixlConnectorMetadata):
+        """copy kv from device to host buffer."""
+        assert self.use_host_buffer
+        assert self.copy_blocks is not None
+
+        for req_id, meta in metadata.reqs_to_save.items():
+            meta.local_physical_block_ids = self._logical_to_kernel_block_ids(
+                meta.local_block_ids
+            )
+            if logger.isEnabledFor(logging.DEBUG):
+                logger.debug(
+ "save_load_kv for request[%s] to host xfer buffer."
1791
+ "local_block_ids: %s. ",
1792
+                    req_id,
+                    ",".join(map(str, meta.local_physical_block_ids)),
+                )
+            # blocking
+            self.copy_blocks(
+                self.device_kv_caches,
+                self.host_xfer_buffers,
+                meta.local_physical_block_ids,
+                meta.local_physical_block_ids,
+                "d2h",
+            )
+
+    def post_process_device_kv_on_receive(
+        self,
+        block_size_ratio: int,
+        block_ids_list: list[list[int]],
+    ):
+        """
+        Post process device kv cache after receiving from remote.
+
+        3 types of post processing supported:
+        * kv_cache_postprocess_layout => convert from HND to NHD
+        * kv_cache_postprocess_blksize => convert from small block size
+          to large block size
+        * kv_cache_postprocess_blksize_and_layout => convert from small
+          block size to large block size and convert from HND to NHD
+
+        """
+        if len(self.device_kv_caches) == 0:
+            return
+        assert block_size_ratio >= 1, "Only nP < nD supported currently."
+        if self.enable_permute_local_kv and block_size_ratio > 1:
+            logger.debug(
+ "Post-processing device kv cache on receive by converting "
1826
+ "block_size with %sx bigger and permuting layout from HND"
1827
+ " to NHD.",
1828
+ block_size_ratio,
1829
+            )
+        elif self.enable_permute_local_kv:
+            logger.debug(
+ "Post-processing device kv cache on receive by permuting layout"
1833
+ "from HND to NHD."
1834
+            )
+        else:
+            logger.debug(
+ "Post-processing device kv cache on receive by converting "
1838
+ "block_size with %sx bigger.",
1839
+ block_size_ratio,
1840
+            )
+
+        split_k_and_v = not (self.use_mla or self.kv_topo.is_kv_layout_blocks_first)
+
+        for block_ids in block_ids_list:
+            indices = torch.tensor(block_ids, device=self.device_type, dtype=torch.long)
+
+            for _, cache_or_caches in self.device_kv_caches.items():
+                cache_list = cache_or_caches if split_k_and_v else [cache_or_caches]
+                for cache in cache_list:
+                    if self.enable_permute_local_kv and block_size_ratio > 1:
+                        kv_postprocess_blksize_and_layout_on_receive(
+                            cache, indices, block_size_ratio
+                        )
+                    elif self.enable_permute_local_kv:
+                        kv_postprocess_layout_on_receive(cache, indices)
+                    else:
+                        kv_postprocess_blksize_on_receive(
+                            cache, indices, block_size_ratio
+                        )
+
+    def get_finished(self) -> tuple[set[str], set[str]]:
+        """
+        Get requests that are done sending or recving on this specific worker.
+        The scheduler process (via the MultiprocExecutor) will use this output
+        to track which workers are done.
+        """
+        done_sending = self._get_new_notifs()
+        done_recving = self._pop_done_transfers(self._recving_transfers)
+
+        # add requests that skipped transfer to done_recving
+        done_recving.update(self._failed_recv_reqs)
+        self._failed_recv_reqs.clear()
+
+        if len(done_sending) > 0 or len(done_recving) > 0:
+            logger.debug(
+                "Rank %s, get_finished: %s requests done sending "
+                "and %s requests done recving",
+                self.tp_rank,
+                len(done_sending),
+                len(done_recving),
+            )
+
+        block_ids_for_blocksize_post_process = defaultdict(list)
+        for req_id in done_recving:
+            # clean up metadata for completed requests
+            meta = self._recving_metadata.pop(req_id, None)
+            assert meta is not None, f"{req_id} not found in recving_metadata list"
+            assert meta.remote is not None
+            if self.use_host_buffer:
+                self.sync_recved_kv_to_device(req_id, meta)
+
+            # Post-processing for heterogeneous block sizes.
+            block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(
+                meta.remote.engine_id
+            )
+            if not self.use_mla and (
+                block_size_ratio > 1 or self.enable_permute_local_kv
+            ):
+                block_ids_for_blocksize_post_process[block_size_ratio].append(
+                    meta.local_physical_block_ids
+                )
+        for (
+            block_size_ratio,
+            block_ids_list,
+        ) in block_ids_for_blocksize_post_process.items():
+            self.post_process_device_kv_on_receive(block_size_ratio, block_ids_list)
+
+        # Handle timeout to avoid stranding blocks on remote.
+        now = time.perf_counter()
+        while self._reqs_to_send:
+            req_id, expires = next(iter(self._reqs_to_send.items()))
+            # Sorted dict, oldest requests are put first so we can exit early.
+            if now < expires:
+                break
+            count = self.consumer_notification_counts_by_req.pop(req_id, 0)
+            logger.warning(
+                "Releasing expired KV blocks for request %s which were "
+                "retrieved by %d decode worker(s) within %d seconds.",
+                req_id,
+                count,
+                envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT,
+            )
+            self._reqs_to_process.remove(req_id)
+            del self._reqs_to_send[req_id]
+            done_sending.add(req_id)
+
+        return done_sending, done_recving
+
+    def _get_new_notifs(self) -> set[str]:
+        """
+        Get req_ids which got a remote xfer message. When multiple consumers
+        are reading from the same producer (heterogeneous TP scenario), wait
+        for all consumers to be done pulling.
+        """
+        notified_req_ids: set[str] = set()
+        for notifs in self.nixl_wrapper.get_new_notifs().values():
+            for notif in notifs:
+                req_id, tp_size = notif.decode("utf-8").rsplit(":", 1)
+                if (
+                    req_id not in self._reqs_to_send
+                    and req_id not in self._reqs_to_process
+                ):
+                    logger.error(
+                        "Potentially invalid KV blocks for "
+                        "unrecognized request %s were retrieved by "
+                        "a decode worker. They may have expired.",
+                        req_id,
+                    )
+                    continue
+
+                # NOTE: `tp_ratio` is the opposite when swapping local<>remote
+                n_consumers = int(tp_size)
+                tp_ratio = self.kv_topo.tp_ratio(n_consumers)
+
+                # Number of reads *per producer* to wait for.
+                # When remote D TP > local P TP we expect `tp_ratio` reads.
+                consumers_per_producer = (
+                    -tp_ratio if n_consumers > self.world_size else 1
+                )
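+                # Illustrative example (not from the source): a P worker with
+                # world_size=2 receiving notifs tagged tp_size=4 gets
+                # tp_ratio=-2, so it waits for 2 reads per request before
+                # freeing the blocks.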
+
+                self.consumer_notification_counts_by_req[req_id] += 1
+                # Wait for all consumers (D) to be done reading before freeing.
+                if (
+                    self.consumer_notification_counts_by_req[req_id]
+                    == consumers_per_producer
+                ):
+                    notified_req_ids.add(req_id)
+                    del self.consumer_notification_counts_by_req[req_id]
+                    self._reqs_to_process.remove(req_id)
+                    self._reqs_to_send.pop(req_id, None)
+        return notified_req_ids
+
+    def _pop_done_transfers(self, transfers: dict[str, list[int]]) -> set[str]:
+        """
+        Pop completed xfers by checking for DONE state.
+        Args:
+            transfers: dict of req_id -> list[running_xfer]
+        Returns:
+            set of req_ids that have all done xfers
+        """
+        done_req_ids: set[str] = set()
+        for req_id, handles in list(transfers.items()):
+            in_progress = []
+            for handle in handles:
+                try:
+                    xfer_state = self.nixl_wrapper.check_xfer_state(handle)
+                    if xfer_state == "DONE":
+                        # Get telemetry from NIXL
+                        res = self.nixl_wrapper.get_xfer_telemetry(handle)
+                        self.xfer_stats.record_transfer(res)
+                        self.nixl_wrapper.release_xfer_handle(handle)
+                    elif xfer_state == "PROC":
+                        in_progress.append(handle)
+                        continue
+                    else:
+                        self._log_failure(
+                            failure_type="transfer_failed",
+                            msg="Marking blocks as invalid",
+                            req_id=req_id,
+                            xfer_state=xfer_state,
+                        )
+                        self._handle_failed_transfer(req_id, handle)
+                except Exception as e:
+                    self._log_failure(
+                        failure_type="transfer_exception",
+                        msg="Marking blocks as invalid",
+                        req_id=req_id,
+                        error=e,
+                    )
+                    self._handle_failed_transfer(req_id, handle)
+
+            if not in_progress:
+                # Only report request as completed when all transfers are done.
+                done_req_ids.add(req_id)
+                del transfers[req_id]
+            else:
+                transfers[req_id] = in_progress
+        return done_req_ids
+
+    def _handle_failed_transfer(self, req_id: str, handle: int):
+        """
+        Handle a failed transfer by marking all (logical) blocks as invalid and
+        recording the failure.
+
+        Args:
+            req_id: The request ID.
+            handle: The transfer handle.
+        """
+        # Use .get() here as the metadata cleanup is handled by get_finished()
+        if meta := self._recving_metadata.get(req_id):
+            self._invalid_block_ids.update(meta.local_block_ids)
+        self.nixl_wrapper.release_xfer_handle(handle)
+        self.xfer_stats.record_failed_transfer()
+
+    def start_load_kv(self, metadata: NixlConnectorMetadata):
+        """
+        Start loading by triggering non-blocking nixl_xfer.
+        We check for these transfers to complete in each step().
+ """
2040
+ for req_id, meta in metadata.reqs_to_recv.items():
2041
+ meta.local_physical_block_ids = self._logical_to_kernel_block_ids(
2042
+ meta.local_block_ids
2043
+ )
2044
+ assert meta.remote is not None
2045
+ meta.remote.block_ids = self._logical_to_kernel_block_ids(
2046
+ meta.remote.block_ids
2047
+ )
2048
+ remote_engine_id = meta.remote.engine_id
2049
+ logger.debug(
2050
+ "start_load_kv for request %s from remote engine %s. "
2051
+ "Num local_block_ids: %s. Num remote_block_ids: %s. ",
2052
+ req_id,
2053
+ remote_engine_id,
2054
+ len(meta.local_physical_block_ids),
2055
+ len(meta.remote.block_ids),
2056
+ )
2057
+ # always store metadata for failure recovery
2058
+ self._recving_metadata[req_id] = meta
2059
+ if remote_engine_id not in self._remote_agents:
2060
+ # Initiate handshake with remote engine to exchange metadata.
2061
+ with self._handshake_lock:
2062
+ if remote_engine_id not in self._remote_agents:
2063
+ self._background_nixl_handshake(req_id, remote_engine_id, meta)
2064
+ continue
2065
+
2066
+ # Handshake already completed, start async read xfer.
2067
+ self._read_blocks_for_req(req_id, meta)
2068
+
2069
+ # Start transfers for requests whose handshakes have now finished.
2070
+ while not self._ready_requests.empty():
2071
+ self._read_blocks_for_req(*self._ready_requests.get_nowait())
2072
+
2073
+        # Keep around the requests that have been part of a batch. This is
+        # needed because async scheduling widens the misalignment between the
+        # moment in which a request's expiration is set (P side) and the
+        # moment in which its blocks are read from D. As P can now more easily
+        # lag behind D while processing the next batch, we make sure to only
+        # set an expiration for requests that have not been read from D yet.
+        for req_id in metadata.reqs_in_batch:
+            self._reqs_to_process.add(req_id)
+
+        # Remove all requests that are not to be processed (eg aborted).
+        for req_id in metadata.reqs_not_processed:
+            self._reqs_to_process.discard(req_id)
+            # We should never get an abort after setting an expiry timer
+            assert req_id not in self._reqs_to_send
+
+        # Add to requests that are waiting to be read and track expiration.
+        for req_id, expiration_time in metadata.reqs_to_send.items():
+            if req_id in self._reqs_to_process:
+                self._reqs_to_send[req_id] = expiration_time
+
+    def _read_blocks_for_req(self, req_id: str, meta: ReqMeta):
+        assert meta.remote is not None
+        remote_ranks = self.kv_topo.get_target_remote_ranks_from_engine_id(
+            meta.remote.engine_id
+        )
+        tp_ratio = self.kv_topo.tp_ratio_from_engine_id(meta.remote.engine_id)
+        # D may have to perform multiple reads from different remote ranks.
+        for i, remote_rank in enumerate(remote_ranks):
+            if self.use_mla and tp_ratio < 0 and i > 0:
+                # MLA opt: when P TP > D TP, only a single read is executed for
+                # the first remote rank (cache is duplicated)..
+                break
+
+            remote_block_size = self.kv_topo.remote_block_size[meta.remote.engine_id]
+            logger.debug(
+                "Remote agent %s available, calling _read_blocks"
+                " on remote rank %s with remote block size %s for req %s",
+                meta.remote.engine_id,
+                remote_rank,
+                remote_block_size,
+                req_id,
+            )
+            # Get side handles.
+            if tp_ratio < 0 and not self.use_mla:
+                assert remote_block_size == self.block_size
+                # Remote tp_size > local tp_size: we must perform multiple
+                # reads. Get the memory chunk onto which we will write.
+                local_xfer_side_handle = self.src_xfer_handles_by_tp_ratio[tp_ratio][i]
+            else:
+                # Single read from remote, we write to the whole memory region.
+                # Also handle remote block size different from local block size.
+                local_xfer_side_handle = self.src_xfer_handles_by_block_size[
+                    remote_block_size
+                ]
+
+            # Destination handle: remote_engine_id -> remote_rank -> handle.
+            remote_xfer_side_handle = self.dst_xfer_side_handles[meta.remote.engine_id][
+                remote_rank
+            ]
+            self._read_blocks(
+                request_id=req_id,
+                dst_engine_id=meta.remote.engine_id,
+                remote_request_id=meta.remote.request_id,
+                local_block_ids=meta.local_physical_block_ids,
+                remote_block_ids=meta.remote.block_ids,
+                remote_rank=remote_rank,
+                local_xfer_side_handle=local_xfer_side_handle,
+                remote_xfer_side_handle=remote_xfer_side_handle,
+            )
+
+        if self.use_mla and tp_ratio < 0:
+            # ..but we still need to notify the other remote ranks that we
+            # have the blocks we need so they can update the request state.
+            notif_id = f"{req_id}:{self.world_size}".encode()
+            remote_agents = self._remote_agents[meta.remote.engine_id]
+            for rank_to_notify, agent in remote_agents.items():
+                if rank_to_notify != remote_rank:
+                    self.nixl_wrapper.send_notif(agent, notif_msg=notif_id)
+
+    def _read_blocks(
+        self,
+        local_block_ids: list[int],
+        remote_block_ids: list[int],
+        dst_engine_id: str,
+        request_id: str,
+        remote_request_id: str,
+        remote_rank: int,
+        local_xfer_side_handle: int,
+        remote_xfer_side_handle: int,
+    ):
+        """
+        Post a READ point-to-point xfer request from a single local worker to
+        a single remote worker.
+        """
+        block_size_ratio = self.kv_topo.block_size_ratio_from_engine_id(dst_engine_id)
+        if block_size_ratio > 1:
+            local_block_ids = self.get_mapped_blocks(
+                np.asarray(local_block_ids), block_size_ratio
+            )
+            if len(local_block_ids) > len(remote_block_ids):
+                # NOTE:
+                # get_mapped_blocks always expands the block_ids n times.
+                # E.g.:
+                # prefill block_ids with block_size 4:
+                # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+                # Local decode block_ids with block_size 16: [1, 2, 3]
+                # expand decode block_ids with get_mapped_blocks from [1, 2, 3]
+                # to [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+                # Then we clip local to align with prefill:
+                # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] to
+                # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+                local_block_ids = local_block_ids[: len(remote_block_ids)]
+        # NOTE(rob): having the staging blocks be on the READER side is
+        # not going to work well, since we would have to rearrange tensors
+        # after we detect the txn is complete (which means we cannot make
+        # the read txn async easily). If we want to make "READ" happen
+        # cleanly, then we will need to have the staging blocks on the
+        # remote side.
+
+        # NOTE(rob): according to nvidia the staging blocks are used to
+        # saturate IB with heterogeneous TP sizes. We should remove the
+        # staging blocks until we are ready.
+
+        # Number of D TP workers that will read from dst P. Propagate info
+        # on notification so that dst worker can wait before freeing blocks.
+        notif_id = f"{remote_request_id}:{self.world_size}".encode()
+
+        # Full prefix cache hit: do not need to read remote blocks,
+        # just notify P worker that we have the blocks we need.
+        num_local_blocks = len(local_block_ids)
+        if num_local_blocks == 0:
+            agent_name = self._remote_agents[dst_engine_id][remote_rank]
+            try:
+                self.nixl_wrapper.send_notif(agent_name, notif_msg=notif_id)
+            except Exception as e:
+                self._log_failure(
+                    failure_type="notification_failed",
+                    msg="P worker blocks will be freed after timeout. "
+                    "This may indicate network issues.",
+                    req_id=request_id,
+                    error=e,
+                    dst_engine_id=dst_engine_id,
+                    remote_rank=remote_rank,
+                    remote_agent_name=agent_name,
+                )
+                self.xfer_stats.record_failed_notification()
+            return
+
+        # Partial prefix cache hit: just read uncomputed blocks.
+        num_remote_blocks = len(remote_block_ids)
+        assert num_local_blocks <= num_remote_blocks
+        if num_local_blocks < num_remote_blocks:
+            remote_block_ids = remote_block_ids[-num_local_blocks:]
+
+        # NOTE (nicolo) With homogeneous TP, each TP worker loads KV from
+        # corresponding rank. With heterogeneous TP, fixing D>P, the D tp
+        # workers will issue xfers to parts of the P worker remote kv caches.
+
+        # Get descs ids.
+        local_block_descs_ids: np.ndarray
+        remote_block_descs_ids: np.ndarray
+
+        if not self.block_window_per_layer:
+            # Default case: assume global attention
+            remote_block_descs_ids = self._get_block_descs_ids(
+                dst_engine_id,
+                remote_block_ids,
+            )
+            local_block_descs_ids = self._get_block_descs_ids(
+                self.engine_id,
+                local_block_ids,
+                block_size_ratio=block_size_ratio,
+            )
+        else:
+            # TODO(mgoin): remove this once we have hybrid memory allocator
+            # Optimization for models with local attention (Llama 4)
+            local_descs_list = []
+            remote_descs_list = []
+            for layer_idx, block_window in enumerate(self.block_window_per_layer):
+                # For each layer:
+                if block_window is None:
+                    # If not chunked, we just use the
+                    # full block lists (global attention)
+                    layer_local_block_ids = local_block_ids
+                    layer_remote_block_ids = remote_block_ids
+                else:
+                    # If chunked, get the last block_window blocks
+                    layer_local_block_ids = local_block_ids[-block_window:]
+                    layer_remote_block_ids = remote_block_ids[-block_window:]
+
+                # Get descs ids for the layer.
+                layer_local_desc_ids = self._get_block_descs_ids(
+                    self.engine_id,
+                    layer_local_block_ids,
+                    layer_idx,
+                    block_size_ratio=block_size_ratio,
+                )
+                layer_remote_desc_ids = self._get_block_descs_ids(
+                    dst_engine_id,
+                    layer_remote_block_ids,
+                    layer_idx,
+                )
+
+                local_descs_list.append(layer_local_desc_ids)
+                remote_descs_list.append(layer_remote_desc_ids)
+
+            local_block_descs_ids = np.concatenate(local_descs_list)
+            remote_block_descs_ids = np.concatenate(remote_descs_list)
+
+        assert len(local_block_descs_ids) == len(remote_block_descs_ids)
+
+        # Prepare transfer with Nixl.
+        handle = None
+        try:
+            handle = self.nixl_wrapper.make_prepped_xfer(
+                "READ",
+                local_xfer_side_handle,
+                local_block_descs_ids,
+                remote_xfer_side_handle,
+                remote_block_descs_ids,
+                notif_msg=notif_id,
+            )
+
+            # Begin async xfer.
+            self.nixl_wrapper.transfer(handle)
+
+            # Use handle to check completion in future step().
+            self._recving_transfers[request_id].append(handle)
+        except Exception as e:
+            # mark all (logical) blocks for this request as invalid
+            self._log_failure(
+                failure_type="transfer_setup_failed",
+                req_id=request_id,
+                msg="Marking blocks as invalid",
+                error=e,
+                dst_engine_id=dst_engine_id,
+                remote_rank=remote_rank,
+            )
+            if meta := self._recving_metadata.get(request_id):
+                self._invalid_block_ids.update(meta.local_block_ids)
+            self.xfer_stats.record_failed_transfer()
+            if handle is not None:
+                self.nixl_wrapper.release_xfer_handle(handle)
+            self._failed_recv_reqs.add(request_id)
+
+    def get_mapped_blocks(self, block_ids, block_size_ratio):
+        """
+        Calculates the new set of block IDs by mapping every element
+        in the (potentially sparse) input array.
+        Example: block_ids=[0, 2], block_size_ratio=2
+        get_mapped_blocks   0   1   [2   3]   4   5
+        # remote is |h0-b0|h1-b0||h0-b1|h1-b1||h0-b1|h1-b1||
+        # local is  |h0-b0......||h1-b0......||h2-b0........
+        local_block_ids     0        [1]      2
+        """
+        if block_ids.size == 0:
+            return np.array([], dtype=np.int64)
+
+        start_ids = block_ids * block_size_ratio
+        offsets = np.arange(block_size_ratio)
+        mapped_2d = start_ids[:, None] + offsets[None, :]
+
+        return mapped_2d.flatten().astype(np.int64)
+
+    def _get_block_descs_ids(
+        self,
+        engine_id: str,
+        block_ids: list[int],
+        layer_idx: int | None = None,
+        block_size_ratio: float | None = None,
+    ) -> np.ndarray:
+        """
+        Get the descs ids for a set of block ids.
+        If layer_idx is provided, we use the region_ids for the given layer.
+        Otherwise, we use all regions.
+        """
2348
+ if layer_idx is None:
2349
+ region_ids = np.arange(self.num_regions)
2350
+ else:
2351
+ assert layer_idx < self.num_layers
2352
+ if self.num_layers < self.num_regions:
2353
+ # If we have more regions than layers, we assume that
2354
+ # the regions are organized as [K0, V0, K1, V1, ...]
2355
+ # and we select K_i and V_i
2356
+ assert 2 * self.num_layers == self.num_regions
2357
+ region_ids = np.arange(2 * layer_idx, 2 * layer_idx + 2)
2358
+ else:
2359
+ # Otherwise, we assume we have MLA and select i-th layer
2360
+ assert self.num_layers == self.num_regions
2361
+ region_ids = np.arange(layer_idx, layer_idx + 1)
2362
+
2363
+ num_blocks = self.dst_num_blocks[engine_id]
2364
+ if block_size_ratio is not None:
2365
+ num_blocks = int(num_blocks * block_size_ratio)
2366
+
2367
+ # Compute the desc ids for each block.
2368
+ region_ids = region_ids[:, None]
2369
+ block_ids = np.array(block_ids)[None, :]
2370
+ descs_ids = region_ids * num_blocks + block_ids
2371
+ return descs_ids.flatten()
2372
+
2373
+    def _logical_to_kernel_block_ids(self, block_ids: list[int]) -> list[int]:
+        """
+        Convert logical block ids to kernel physical block ids.
+        This is required when the logical block size (the one set by the user)
+        does not match the one required by the attn backend.
+        """
+        if self._physical_blocks_per_logical_kv_block == 1:
+            # No-op when physical and logical block sizes are the same.
+            return block_ids
+        block_ids_np = np.array(block_ids)
+        block_arange = np.arange(0, self._physical_blocks_per_logical_kv_block).reshape(
+            1, -1
+        )
+        return BlockTable.map_to_kernel_blocks(
+            block_ids_np, self._physical_blocks_per_logical_kv_block, block_arange
+        ).tolist()
+
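+    # Hedged sketch of the expansion this performs, assuming
+    # BlockTable.map_to_kernel_blocks behaves like the broadcast below
+    # (an assumption here; see its definition for the authoritative logic):
+    #
+    #     # logical block b -> kernel blocks [b*n, ..., b*n + n - 1]
+    #     n = self._physical_blocks_per_logical_kv_block
+    #     kernel_ids = (block_ids_np[:, None] * n + np.arange(n)).flatten()
+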
+    def get_backend_aware_kv_block_len(self, layer_idx: int) -> int:
+        """
+        Get the block length for one K/V element (K and V have the same size).
+
+        For FA and other backends, this is equal to the length of the whole
+        block, as K and V are in separate regions.
+        For FlashInfer, this is half the length of the whole block, as K and V
+        share the same region.
+        """
+        if self.kv_topo.is_kv_layout_blocks_first:
+            # For indexing only half (either just the K or V part).
+            block_len = self.block_len_per_layer[layer_idx] // 2
+        else:
+            block_len = self.block_len_per_layer[layer_idx]
+        return block_len
+
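+    # Illustrative arithmetic (made-up size): if block_len_per_layer[i] is
+    # 262144 bytes and K and V share one region per block (blocks-first
+    # layout), one K or V element spans 262144 // 2 == 131072 bytes;
+    # otherwise it spans the full 262144 bytes.
+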
+    def get_kv_connector_stats(self) -> KVConnectorStats | None:
+        """
+        Get the KV transfer stats for the connector.
+        """
+        # Return a snapshot and clear the stats for the next iteration.
+        if not self.xfer_stats.is_empty():
+            return self.xfer_stats.clone_and_reset()
+        return None
+
+    def get_block_ids_with_load_errors(self) -> set[int]:
+        """
+        Return and clear the set of block IDs that failed to load.
+
+        This is called by the scheduler to identify blocks that need
+        to be retried after a NIXL transfer failure.
+        """
+        result = self._invalid_block_ids
+        self._invalid_block_ids = set()
+        return result
+
+    def __del__(self):
+        self.shutdown()
+
+    def shutdown(self):
+        """Shutdown the connector worker."""
+        self._handshake_initiation_executor.shutdown(wait=False)
+        for handles in self._recving_transfers.values():
+            for handle in handles:
+                self.nixl_wrapper.release_xfer_handle(handle)
+        self._recving_transfers.clear()
+        for handle in self.src_xfer_handles_by_block_size.values():
+            self.nixl_wrapper.release_dlist_handle(handle)
+        self.src_xfer_handles_by_block_size.clear()
+        for handles in self.src_xfer_handles_by_tp_ratio.values():
+            for handle in handles:
+                self.nixl_wrapper.release_dlist_handle(handle)
+        self.src_xfer_handles_by_tp_ratio.clear()
+        for dst_xfer_side_handles in self.dst_xfer_side_handles.values():
+            for dst_xfer_side_handle in dst_xfer_side_handles.values():
+                self.nixl_wrapper.release_dlist_handle(dst_xfer_side_handle)
+        self.dst_xfer_side_handles.clear()
+        for remote_agents in self._remote_agents.values():
+            for agent_name in remote_agents.values():
+                self.nixl_wrapper.remove_remote_agent(agent_name)
+        self._remote_agents.clear()
+        for desc in self._registered_descs:
+            self.nixl_wrapper.deregister_memory(desc)
+        self._registered_descs.clear()
+
+
+@contextlib.contextmanager
+def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]:
+    """Context manager for a ZMQ socket."""
+
+    if socket_type not in (zmq.ROUTER, zmq.REQ):
+        raise ValueError(f"Unexpected socket type: {socket_type}")
+
+    ctx: zmq.Context | None = None
+    try:
+        ctx = zmq.Context()  # type: ignore[attr-defined]
+        yield make_zmq_socket(
+            ctx=ctx, path=addr, socket_type=socket_type, bind=socket_type == zmq.ROUTER
+        )
+    finally:
+        if ctx is not None:
+            ctx.destroy(linger=0)
+
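+# Example usage (illustrative; the address is made up):
+#
+#     with zmq_ctx(zmq.REQ, "tcp://127.0.0.1:5600") as sock:
+#         sock.send(b"handshake")
+#         reply = sock.recv()
+#
+# The REQ side connects while the ROUTER side binds, which is why bind is
+# derived from socket_type above.
+
+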
+@dataclass
+class NixlKVConnectorStats(KVConnectorStats):
+    """Container for transfer performance metrics."""
+
+    def __post_init__(self):
+        if not self.data:
+            # Empty container init, no data is passed in.
+            self.reset()
+
+    def reset(self):
+        # Must be serializable.
+        self.data: dict[str, list[float]] = {
+            "transfer_duration": [],
+            "post_duration": [],
+            "bytes_transferred": [],
+            "num_descriptors": [],
+            "num_failed_transfers": [],
+            "num_failed_notifications": [],
+        }
+
+    def record_transfer(self, res: nixlXferTelemetry):
+        # Keep metric units consistent with the rest of the code: time µs -> s.
+        self.data["transfer_duration"].append(res.xferDuration / 1e6)
+        self.data["post_duration"].append(res.postDuration / 1e6)
+        self.data["bytes_transferred"].append(res.totalBytes)
+        self.data["num_descriptors"].append(res.descCount)
+
+    def record_failed_transfer(self):
+        """Record a failed NIXL transfer operation."""
+        self.data["num_failed_transfers"].append(1.0)
+
+    def record_failed_notification(self):
+        """Record a failed NIXL notification (send_notif)."""
+        self.data["num_failed_notifications"].append(1.0)
+
+    def clone_and_reset(self) -> "NixlKVConnectorStats":
+        old = copy.copy(self)
+        self.reset()
+        return old
+
+    def is_empty(self) -> bool:
+        return self.num_successful_transfers == 0
+
+    def aggregate(self, other: KVConnectorStats) -> KVConnectorStats:
+        if not other.is_empty():
+            for k, v in other.data.items():
+                accumulator = self.data[k]
+                assert isinstance(accumulator, list)
+                accumulator.extend(v)
+        return self
+
+    def reduce(self) -> dict[str, int | float]:
+        # Compute compact representative stats suitable for CLI logging.
+        if self.is_empty():
+            return {
+                "Num successful transfers": 0,
+                "Avg xfer time (ms)": 0,
+                "P90 xfer time (ms)": 0,
+                "Avg post time (ms)": 0,
+                "P90 post time (ms)": 0,
+                "Avg MB per transfer": 0,
+                "Throughput (MB/s)": 0,
+                "Avg number of descriptors": 0,
+            }
+
+        xfer_time = np.asarray(self.data["transfer_duration"])
+        post_time = np.asarray(self.data["post_duration"])
+        # Convert to MB for CLI logging.
+        mb = np.asarray(self.data["bytes_transferred"]) / 2**20
+        descs = np.asarray(self.data["num_descriptors"], dtype=np.uint32)
+        n = len(descs)
+        assert n == self.num_successful_transfers
+
+        total_mb = mb.sum()
+        avg_mb = total_mb / n
+
+        total_time_seconds = xfer_time.sum()
+        throughput_mb_s = total_mb / total_time_seconds
+
+        return {
+            "Num successful transfers": n,
+            "Avg xfer time (ms)": round(xfer_time.mean() * 1e3, 3),
+            "P90 xfer time (ms)": round(np.percentile(xfer_time, 90).item() * 1e3, 3),
+            "Avg post time (ms)": round(post_time.mean() * 1e3, 3),
+            "P90 post time (ms)": round(np.percentile(post_time, 90).item() * 1e3, 3),
+            "Avg MB per transfer": round(avg_mb, 3),
+            "Throughput (MB/s)": round(throughput_mb_s, 3),
+            "Avg number of descriptors": round(descs.mean(), 1),
+        }
+
+    @property
+    def num_successful_transfers(self) -> int:
+        return len(self.data["transfer_duration"])
+
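+# A minimal sketch of the stats lifecycle (illustrative; the SimpleNamespace
+# stands in for a real nixlXferTelemetry record, whose field names are
+# assumed from record_transfer above, and the data={} constructor argument
+# is likewise an assumption):
+#
+#     from types import SimpleNamespace
+#     stats = NixlKVConnectorStats(data={})
+#     stats.record_transfer(SimpleNamespace(
+#         xferDuration=1500.0,   # µs -> recorded as 0.0015 s
+#         postDuration=200.0,    # µs -> recorded as 0.0002 s
+#         totalBytes=2 * 2**20,  # 2 MiB
+#         descCount=32,
+#     ))
+#     snapshot = stats.clone_and_reset()  # stats is now empty again
+#     print(snapshot.reduce()["Num successful transfers"])  # 1
+
+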
+class NixlPromMetrics(KVConnectorPromMetrics):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        metric_types: dict[type[PromMetric], type[PromMetricT]],
+        labelnames: list[str],
+        per_engine_labelvalues: dict[int, list[object]],
+    ):
+        super().__init__(vllm_config, metric_types, labelnames, per_engine_labelvalues)
+
+        buckets = [
+            0.001,
+            0.005,
+            0.01,
+            0.025,
+            0.05,
+            0.075,
+            0.1,
+            0.2,
+            0.3,
+            0.5,
+            0.75,
+            1.0,
+            5.0,
+        ]
+        nixl_histogram_xfer_time = self._histogram_cls(
+            name="vllm:nixl_xfer_time_seconds",
+            documentation="Histogram of transfer duration for NIXL KV Cache transfers.",
+            buckets=buckets[1:],
+            labelnames=labelnames,
+        )
+        self.nixl_histogram_xfer_time = self.make_per_engine(nixl_histogram_xfer_time)
+        nixl_histogram_post_time = self._histogram_cls(
+            name="vllm:nixl_post_time_seconds",
+            documentation="Histogram of transfer post time for NIXL KV"
+            " Cache transfers.",
+            buckets=buckets,
+            labelnames=labelnames,
+        )
+        self.nixl_histogram_post_time = self.make_per_engine(nixl_histogram_post_time)
+        # Log-uniform buckets from 2 KiB (2**11) to 8 GiB (2**33).
+        buckets = [2 ** (10 + i) for i in range(1, 25, 2)]
+        nixl_histogram_bytes_transferred = self._histogram_cls(
+            name="vllm:nixl_bytes_transferred",
+            documentation="Histogram of bytes transferred per NIXL KV Cache transfer.",
+            buckets=buckets,
+            labelnames=labelnames,
+        )
+        self.nixl_histogram_bytes_transferred = self.make_per_engine(
+            nixl_histogram_bytes_transferred
+        )
+        buckets = [
+            10,
+            20,
+            30,
+            50,
+            75,
+            100,
+            200,
+            400,
+            1000,
+            2000,
+            4000,
+            10000,
+            20000,
+            50000,
+        ]
+        nixl_histogram_num_descriptors = self._histogram_cls(
+            name="vllm:nixl_num_descriptors",
+            documentation="Histogram of number of descriptors per NIXL"
+            " KV Cache transfer.",
+            buckets=buckets,
+            labelnames=labelnames,
+        )
+        self.nixl_histogram_num_descriptors = self.make_per_engine(
+            nixl_histogram_num_descriptors
+        )
+        counter_nixl_num_failed_transfers = self._counter_cls(
+            name="vllm:nixl_num_failed_transfers",
+            documentation="Number of failed NIXL KV Cache transfers.",
+            labelnames=labelnames,
+        )
+        self.counter_nixl_num_failed_transfers = self.make_per_engine(
+            counter_nixl_num_failed_transfers
+        )
+        counter_nixl_num_failed_notifications = self._counter_cls(
+            name="vllm:nixl_num_failed_notifications",
+            documentation="Number of failed NIXL KV Cache notifications.",
+            labelnames=labelnames,
+        )
+        self.counter_nixl_num_failed_notifications = self.make_per_engine(
+            counter_nixl_num_failed_notifications
+        )
+
+    def observe(self, transfer_stats_data: dict[str, Any], engine_idx: int = 0):
+        for prom_obj, list_item_key in zip(
+            [
+                self.nixl_histogram_xfer_time,
+                self.nixl_histogram_post_time,
+                self.nixl_histogram_bytes_transferred,
+                self.nixl_histogram_num_descriptors,
+            ],
+            [
+                "transfer_duration",
+                "post_duration",
+                "bytes_transferred",
+                "num_descriptors",
+            ],
+        ):
+            for list_item in transfer_stats_data[list_item_key]:
+                prom_obj[engine_idx].observe(list_item)
+        for counter_obj, counter_item_key in zip(
+            [
+                self.counter_nixl_num_failed_transfers,
+                self.counter_nixl_num_failed_notifications,
+            ],
+            ["num_failed_transfers", "num_failed_notifications"],
+        ):
+            for list_item in transfer_stats_data[counter_item_key]:
+                counter_obj[engine_idx].inc(list_item)
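+
+    # Illustrative call (the dict mirrors NixlKVConnectorStats.data above;
+    # the values are made up):
+    #
+    #     metrics.observe(
+    #         {
+    #             "transfer_duration": [0.0015],
+    #             "post_duration": [0.0002],
+    #             "bytes_transferred": [2 * 2**20],
+    #             "num_descriptors": [32],
+    #             "num_failed_transfers": [],
+    #             "num_failed_notifications": [],
+    #         },
+    #         engine_idx=0,
+    #     )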