vllm_cpu-0.11.0.post2-cp312-cp312-manylinux_2_17_x86_64.whl

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
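Every entry below reports only added lines ("+N -0"), since there is no prior version of these files in this comparison. As a rough, hedged illustration of how such per-file counts could be reproduced locally (this is an assumed sketch, not the registry's actual tooling; the local wheel filename is likewise an assumption), a minimal Python snippet using only the standard library might look like this:

import zipfile

def wheel_line_counts(path):
    # Map each file inside the wheel to its line count (0 for binary files).
    counts = {}
    with zipfile.ZipFile(path) as wheel:
        for name in wheel.namelist():
            data = wheel.read(name)
            try:
                counts[name] = data.decode("utf-8").count("\n")
            except UnicodeDecodeError:
                counts[name] = 0  # binary artifacts, e.g. vllm/_C.abi3.so, show as +0 -0
    return counts

# Hypothetical local filename for the wheel being inspected.
new = wheel_line_counts("vllm_cpu-0.11.0.post2-cp312-cp312-manylinux_2_17_x86_64.whl")
old = {}  # no previous wheel in this comparison, so nothing is ever removed
for name in sorted(new):
    print(f"{name} +{new[name] - old.get(name, 0)} -0")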
Files changed (1398)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +220 -0
  3. vllm/_bc_linter.py +59 -0
  4. vllm/_custom_ops.py +2044 -0
  5. vllm/_ipex_ops.py +393 -0
  6. vllm/_version.py +34 -0
  7. vllm/assets/__init__.py +0 -0
  8. vllm/assets/audio.py +45 -0
  9. vllm/assets/base.py +41 -0
  10. vllm/assets/image.py +50 -0
  11. vllm/assets/video.py +145 -0
  12. vllm/attention/__init__.py +15 -0
  13. vllm/attention/backends/__init__.py +0 -0
  14. vllm/attention/backends/abstract.py +204 -0
  15. vllm/attention/backends/utils.py +33 -0
  16. vllm/attention/layer.py +645 -0
  17. vllm/attention/layers/__init__.py +0 -0
  18. vllm/attention/layers/chunked_local_attention.py +93 -0
  19. vllm/attention/layers/cross_attention.py +162 -0
  20. vllm/attention/layers/encoder_only_attention.py +86 -0
  21. vllm/attention/ops/__init__.py +0 -0
  22. vllm/attention/ops/chunked_prefill_paged_decode.py +405 -0
  23. vllm/attention/ops/common.py +345 -0
  24. vllm/attention/ops/flashmla.py +192 -0
  25. vllm/attention/ops/merge_attn_states.py +43 -0
  26. vllm/attention/ops/paged_attn.py +262 -0
  27. vllm/attention/ops/pallas_kv_cache_update.py +124 -0
  28. vllm/attention/ops/prefix_prefill.py +928 -0
  29. vllm/attention/ops/rocm_aiter_mla.py +104 -0
  30. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  31. vllm/attention/ops/triton_decode_attention.py +691 -0
  32. vllm/attention/ops/triton_flash_attention.py +984 -0
  33. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  34. vllm/attention/ops/triton_reshape_and_cache_flash.py +175 -0
  35. vllm/attention/ops/triton_unified_attention.py +894 -0
  36. vllm/attention/selector.py +245 -0
  37. vllm/attention/utils/__init__.py +0 -0
  38. vllm/attention/utils/fa_utils.py +85 -0
  39. vllm/attention/utils/kv_sharing_utils.py +33 -0
  40. vllm/beam_search.py +87 -0
  41. vllm/benchmarks/__init__.py +0 -0
  42. vllm/benchmarks/datasets.py +2723 -0
  43. vllm/benchmarks/latency.py +170 -0
  44. vllm/benchmarks/lib/__init__.py +3 -0
  45. vllm/benchmarks/lib/endpoint_request_func.py +533 -0
  46. vllm/benchmarks/lib/ready_checker.py +73 -0
  47. vllm/benchmarks/lib/utils.py +80 -0
  48. vllm/benchmarks/serve.py +1358 -0
  49. vllm/benchmarks/throughput.py +696 -0
  50. vllm/collect_env.py +823 -0
  51. vllm/compilation/__init__.py +0 -0
  52. vllm/compilation/activation_quant_fusion.py +189 -0
  53. vllm/compilation/backends.py +650 -0
  54. vllm/compilation/base_static_graph.py +56 -0
  55. vllm/compilation/collective_fusion.py +1188 -0
  56. vllm/compilation/compiler_interface.py +573 -0
  57. vllm/compilation/counter.py +47 -0
  58. vllm/compilation/cuda_graph.py +199 -0
  59. vllm/compilation/cuda_piecewise_backend.py +117 -0
  60. vllm/compilation/decorators.py +400 -0
  61. vllm/compilation/fix_functionalization.py +205 -0
  62. vllm/compilation/fusion.py +383 -0
  63. vllm/compilation/fusion_attn.py +295 -0
  64. vllm/compilation/fx_utils.py +84 -0
  65. vllm/compilation/inductor_pass.py +136 -0
  66. vllm/compilation/monitor.py +57 -0
  67. vllm/compilation/noop_elimination.py +158 -0
  68. vllm/compilation/pass_manager.py +125 -0
  69. vllm/compilation/post_cleanup.py +20 -0
  70. vllm/compilation/sequence_parallelism.py +478 -0
  71. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  72. vllm/compilation/vllm_inductor_pass.py +156 -0
  73. vllm/compilation/wrapper.py +136 -0
  74. vllm/config/__init__.py +814 -0
  75. vllm/config/cache.py +220 -0
  76. vllm/config/compilation.py +673 -0
  77. vllm/config/device.py +74 -0
  78. vllm/config/kv_events.py +50 -0
  79. vllm/config/kv_transfer.py +111 -0
  80. vllm/config/load.py +113 -0
  81. vllm/config/lora.py +132 -0
  82. vllm/config/model.py +1912 -0
  83. vllm/config/multimodal.py +129 -0
  84. vllm/config/observability.py +99 -0
  85. vllm/config/parallel.py +524 -0
  86. vllm/config/pooler.py +97 -0
  87. vllm/config/scheduler.py +287 -0
  88. vllm/config/speculative.py +568 -0
  89. vllm/config/speech_to_text.py +39 -0
  90. vllm/config/structured_outputs.py +64 -0
  91. vllm/config/utils.py +145 -0
  92. vllm/connections.py +186 -0
  93. vllm/device_allocator/__init__.py +0 -0
  94. vllm/device_allocator/cumem.py +311 -0
  95. vllm/distributed/__init__.py +6 -0
  96. vllm/distributed/communication_op.py +41 -0
  97. vllm/distributed/device_communicators/__init__.py +0 -0
  98. vllm/distributed/device_communicators/all2all.py +440 -0
  99. vllm/distributed/device_communicators/all_reduce_utils.py +317 -0
  100. vllm/distributed/device_communicators/base_device_communicator.py +295 -0
  101. vllm/distributed/device_communicators/cpu_communicator.py +201 -0
  102. vllm/distributed/device_communicators/cuda_communicator.py +323 -0
  103. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  104. vllm/distributed/device_communicators/custom_all_reduce.py +311 -0
  105. vllm/distributed/device_communicators/mnnvl_compat.py +28 -0
  106. vllm/distributed/device_communicators/pynccl.py +340 -0
  107. vllm/distributed/device_communicators/pynccl_allocator.py +186 -0
  108. vllm/distributed/device_communicators/pynccl_wrapper.py +416 -0
  109. vllm/distributed/device_communicators/quick_all_reduce.py +278 -0
  110. vllm/distributed/device_communicators/ray_communicator.py +258 -0
  111. vllm/distributed/device_communicators/shm_broadcast.py +589 -0
  112. vllm/distributed/device_communicators/shm_object_storage.py +635 -0
  113. vllm/distributed/device_communicators/symm_mem.py +136 -0
  114. vllm/distributed/device_communicators/tpu_communicator.py +102 -0
  115. vllm/distributed/device_communicators/xpu_communicator.py +94 -0
  116. vllm/distributed/eplb/__init__.py +8 -0
  117. vllm/distributed/eplb/eplb_state.py +620 -0
  118. vllm/distributed/eplb/rebalance_algo.py +239 -0
  119. vllm/distributed/eplb/rebalance_execute.py +424 -0
  120. vllm/distributed/kv_events.py +362 -0
  121. vllm/distributed/kv_transfer/README.md +29 -0
  122. vllm/distributed/kv_transfer/__init__.py +13 -0
  123. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  124. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  125. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  126. vllm/distributed/kv_transfer/kv_connector/factory.py +113 -0
  127. vllm/distributed/kv_transfer/kv_connector/utils.py +261 -0
  128. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  129. vllm/distributed/kv_transfer/kv_connector/v1/base.py +388 -0
  130. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +168 -0
  131. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +100 -0
  132. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +328 -0
  133. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1473 -0
  134. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +485 -0
  135. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +488 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +550 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +267 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +418 -0
  140. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  141. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  142. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  143. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  144. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  145. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  146. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +290 -0
  147. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  148. vllm/distributed/kv_transfer/kv_transfer_state.py +73 -0
  149. vllm/distributed/parallel_state.py +1532 -0
  150. vllm/distributed/tpu_distributed_utils.py +178 -0
  151. vllm/distributed/utils.py +536 -0
  152. vllm/engine/__init__.py +0 -0
  153. vllm/engine/arg_utils.py +1778 -0
  154. vllm/engine/async_llm_engine.py +6 -0
  155. vllm/engine/llm_engine.py +6 -0
  156. vllm/engine/metrics.py +577 -0
  157. vllm/engine/metrics_types.py +84 -0
  158. vllm/engine/protocol.py +333 -0
  159. vllm/entrypoints/__init__.py +0 -0
  160. vllm/entrypoints/api_server.py +178 -0
  161. vllm/entrypoints/chat_utils.py +1705 -0
  162. vllm/entrypoints/cli/__init__.py +12 -0
  163. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  164. vllm/entrypoints/cli/benchmark/base.py +25 -0
  165. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  166. vllm/entrypoints/cli/benchmark/main.py +55 -0
  167. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  168. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  169. vllm/entrypoints/cli/collect_env.py +36 -0
  170. vllm/entrypoints/cli/main.py +60 -0
  171. vllm/entrypoints/cli/openai.py +233 -0
  172. vllm/entrypoints/cli/run_batch.py +67 -0
  173. vllm/entrypoints/cli/serve.py +232 -0
  174. vllm/entrypoints/cli/types.py +29 -0
  175. vllm/entrypoints/constants.py +10 -0
  176. vllm/entrypoints/context.py +481 -0
  177. vllm/entrypoints/harmony_utils.py +436 -0
  178. vllm/entrypoints/launcher.py +164 -0
  179. vllm/entrypoints/llm.py +1629 -0
  180. vllm/entrypoints/logger.py +79 -0
  181. vllm/entrypoints/openai/__init__.py +0 -0
  182. vllm/entrypoints/openai/api_server.py +1953 -0
  183. vllm/entrypoints/openai/cli_args.py +288 -0
  184. vllm/entrypoints/openai/logits_processors.py +90 -0
  185. vllm/entrypoints/openai/protocol.py +2757 -0
  186. vllm/entrypoints/openai/run_batch.py +491 -0
  187. vllm/entrypoints/openai/serving_chat.py +1597 -0
  188. vllm/entrypoints/openai/serving_classification.py +173 -0
  189. vllm/entrypoints/openai/serving_completion.py +692 -0
  190. vllm/entrypoints/openai/serving_embedding.py +631 -0
  191. vllm/entrypoints/openai/serving_engine.py +992 -0
  192. vllm/entrypoints/openai/serving_models.py +288 -0
  193. vllm/entrypoints/openai/serving_pooling.py +276 -0
  194. vllm/entrypoints/openai/serving_responses.py +1709 -0
  195. vllm/entrypoints/openai/serving_score.py +479 -0
  196. vllm/entrypoints/openai/serving_tokenization.py +196 -0
  197. vllm/entrypoints/openai/serving_transcription.py +136 -0
  198. vllm/entrypoints/openai/speech_to_text.py +388 -0
  199. vllm/entrypoints/openai/tool_parsers/__init__.py +55 -0
  200. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  201. vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py +367 -0
  202. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  203. vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +185 -0
  204. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  205. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  206. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +455 -0
  207. vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py +372 -0
  208. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  209. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  210. vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +377 -0
  211. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  212. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +269 -0
  213. vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +39 -0
  214. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +816 -0
  215. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  216. vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +93 -0
  217. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  218. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  219. vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +707 -0
  220. vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py +1137 -0
  221. vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +679 -0
  222. vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py +296 -0
  223. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  224. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +524 -0
  225. vllm/entrypoints/renderer.py +395 -0
  226. vllm/entrypoints/score_utils.py +232 -0
  227. vllm/entrypoints/ssl.py +75 -0
  228. vllm/entrypoints/tool.py +139 -0
  229. vllm/entrypoints/tool_server.py +206 -0
  230. vllm/entrypoints/utils.py +233 -0
  231. vllm/env_override.py +23 -0
  232. vllm/envs.py +1590 -0
  233. vllm/executor/__init__.py +0 -0
  234. vllm/executor/executor_base.py +381 -0
  235. vllm/executor/msgspec_utils.py +35 -0
  236. vllm/executor/ray_distributed_executor.py +699 -0
  237. vllm/executor/ray_utils.py +410 -0
  238. vllm/executor/uniproc_executor.py +176 -0
  239. vllm/forward_context.py +402 -0
  240. vllm/inputs/__init__.py +30 -0
  241. vllm/inputs/data.py +356 -0
  242. vllm/inputs/parse.py +151 -0
  243. vllm/inputs/preprocess.py +664 -0
  244. vllm/logger.py +229 -0
  245. vllm/logging_utils/__init__.py +10 -0
  246. vllm/logging_utils/dump_input.py +81 -0
  247. vllm/logging_utils/formatter.py +79 -0
  248. vllm/logging_utils/log_time.py +32 -0
  249. vllm/logits_process.py +119 -0
  250. vllm/logprobs.py +28 -0
  251. vllm/lora/__init__.py +0 -0
  252. vllm/lora/layers/__init__.py +34 -0
  253. vllm/lora/layers/base.py +69 -0
  254. vllm/lora/layers/base_linear.py +185 -0
  255. vllm/lora/layers/column_parallel_linear.py +609 -0
  256. vllm/lora/layers/logits_processor.py +247 -0
  257. vllm/lora/layers/qkv_x_parallel_linear.py +8 -0
  258. vllm/lora/layers/replicated_linear.py +60 -0
  259. vllm/lora/layers/row_parallel_linear.py +196 -0
  260. vllm/lora/layers/utils.py +65 -0
  261. vllm/lora/layers/vocal_parallel_embedding.py +174 -0
  262. vllm/lora/lora_weights.py +199 -0
  263. vllm/lora/models.py +816 -0
  264. vllm/lora/ops/__init__.py +0 -0
  265. vllm/lora/ops/ipex_ops/__init__.py +7 -0
  266. vllm/lora/ops/ipex_ops/lora_ops.py +44 -0
  267. vllm/lora/ops/torch_ops/__init__.py +16 -0
  268. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  269. vllm/lora/ops/triton_ops/__init__.py +12 -0
  270. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  271. vllm/lora/ops/triton_ops/lora_expand_op.py +289 -0
  272. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  273. vllm/lora/ops/triton_ops/lora_shrink_op.py +243 -0
  274. vllm/lora/ops/triton_ops/utils.py +126 -0
  275. vllm/lora/ops/xla_ops/__init__.py +7 -0
  276. vllm/lora/ops/xla_ops/lora_ops.py +144 -0
  277. vllm/lora/peft_helper.py +127 -0
  278. vllm/lora/punica_wrapper/__init__.py +10 -0
  279. vllm/lora/punica_wrapper/punica_base.py +458 -0
  280. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  281. vllm/lora/punica_wrapper/punica_gpu.py +272 -0
  282. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  283. vllm/lora/punica_wrapper/punica_tpu.py +391 -0
  284. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  285. vllm/lora/punica_wrapper/utils.py +136 -0
  286. vllm/lora/request.py +97 -0
  287. vllm/lora/resolver.py +85 -0
  288. vllm/lora/utils.py +246 -0
  289. vllm/lora/worker_manager.py +267 -0
  290. vllm/model_executor/__init__.py +12 -0
  291. vllm/model_executor/custom_op.py +194 -0
  292. vllm/model_executor/layers/__init__.py +0 -0
  293. vllm/model_executor/layers/activation.py +575 -0
  294. vllm/model_executor/layers/attention_layer_base.py +23 -0
  295. vllm/model_executor/layers/fla/__init__.py +8 -0
  296. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  297. vllm/model_executor/layers/fla/ops/chunk.py +225 -0
  298. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +290 -0
  299. vllm/model_executor/layers/fla/ops/chunk_o.py +177 -0
  300. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +140 -0
  301. vllm/model_executor/layers/fla/ops/cumsum.py +226 -0
  302. vllm/model_executor/layers/fla/ops/fused_recurrent.py +366 -0
  303. vllm/model_executor/layers/fla/ops/index.py +39 -0
  304. vllm/model_executor/layers/fla/ops/l2norm.py +143 -0
  305. vllm/model_executor/layers/fla/ops/layernorm_guard.py +337 -0
  306. vllm/model_executor/layers/fla/ops/op.py +39 -0
  307. vllm/model_executor/layers/fla/ops/solve_tril.py +365 -0
  308. vllm/model_executor/layers/fla/ops/utils.py +180 -0
  309. vllm/model_executor/layers/fla/ops/wy_fast.py +114 -0
  310. vllm/model_executor/layers/fused_moe/__init__.py +89 -0
  311. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +322 -0
  312. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +141 -0
  313. vllm/model_executor/layers/fused_moe/config.py +804 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  545. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +300 -0
  546. vllm/model_executor/layers/fused_moe/cutlass_moe.py +957 -0
  547. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +362 -0
  548. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +413 -0
  549. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +361 -0
  550. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +274 -0
  551. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +268 -0
  552. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +300 -0
  553. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +184 -0
  554. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +993 -0
  555. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +239 -0
  556. vllm/model_executor/layers/fused_moe/fused_moe.py +1890 -0
  557. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +307 -0
  558. vllm/model_executor/layers/fused_moe/layer.py +2195 -0
  559. vllm/model_executor/layers/fused_moe/modular_kernel.py +1038 -0
  560. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +87 -0
  561. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  562. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +205 -0
  563. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  564. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +341 -0
  565. vllm/model_executor/layers/fused_moe/prepare_finalize.py +70 -0
  566. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +424 -0
  567. vllm/model_executor/layers/fused_moe/routing_simulator.py +291 -0
  568. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +146 -0
  569. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +143 -0
  570. vllm/model_executor/layers/fused_moe/trtllm_moe.py +191 -0
  571. vllm/model_executor/layers/fused_moe/utils.py +274 -0
  572. vllm/model_executor/layers/layernorm.py +395 -0
  573. vllm/model_executor/layers/lightning_attn.py +661 -0
  574. vllm/model_executor/layers/linear.py +1603 -0
  575. vllm/model_executor/layers/logits_processor.py +106 -0
  576. vllm/model_executor/layers/mamba/__init__.py +0 -0
  577. vllm/model_executor/layers/mamba/abstract.py +42 -0
  578. vllm/model_executor/layers/mamba/linear_attn.py +403 -0
  579. vllm/model_executor/layers/mamba/mamba_mixer.py +466 -0
  580. vllm/model_executor/layers/mamba/mamba_mixer2.py +764 -0
  581. vllm/model_executor/layers/mamba/mamba_utils.py +186 -0
  582. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  583. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1092 -0
  584. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +168 -0
  585. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  586. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +242 -0
  587. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +527 -0
  588. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +724 -0
  589. vllm/model_executor/layers/mamba/ops/ssd_combined.py +238 -0
  590. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +200 -0
  591. vllm/model_executor/layers/mamba/short_conv.py +253 -0
  592. vllm/model_executor/layers/mla.py +173 -0
  593. vllm/model_executor/layers/pooler.py +719 -0
  594. vllm/model_executor/layers/quantization/__init__.py +157 -0
  595. vllm/model_executor/layers/quantization/auto_round.py +388 -0
  596. vllm/model_executor/layers/quantization/awq.py +228 -0
  597. vllm/model_executor/layers/quantization/awq_marlin.py +554 -0
  598. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  599. vllm/model_executor/layers/quantization/base_config.py +170 -0
  600. vllm/model_executor/layers/quantization/bitblas.py +464 -0
  601. vllm/model_executor/layers/quantization/bitsandbytes.py +627 -0
  602. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  603. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +797 -0
  604. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2074 -0
  605. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +27 -0
  606. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +366 -0
  607. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  608. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  609. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +105 -0
  610. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +185 -0
  611. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +169 -0
  612. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +135 -0
  613. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  614. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +157 -0
  615. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  616. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  617. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  618. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +238 -0
  619. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +153 -0
  620. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  621. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +46 -0
  622. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  623. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  624. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  625. vllm/model_executor/layers/quantization/deepspeedfp.py +196 -0
  626. vllm/model_executor/layers/quantization/experts_int8.py +223 -0
  627. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  628. vllm/model_executor/layers/quantization/fp8.py +1098 -0
  629. vllm/model_executor/layers/quantization/gguf.py +599 -0
  630. vllm/model_executor/layers/quantization/gptq.py +340 -0
  631. vllm/model_executor/layers/quantization/gptq_bitblas.py +448 -0
  632. vllm/model_executor/layers/quantization/gptq_marlin.py +751 -0
  633. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  634. vllm/model_executor/layers/quantization/hqq_marlin.py +333 -0
  635. vllm/model_executor/layers/quantization/inc.py +61 -0
  636. vllm/model_executor/layers/quantization/input_quant_fp8.py +156 -0
  637. vllm/model_executor/layers/quantization/ipex_quant.py +415 -0
  638. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  639. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +91 -0
  640. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +93 -0
  641. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  642. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +302 -0
  643. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +92 -0
  644. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +117 -0
  645. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +92 -0
  646. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  647. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +144 -0
  648. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +139 -0
  649. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  650. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +89 -0
  651. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +161 -0
  652. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +206 -0
  653. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  654. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  655. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
  656. vllm/model_executor/layers/quantization/kv_cache.py +143 -0
  657. vllm/model_executor/layers/quantization/modelopt.py +1596 -0
  658. vllm/model_executor/layers/quantization/moe_wna16.py +484 -0
  659. vllm/model_executor/layers/quantization/mxfp4.py +988 -0
  660. vllm/model_executor/layers/quantization/petit.py +306 -0
  661. vllm/model_executor/layers/quantization/ptpc_fp8.py +129 -0
  662. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  663. vllm/model_executor/layers/quantization/quark/quark.py +432 -0
  664. vllm/model_executor/layers/quantization/quark/quark_moe.py +561 -0
  665. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  666. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  667. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +239 -0
  668. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +163 -0
  669. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  670. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  671. vllm/model_executor/layers/quantization/rtn.py +466 -0
  672. vllm/model_executor/layers/quantization/schema.py +86 -0
  673. vllm/model_executor/layers/quantization/torchao.py +214 -0
  674. vllm/model_executor/layers/quantization/tpu_int8.py +125 -0
  675. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  676. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  677. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +210 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  763. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  764. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  765. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  766. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  767. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  768. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  769. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  770. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  771. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  772. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  773. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  774. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  775. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  776. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  777. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  778. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  779. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  780. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  781. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  786. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  888. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  889. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +79 -0
  890. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +248 -0
  891. vllm/model_executor/layers/quantization/utils/fp8_utils.py +949 -0
  892. vllm/model_executor/layers/quantization/utils/gptq_utils.py +146 -0
  893. vllm/model_executor/layers/quantization/utils/int8_utils.py +492 -0
  894. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  895. vllm/model_executor/layers/quantization/utils/machete_utils.py +50 -0
  896. vllm/model_executor/layers/quantization/utils/marlin_utils.py +479 -0
  897. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +396 -0
  898. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +345 -0
  899. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  900. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  901. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +141 -0
  902. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +20 -0
  903. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +137 -0
  904. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +59 -0
  905. vllm/model_executor/layers/quantization/utils/petit_utils.py +122 -0
  906. vllm/model_executor/layers/quantization/utils/quant_utils.py +641 -0
  907. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +458 -0
  908. vllm/model_executor/layers/resampler.py +270 -0
  909. vllm/model_executor/layers/rotary_embedding/__init__.py +204 -0
  910. vllm/model_executor/layers/rotary_embedding/base.py +177 -0
  911. vllm/model_executor/layers/rotary_embedding/common.py +150 -0
  912. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +138 -0
  913. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +197 -0
  914. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +41 -0
  915. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +67 -0
  916. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +80 -0
  917. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  918. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  919. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +81 -0
  920. vllm/model_executor/layers/rotary_embedding/mrope.py +1321 -0
  921. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +42 -0
  922. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +129 -0
  923. vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py +86 -0
  924. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +68 -0
  925. vllm/model_executor/layers/shared_fused_moe/__init__.py +6 -0
  926. vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py +56 -0
  927. vllm/model_executor/layers/utils.py +195 -0
  928. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  929. vllm/model_executor/model_loader/__init__.py +138 -0
  930. vllm/model_executor/model_loader/base_loader.py +52 -0
  931. vllm/model_executor/model_loader/bitsandbytes_loader.py +788 -0
  932. vllm/model_executor/model_loader/default_loader.py +277 -0
  933. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  934. vllm/model_executor/model_loader/gguf_loader.py +155 -0
  935. vllm/model_executor/model_loader/runai_streamer_loader.py +104 -0
  936. vllm/model_executor/model_loader/sharded_state_loader.py +199 -0
  937. vllm/model_executor/model_loader/tensorizer.py +738 -0
  938. vllm/model_executor/model_loader/tensorizer_loader.py +143 -0
  939. vllm/model_executor/model_loader/tpu.py +114 -0
  940. vllm/model_executor/model_loader/utils.py +292 -0
  941. vllm/model_executor/model_loader/weight_utils.py +990 -0
  942. vllm/model_executor/models/__init__.py +33 -0
  943. vllm/model_executor/models/adapters.py +542 -0
  944. vllm/model_executor/models/aimv2.py +246 -0
  945. vllm/model_executor/models/apertus.py +579 -0
  946. vllm/model_executor/models/arcee.py +422 -0
  947. vllm/model_executor/models/arctic.py +558 -0
  948. vllm/model_executor/models/aria.py +650 -0
  949. vllm/model_executor/models/aya_vision.py +468 -0
  950. vllm/model_executor/models/baichuan.py +474 -0
  951. vllm/model_executor/models/bailing_moe.py +642 -0
  952. vllm/model_executor/models/bamba.py +514 -0
  953. vllm/model_executor/models/bert.py +665 -0
  954. vllm/model_executor/models/bert_with_rope.py +687 -0
  955. vllm/model_executor/models/blip.py +339 -0
  956. vllm/model_executor/models/blip2.py +712 -0
  957. vllm/model_executor/models/bloom.py +374 -0
  958. vllm/model_executor/models/chameleon.py +1139 -0
  959. vllm/model_executor/models/chatglm.py +476 -0
  960. vllm/model_executor/models/clip.py +407 -0
  961. vllm/model_executor/models/cohere2_vision.py +481 -0
  962. vllm/model_executor/models/commandr.py +465 -0
  963. vllm/model_executor/models/config.py +445 -0
  964. vllm/model_executor/models/dbrx.py +471 -0
  965. vllm/model_executor/models/deepseek.py +497 -0
  966. vllm/model_executor/models/deepseek_eagle.py +240 -0
  967. vllm/model_executor/models/deepseek_mtp.py +289 -0
  968. vllm/model_executor/models/deepseek_v2.py +1444 -0
  969. vllm/model_executor/models/deepseek_vl2.py +658 -0
  970. vllm/model_executor/models/dots1.py +546 -0
  971. vllm/model_executor/models/dots_ocr.py +873 -0
  972. vllm/model_executor/models/ernie45.py +43 -0
  973. vllm/model_executor/models/ernie45_moe.py +607 -0
  974. vllm/model_executor/models/ernie45_vl.py +1527 -0
  975. vllm/model_executor/models/ernie45_vl_moe.py +727 -0
  976. vllm/model_executor/models/ernie_mtp.py +268 -0
  977. vllm/model_executor/models/exaone.py +550 -0
  978. vllm/model_executor/models/exaone4.py +533 -0
  979. vllm/model_executor/models/fairseq2_llama.py +154 -0
  980. vllm/model_executor/models/falcon.py +509 -0
  981. vllm/model_executor/models/falcon_h1.py +674 -0
  982. vllm/model_executor/models/fuyu.py +399 -0
  983. vllm/model_executor/models/gemma.py +425 -0
  984. vllm/model_executor/models/gemma2.py +422 -0
  985. vllm/model_executor/models/gemma3.py +555 -0
  986. vllm/model_executor/models/gemma3_mm.py +721 -0
  987. vllm/model_executor/models/gemma3n.py +1113 -0
  988. vllm/model_executor/models/gemma3n_mm.py +761 -0
  989. vllm/model_executor/models/glm.py +23 -0
  990. vllm/model_executor/models/glm4.py +304 -0
  991. vllm/model_executor/models/glm4_1v.py +1690 -0
  992. vllm/model_executor/models/glm4_moe.py +727 -0
  993. vllm/model_executor/models/glm4_moe_mtp.py +301 -0
  994. vllm/model_executor/models/glm4v.py +654 -0
  995. vllm/model_executor/models/gpt2.py +380 -0
  996. vllm/model_executor/models/gpt_bigcode.py +344 -0
  997. vllm/model_executor/models/gpt_j.py +339 -0
  998. vllm/model_executor/models/gpt_neox.py +330 -0
  999. vllm/model_executor/models/gpt_oss.py +712 -0
  1000. vllm/model_executor/models/granite.py +489 -0
  1001. vllm/model_executor/models/granite_speech.py +794 -0
  1002. vllm/model_executor/models/granitemoe.py +550 -0
  1003. vllm/model_executor/models/granitemoehybrid.py +614 -0
  1004. vllm/model_executor/models/granitemoeshared.py +332 -0
  1005. vllm/model_executor/models/gritlm.py +262 -0
  1006. vllm/model_executor/models/grok1.py +547 -0
  1007. vllm/model_executor/models/h2ovl.py +536 -0
  1008. vllm/model_executor/models/hunyuan_v1.py +1042 -0
  1009. vllm/model_executor/models/hyperclovax_vision.py +1192 -0
  1010. vllm/model_executor/models/idefics2_vision_model.py +417 -0
  1011. vllm/model_executor/models/idefics3.py +756 -0
  1012. vllm/model_executor/models/interfaces.py +959 -0
  1013. vllm/model_executor/models/interfaces_base.py +192 -0
  1014. vllm/model_executor/models/intern_vit.py +441 -0
  1015. vllm/model_executor/models/internlm2.py +450 -0
  1016. vllm/model_executor/models/internlm2_ve.py +148 -0
  1017. vllm/model_executor/models/interns1.py +838 -0
  1018. vllm/model_executor/models/interns1_vit.py +418 -0
  1019. vllm/model_executor/models/internvl.py +1423 -0
  1020. vllm/model_executor/models/jais.py +373 -0
  1021. vllm/model_executor/models/jamba.py +591 -0
  1022. vllm/model_executor/models/jina_vl.py +144 -0
  1023. vllm/model_executor/models/keye.py +1680 -0
  1024. vllm/model_executor/models/keye_vl1_5.py +602 -0
  1025. vllm/model_executor/models/kimi_vl.py +618 -0
  1026. vllm/model_executor/models/lfm2.py +548 -0
  1027. vllm/model_executor/models/llama.py +669 -0
  1028. vllm/model_executor/models/llama4.py +746 -0
  1029. vllm/model_executor/models/llama4_eagle.py +239 -0
  1030. vllm/model_executor/models/llama_eagle.py +179 -0
  1031. vllm/model_executor/models/llama_eagle3.py +296 -0
  1032. vllm/model_executor/models/llava.py +870 -0
  1033. vllm/model_executor/models/llava_next.py +571 -0
  1034. vllm/model_executor/models/llava_next_video.py +476 -0
  1035. vllm/model_executor/models/llava_onevision.py +942 -0
  1036. vllm/model_executor/models/longcat_flash.py +715 -0
  1037. vllm/model_executor/models/longcat_flash_mtp.py +352 -0
  1038. vllm/model_executor/models/mamba.py +275 -0
  1039. vllm/model_executor/models/mamba2.py +291 -0
  1040. vllm/model_executor/models/medusa.py +169 -0
  1041. vllm/model_executor/models/midashenglm.py +792 -0
  1042. vllm/model_executor/models/mimo.py +188 -0
  1043. vllm/model_executor/models/mimo_mtp.py +280 -0
  1044. vllm/model_executor/models/minicpm.py +631 -0
  1045. vllm/model_executor/models/minicpm3.py +230 -0
  1046. vllm/model_executor/models/minicpm_eagle.py +389 -0
  1047. vllm/model_executor/models/minicpmo.py +770 -0
  1048. vllm/model_executor/models/minicpmv.py +1784 -0
  1049. vllm/model_executor/models/minimax_text_01.py +986 -0
  1050. vllm/model_executor/models/minimax_vl_01.py +426 -0
  1051. vllm/model_executor/models/mistral3.py +628 -0
  1052. vllm/model_executor/models/mixtral.py +606 -0
  1053. vllm/model_executor/models/mllama4.py +1076 -0
  1054. vllm/model_executor/models/mlp_speculator.py +206 -0
  1055. vllm/model_executor/models/modernbert.py +374 -0
  1056. vllm/model_executor/models/module_mapping.py +72 -0
  1057. vllm/model_executor/models/molmo.py +1567 -0
  1058. vllm/model_executor/models/moonvit.py +673 -0
  1059. vllm/model_executor/models/motif.py +345 -0
  1060. vllm/model_executor/models/mpt.py +329 -0
  1061. vllm/model_executor/models/nano_nemotron_vl.py +1394 -0
  1062. vllm/model_executor/models/nemotron.py +507 -0
  1063. vllm/model_executor/models/nemotron_h.py +565 -0
  1064. vllm/model_executor/models/nemotron_nas.py +481 -0
  1065. vllm/model_executor/models/nemotron_vl.py +652 -0
  1066. vllm/model_executor/models/nvlm_d.py +203 -0
  1067. vllm/model_executor/models/olmo.py +404 -0
  1068. vllm/model_executor/models/olmo2.py +439 -0
  1069. vllm/model_executor/models/olmoe.py +483 -0
  1070. vllm/model_executor/models/opt.py +412 -0
  1071. vllm/model_executor/models/orion.py +348 -0
  1072. vllm/model_executor/models/ovis.py +559 -0
  1073. vllm/model_executor/models/ovis2_5.py +642 -0
  1074. vllm/model_executor/models/paligemma.py +411 -0
  1075. vllm/model_executor/models/persimmon.py +343 -0
  1076. vllm/model_executor/models/phi.py +356 -0
  1077. vllm/model_executor/models/phi3.py +19 -0
  1078. vllm/model_executor/models/phi3v.py +698 -0
  1079. vllm/model_executor/models/phi4_multimodal.py +1475 -0
  1080. vllm/model_executor/models/phi4mm.py +1279 -0
  1081. vllm/model_executor/models/phi4mm_audio.py +1254 -0
  1082. vllm/model_executor/models/phi4mm_utils.py +1875 -0
  1083. vllm/model_executor/models/phimoe.py +679 -0
  1084. vllm/model_executor/models/pixtral.py +1345 -0
  1085. vllm/model_executor/models/plamo2.py +978 -0
  1086. vllm/model_executor/models/qwen.py +361 -0
  1087. vllm/model_executor/models/qwen2.py +523 -0
  1088. vllm/model_executor/models/qwen2_5_omni_thinker.py +984 -0
  1089. vllm/model_executor/models/qwen2_5_vl.py +1481 -0
  1090. vllm/model_executor/models/qwen2_audio.py +489 -0
  1091. vllm/model_executor/models/qwen2_moe.py +558 -0
  1092. vllm/model_executor/models/qwen2_rm.py +122 -0
  1093. vllm/model_executor/models/qwen2_vl.py +1670 -0
  1094. vllm/model_executor/models/qwen3.py +341 -0
  1095. vllm/model_executor/models/qwen3_moe.py +692 -0
  1096. vllm/model_executor/models/qwen3_next.py +1266 -0
  1097. vllm/model_executor/models/qwen3_next_mtp.py +281 -0
  1098. vllm/model_executor/models/qwen3_vl.py +1613 -0
  1099. vllm/model_executor/models/qwen3_vl_moe.py +358 -0
  1100. vllm/model_executor/models/qwen_vl.py +795 -0
  1101. vllm/model_executor/models/radio.py +576 -0
  1102. vllm/model_executor/models/registry.py +990 -0
  1103. vllm/model_executor/models/roberta.py +252 -0
  1104. vllm/model_executor/models/rvl.py +103 -0
  1105. vllm/model_executor/models/seed_oss.py +485 -0
  1106. vllm/model_executor/models/siglip.py +540 -0
  1107. vllm/model_executor/models/siglip2navit.py +689 -0
  1108. vllm/model_executor/models/skyworkr1v.py +911 -0
  1109. vllm/model_executor/models/smolvlm.py +44 -0
  1110. vllm/model_executor/models/solar.py +504 -0
  1111. vllm/model_executor/models/stablelm.py +341 -0
  1112. vllm/model_executor/models/starcoder2.py +354 -0
  1113. vllm/model_executor/models/step3_text.py +510 -0
  1114. vllm/model_executor/models/step3_vl.py +1072 -0
  1115. vllm/model_executor/models/swin.py +475 -0
  1116. vllm/model_executor/models/tarsier.py +639 -0
  1117. vllm/model_executor/models/telechat2.py +151 -0
  1118. vllm/model_executor/models/teleflm.py +79 -0
  1119. vllm/model_executor/models/terratorch.py +294 -0
  1120. vllm/model_executor/models/transformers.py +948 -0
  1121. vllm/model_executor/models/ultravox.py +654 -0
  1122. vllm/model_executor/models/utils.py +808 -0
  1123. vllm/model_executor/models/vision.py +404 -0
  1124. vllm/model_executor/models/voxtral.py +786 -0
  1125. vllm/model_executor/models/whisper.py +963 -0
  1126. vllm/model_executor/models/zamba2.py +960 -0
  1127. vllm/model_executor/parameter.py +620 -0
  1128. vllm/model_executor/utils.py +86 -0
  1129. vllm/model_executor/warmup/__init__.py +0 -0
  1130. vllm/model_executor/warmup/deep_gemm_warmup.py +230 -0
  1131. vllm/model_executor/warmup/kernel_warmup.py +83 -0
  1132. vllm/multimodal/__init__.py +33 -0
  1133. vllm/multimodal/audio.py +116 -0
  1134. vllm/multimodal/base.py +27 -0
  1135. vllm/multimodal/cache.py +697 -0
  1136. vllm/multimodal/evs.py +273 -0
  1137. vllm/multimodal/hasher.py +102 -0
  1138. vllm/multimodal/image.py +130 -0
  1139. vllm/multimodal/inputs.py +987 -0
  1140. vllm/multimodal/parse.py +511 -0
  1141. vllm/multimodal/processing.py +2148 -0
  1142. vllm/multimodal/profiling.py +284 -0
  1143. vllm/multimodal/registry.py +345 -0
  1144. vllm/multimodal/utils.py +503 -0
  1145. vllm/multimodal/video.py +319 -0
  1146. vllm/outputs.py +324 -0
  1147. vllm/platforms/__init__.py +263 -0
  1148. vllm/platforms/cpu.py +340 -0
  1149. vllm/platforms/cuda.py +668 -0
  1150. vllm/platforms/interface.py +620 -0
  1151. vllm/platforms/rocm.py +497 -0
  1152. vllm/platforms/tpu.py +233 -0
  1153. vllm/platforms/xpu.py +243 -0
  1154. vllm/plugins/__init__.py +72 -0
  1155. vllm/plugins/io_processors/__init__.py +68 -0
  1156. vllm/plugins/io_processors/interface.py +67 -0
  1157. vllm/plugins/lora_resolvers/README.md +16 -0
  1158. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1159. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  1160. vllm/pooling_params.py +191 -0
  1161. vllm/profiler/__init__.py +0 -0
  1162. vllm/profiler/layerwise_profile.py +375 -0
  1163. vllm/profiler/utils.py +148 -0
  1164. vllm/py.typed +2 -0
  1165. vllm/ray/__init__.py +0 -0
  1166. vllm/ray/lazy_utils.py +22 -0
  1167. vllm/ray/ray_env.py +72 -0
  1168. vllm/reasoning/__init__.py +29 -0
  1169. vllm/reasoning/abs_reasoning_parsers.py +202 -0
  1170. vllm/reasoning/basic_parsers.py +156 -0
  1171. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1172. vllm/reasoning/glm4_moe_reasoning_parser.py +151 -0
  1173. vllm/reasoning/gptoss_reasoning_parser.py +87 -0
  1174. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1175. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +245 -0
  1176. vllm/reasoning/mistral_reasoning_parser.py +56 -0
  1177. vllm/reasoning/qwen3_reasoning_parser.py +72 -0
  1178. vllm/reasoning/seedoss_reasoning_parser.py +28 -0
  1179. vllm/reasoning/step3_reasoning_parser.py +109 -0
  1180. vllm/sampling_params.py +593 -0
  1181. vllm/scalar_type.py +349 -0
  1182. vllm/scripts.py +15 -0
  1183. vllm/sequence.py +103 -0
  1184. vllm/tasks.py +11 -0
  1185. vllm/test_utils.py +129 -0
  1186. vllm/third_party/__init__.py +0 -0
  1187. vllm/third_party/pynvml.py +6140 -0
  1188. vllm/tracing.py +136 -0
  1189. vllm/transformers_utils/__init__.py +24 -0
  1190. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1191. vllm/transformers_utils/chat_templates/registry.py +70 -0
  1192. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1193. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1194. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1195. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1196. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1197. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1198. vllm/transformers_utils/config.py +1102 -0
  1199. vllm/transformers_utils/config_parser_base.py +20 -0
  1200. vllm/transformers_utils/configs/__init__.py +63 -0
  1201. vllm/transformers_utils/configs/arctic.py +207 -0
  1202. vllm/transformers_utils/configs/chatglm.py +72 -0
  1203. vllm/transformers_utils/configs/deepseek_v3.py +101 -0
  1204. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1205. vllm/transformers_utils/configs/dotsocr.py +69 -0
  1206. vllm/transformers_utils/configs/eagle.py +84 -0
  1207. vllm/transformers_utils/configs/falcon.py +90 -0
  1208. vllm/transformers_utils/configs/jais.py +237 -0
  1209. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1210. vllm/transformers_utils/configs/medusa.py +63 -0
  1211. vllm/transformers_utils/configs/midashenglm.py +101 -0
  1212. vllm/transformers_utils/configs/mistral.py +165 -0
  1213. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1214. vllm/transformers_utils/configs/moonvit.py +33 -0
  1215. vllm/transformers_utils/configs/nemotron.py +205 -0
  1216. vllm/transformers_utils/configs/nemotron_h.py +259 -0
  1217. vllm/transformers_utils/configs/nemotron_vl.py +56 -0
  1218. vllm/transformers_utils/configs/olmo3.py +80 -0
  1219. vllm/transformers_utils/configs/ovis.py +176 -0
  1220. vllm/transformers_utils/configs/qwen3_next.py +275 -0
  1221. vllm/transformers_utils/configs/radio.py +91 -0
  1222. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1223. vllm/transformers_utils/configs/speculators/algos.py +32 -0
  1224. vllm/transformers_utils/configs/speculators/base.py +111 -0
  1225. vllm/transformers_utils/configs/step3_vl.py +123 -0
  1226. vllm/transformers_utils/configs/ultravox.py +116 -0
  1227. vllm/transformers_utils/detokenizer_utils.py +199 -0
  1228. vllm/transformers_utils/dynamic_module.py +60 -0
  1229. vllm/transformers_utils/processor.py +299 -0
  1230. vllm/transformers_utils/processors/__init__.py +16 -0
  1231. vllm/transformers_utils/processors/deepseek_vl2.py +362 -0
  1232. vllm/transformers_utils/processors/ovis.py +420 -0
  1233. vllm/transformers_utils/processors/ovis2_5.py +458 -0
  1234. vllm/transformers_utils/runai_utils.py +104 -0
  1235. vllm/transformers_utils/s3_utils.py +93 -0
  1236. vllm/transformers_utils/tokenizer.py +292 -0
  1237. vllm/transformers_utils/tokenizer_base.py +154 -0
  1238. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1239. vllm/transformers_utils/tokenizers/mistral.py +521 -0
  1240. vllm/transformers_utils/utils.py +108 -0
  1241. vllm/triton_utils/__init__.py +16 -0
  1242. vllm/triton_utils/importing.py +96 -0
  1243. vllm/usage/__init__.py +0 -0
  1244. vllm/usage/usage_lib.py +259 -0
  1245. vllm/utils/__init__.py +3566 -0
  1246. vllm/utils/deep_gemm.py +319 -0
  1247. vllm/utils/flashinfer.py +443 -0
  1248. vllm/utils/jsontree.py +178 -0
  1249. vllm/utils/tensor_schema.py +235 -0
  1250. vllm/v1/__init__.py +0 -0
  1251. vllm/v1/attention/__init__.py +0 -0
  1252. vllm/v1/attention/backends/__init__.py +0 -0
  1253. vllm/v1/attention/backends/cpu_attn.py +919 -0
  1254. vllm/v1/attention/backends/flash_attn.py +795 -0
  1255. vllm/v1/attention/backends/flashinfer.py +1181 -0
  1256. vllm/v1/attention/backends/flex_attention.py +861 -0
  1257. vllm/v1/attention/backends/gdn_attn.py +332 -0
  1258. vllm/v1/attention/backends/linear_attn.py +67 -0
  1259. vllm/v1/attention/backends/mamba1_attn.py +81 -0
  1260. vllm/v1/attention/backends/mamba2_attn.py +232 -0
  1261. vllm/v1/attention/backends/mamba_attn.py +52 -0
  1262. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1263. vllm/v1/attention/backends/mla/common.py +1783 -0
  1264. vllm/v1/attention/backends/mla/cutlass_mla.py +248 -0
  1265. vllm/v1/attention/backends/mla/flashattn_mla.py +271 -0
  1266. vllm/v1/attention/backends/mla/flashinfer_mla.py +114 -0
  1267. vllm/v1/attention/backends/mla/flashmla.py +203 -0
  1268. vllm/v1/attention/backends/mla/flashmla_sparse.py +544 -0
  1269. vllm/v1/attention/backends/mla/indexer.py +342 -0
  1270. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +255 -0
  1271. vllm/v1/attention/backends/mla/triton_mla.py +177 -0
  1272. vllm/v1/attention/backends/pallas.py +409 -0
  1273. vllm/v1/attention/backends/rocm_aiter_fa.py +549 -0
  1274. vllm/v1/attention/backends/rocm_attn.py +426 -0
  1275. vllm/v1/attention/backends/short_conv_attn.py +94 -0
  1276. vllm/v1/attention/backends/tree_attn.py +451 -0
  1277. vllm/v1/attention/backends/triton_attn.py +361 -0
  1278. vllm/v1/attention/backends/utils.py +990 -0
  1279. vllm/v1/attention/backends/xformers.py +438 -0
  1280. vllm/v1/core/__init__.py +0 -0
  1281. vllm/v1/core/block_pool.py +416 -0
  1282. vllm/v1/core/encoder_cache_manager.py +333 -0
  1283. vllm/v1/core/kv_cache_coordinator.py +440 -0
  1284. vllm/v1/core/kv_cache_manager.py +399 -0
  1285. vllm/v1/core/kv_cache_utils.py +1291 -0
  1286. vllm/v1/core/sched/__init__.py +0 -0
  1287. vllm/v1/core/sched/async_scheduler.py +47 -0
  1288. vllm/v1/core/sched/interface.py +158 -0
  1289. vllm/v1/core/sched/output.py +166 -0
  1290. vllm/v1/core/sched/request_queue.py +224 -0
  1291. vllm/v1/core/sched/scheduler.py +1296 -0
  1292. vllm/v1/core/sched/utils.py +69 -0
  1293. vllm/v1/core/single_type_kv_cache_manager.py +671 -0
  1294. vllm/v1/cudagraph_dispatcher.py +125 -0
  1295. vllm/v1/engine/__init__.py +203 -0
  1296. vllm/v1/engine/async_llm.py +742 -0
  1297. vllm/v1/engine/coordinator.py +357 -0
  1298. vllm/v1/engine/core.py +1235 -0
  1299. vllm/v1/engine/core_client.py +1334 -0
  1300. vllm/v1/engine/detokenizer.py +349 -0
  1301. vllm/v1/engine/exceptions.py +17 -0
  1302. vllm/v1/engine/llm_engine.py +370 -0
  1303. vllm/v1/engine/logprobs.py +201 -0
  1304. vllm/v1/engine/output_processor.py +576 -0
  1305. vllm/v1/engine/parallel_sampling.py +133 -0
  1306. vllm/v1/engine/processor.py +545 -0
  1307. vllm/v1/engine/utils.py +860 -0
  1308. vllm/v1/executor/__init__.py +0 -0
  1309. vllm/v1/executor/abstract.py +137 -0
  1310. vllm/v1/executor/multiproc_executor.py +726 -0
  1311. vllm/v1/executor/ray_distributed_executor.py +108 -0
  1312. vllm/v1/executor/utils.py +23 -0
  1313. vllm/v1/kv_cache_interface.py +375 -0
  1314. vllm/v1/kv_offload/__init__.py +0 -0
  1315. vllm/v1/kv_offload/abstract.py +165 -0
  1316. vllm/v1/kv_offload/backend.py +96 -0
  1317. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1318. vllm/v1/kv_offload/backends/cpu.py +61 -0
  1319. vllm/v1/kv_offload/cpu.py +75 -0
  1320. vllm/v1/kv_offload/factory.py +56 -0
  1321. vllm/v1/kv_offload/lru_manager.py +132 -0
  1322. vllm/v1/kv_offload/mediums.py +39 -0
  1323. vllm/v1/kv_offload/spec.py +61 -0
  1324. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1325. vllm/v1/kv_offload/worker/cpu_gpu.py +171 -0
  1326. vllm/v1/kv_offload/worker/worker.py +142 -0
  1327. vllm/v1/metrics/__init__.py +0 -0
  1328. vllm/v1/metrics/loggers.py +741 -0
  1329. vllm/v1/metrics/prometheus.py +82 -0
  1330. vllm/v1/metrics/ray_wrappers.py +152 -0
  1331. vllm/v1/metrics/reader.py +246 -0
  1332. vllm/v1/metrics/stats.py +257 -0
  1333. vllm/v1/outputs.py +161 -0
  1334. vllm/v1/pool/__init__.py +0 -0
  1335. vllm/v1/pool/metadata.py +77 -0
  1336. vllm/v1/request.py +241 -0
  1337. vllm/v1/sample/__init__.py +0 -0
  1338. vllm/v1/sample/logits_processor/__init__.py +294 -0
  1339. vllm/v1/sample/logits_processor/builtin.py +275 -0
  1340. vllm/v1/sample/logits_processor/interface.py +97 -0
  1341. vllm/v1/sample/logits_processor/state.py +161 -0
  1342. vllm/v1/sample/metadata.py +43 -0
  1343. vllm/v1/sample/ops/__init__.py +0 -0
  1344. vllm/v1/sample/ops/bad_words.py +39 -0
  1345. vllm/v1/sample/ops/logprobs.py +26 -0
  1346. vllm/v1/sample/ops/penalties.py +43 -0
  1347. vllm/v1/sample/ops/topk_topp_sampler.py +292 -0
  1348. vllm/v1/sample/rejection_sampler.py +623 -0
  1349. vllm/v1/sample/sampler.py +285 -0
  1350. vllm/v1/sample/tpu/__init__.py +0 -0
  1351. vllm/v1/sample/tpu/metadata.py +124 -0
  1352. vllm/v1/sample/tpu/sampler.py +213 -0
  1353. vllm/v1/serial_utils.py +423 -0
  1354. vllm/v1/spec_decode/__init__.py +0 -0
  1355. vllm/v1/spec_decode/eagle.py +1011 -0
  1356. vllm/v1/spec_decode/medusa.py +66 -0
  1357. vllm/v1/spec_decode/metadata.py +62 -0
  1358. vllm/v1/spec_decode/metrics.py +211 -0
  1359. vllm/v1/spec_decode/ngram_proposer.py +276 -0
  1360. vllm/v1/spec_decode/utils.py +14 -0
  1361. vllm/v1/structured_output/__init__.py +295 -0
  1362. vllm/v1/structured_output/backend_guidance.py +245 -0
  1363. vllm/v1/structured_output/backend_lm_format_enforcer.py +167 -0
  1364. vllm/v1/structured_output/backend_outlines.py +320 -0
  1365. vllm/v1/structured_output/backend_types.py +134 -0
  1366. vllm/v1/structured_output/backend_xgrammar.py +327 -0
  1367. vllm/v1/structured_output/request.py +86 -0
  1368. vllm/v1/structured_output/utils.py +454 -0
  1369. vllm/v1/utils.py +396 -0
  1370. vllm/v1/worker/__init__.py +0 -0
  1371. vllm/v1/worker/block_table.py +210 -0
  1372. vllm/v1/worker/cpu_model_runner.py +175 -0
  1373. vllm/v1/worker/cpu_worker.py +156 -0
  1374. vllm/v1/worker/gpu_input_batch.py +863 -0
  1375. vllm/v1/worker/gpu_model_runner.py +4160 -0
  1376. vllm/v1/worker/gpu_ubatch_wrapper.py +399 -0
  1377. vllm/v1/worker/gpu_worker.py +710 -0
  1378. vllm/v1/worker/kv_connector_model_runner_mixin.py +132 -0
  1379. vllm/v1/worker/lora_model_runner_mixin.py +183 -0
  1380. vllm/v1/worker/tpu_input_batch.py +587 -0
  1381. vllm/v1/worker/tpu_model_runner.py +1946 -0
  1382. vllm/v1/worker/tpu_worker.py +346 -0
  1383. vllm/v1/worker/ubatch_splitting.py +192 -0
  1384. vllm/v1/worker/ubatch_utils.py +27 -0
  1385. vllm/v1/worker/ubatching.py +224 -0
  1386. vllm/v1/worker/utils.py +344 -0
  1387. vllm/v1/worker/worker_base.py +65 -0
  1388. vllm/v1/worker/xpu_model_runner.py +57 -0
  1389. vllm/v1/worker/xpu_worker.py +179 -0
  1390. vllm/version.py +41 -0
  1391. vllm/vllm_flash_attn/.gitkeep +0 -0
  1392. vllm/worker/__init__.py +0 -0
  1393. vllm/worker/worker_base.py +279 -0
  1394. vllm_cpu-0.11.0.post2.dist-info/METADATA +348 -0
  1395. vllm_cpu-0.11.0.post2.dist-info/RECORD +1398 -0
  1396. vllm_cpu-0.11.0.post2.dist-info/WHEEL +5 -0
  1397. vllm_cpu-0.11.0.post2.dist-info/entry_points.txt +5 -0
  1398. vllm_cpu-0.11.0.post2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1473 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+ import contextlib
4
+ import copy
5
+ import logging
6
+ import math
7
+ import queue
8
+ import threading
9
+ import time
10
+ import uuid
11
+ from collections import defaultdict
12
+ from collections.abc import Iterator
13
+ from concurrent.futures import Future, ThreadPoolExecutor
14
+ from dataclasses import dataclass
15
+ from typing import TYPE_CHECKING, Any, Optional, Union
16
+
17
+ import msgspec
18
+ import numpy as np
19
+ import torch
20
+ import zmq
21
+
22
+ from vllm import envs
23
+ from vllm.attention.selector import backend_name_to_enum, get_attn_backend
24
+ from vllm.config import VllmConfig
25
+ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
26
+ CopyBlocksOp, KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
27
+ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
28
+ KVConnectorStats)
29
+ from vllm.distributed.parallel_state import (
30
+ get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size,
31
+ get_tp_group)
32
+ from vllm.distributed.utils import divide
33
+ from vllm.forward_context import ForwardContext
34
+ from vllm.logger import init_logger
35
+ from vllm.platforms import _Backend, current_platform
36
+ from vllm.utils import make_zmq_path, make_zmq_socket
37
+ from vllm.v1.attention.backends.utils import get_kv_cache_layout
38
+ from vllm.v1.core.sched.output import SchedulerOutput
39
+
40
+ if TYPE_CHECKING:
41
+ from vllm.attention.backends.abstract import AttentionMetadata
42
+ from vllm.v1.core.kv_cache_manager import KVCacheBlocks
43
+ from vllm.v1.request import Request
44
+
45
+ Transfer = tuple[int, float] # (xfer_handle, start_time)
46
+ EngineId = str
47
+ ReqId = str
48
+
49
+ GET_META_MSG = b"get_meta_msg"
50
+
51
+ logger = init_logger(__name__)
52
+
53
+ # Lazy import nixl_wrapper to avoid loading nixl_bindings if nixl is not used
54
+ try:
55
+ from nixl._api import nixl_agent as NixlWrapper
56
+ logger.info("NIXL is available")
57
+ except ImportError:
58
+ logger.warning("NIXL is not available")
59
+ NixlWrapper = None
60
+
61
+ try:
62
+ from nixl._api import nixl_agent_config
63
+ except ImportError:
64
+ nixl_agent_config = None
65
+ logger.warning("NIXL agent config is not available")
66
+
67
+ # Supported platforms and types of kv transfer buffer.
68
+ # {device: tuple of supported kv buffer types}
69
+ _NIXL_SUPPORTED_DEVICE = {
70
+ "cuda": ("cuda", ),
71
+ "tpu": ("cpu", ),
72
+ "xpu": ("cpu", ),
73
+ }
74
+ # Support for out-of-tree (OOT) platforms by providing a mapping in current_platform
75
+ _NIXL_SUPPORTED_DEVICE.update(current_platform.get_nixl_supported_devices())
76
+
77
+
78
+ class NixlAgentMetadata(
79
+ msgspec.Struct,
80
+ omit_defaults=True, # type: ignore[call-arg]
81
+ # required for @cached_property.
82
+ dict=True):
83
+ engine_id: str
84
+ agent_metadata: bytes
85
+ kv_caches_base_addr: list[int]
86
+ num_blocks: int
87
+ block_lens: list[int]
88
+ attn_backend_name: str
89
+ kv_cache_layout: str
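For illustration, a minimal sketch of how this struct travels over the handshake side channel: it is serialized with msgspec's msgpack encoder and decoded back on the initiator side (the field values below are assumed placeholders, not taken from a real agent; the struct is the one defined just above).

    import msgspec

    meta = NixlAgentMetadata(
        engine_id="engine-0",                 # assumed placeholder values
        agent_metadata=b"nixl-agent-bytes",
        kv_caches_base_addr=[0x7F0000000000],
        num_blocks=1024,
        block_lens=[262144],
        attn_backend_name="FLASH_ATTN",
        kv_cache_layout="HND",
    )
    payload = msgspec.msgpack.Encoder().encode(meta)   # bytes on the wire
    decoded = msgspec.msgpack.Decoder(NixlAgentMetadata).decode(payload)
    assert decoded.engine_id == "engine-0"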
90
+
91
+
92
+ @dataclass
93
+ class ReqMeta:
94
+ local_block_ids: list[int]
95
+ remote_block_ids: list[int]
96
+ remote_host: str
97
+ remote_port: int
98
+ remote_engine_id: str
99
+ tp_size: int
100
+
101
+
102
+ class NixlConnectorMetadata(KVConnectorMetadata):
103
+
104
+ def __init__(self):
105
+ self.reqs_to_recv: dict[ReqId, ReqMeta] = {}
106
+ self.reqs_to_save: dict[ReqId, ReqMeta] = {}
107
+ self.reqs_to_send: dict[ReqId, float] = {}
108
+ self.reqs_in_batch: set[ReqId] = set()
109
+
110
+ def add_new_req(
111
+ self,
112
+ request_id: ReqId,
113
+ local_block_ids: list[int],
114
+ kv_transfer_params: dict[str, Any],
115
+ load_remote_cache: bool = True,
116
+ save_to_host: bool = False,
117
+ ):
118
+ # save and load are mutually exclusive
119
+ assert load_remote_cache ^ save_to_host
120
+ _req = ReqMeta(
121
+ local_block_ids=local_block_ids,
122
+ remote_block_ids=kv_transfer_params["remote_block_ids"],
123
+ remote_engine_id=kv_transfer_params["remote_engine_id"],
124
+ remote_host=kv_transfer_params["remote_host"],
125
+ remote_port=kv_transfer_params["remote_port"],
126
+ # P workers don't need to receive tp_size from proxy here.
127
+ tp_size=kv_transfer_params.get("tp_size", 1),
128
+ )
129
+ if save_to_host:
130
+ self.reqs_to_save[request_id] = _req
131
+ if load_remote_cache:
132
+ self.reqs_to_recv[request_id] = _req
133
+
134
+
135
+ class NixlConnector(KVConnectorBase_V1):
136
+
137
+ def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole):
138
+ assert vllm_config.kv_transfer_config is not None
139
+ assert vllm_config.kv_transfer_config.engine_id is not None
140
+ self.engine_id: EngineId = vllm_config.kv_transfer_config.engine_id
141
+
142
+ if role == KVConnectorRole.SCHEDULER:
143
+ self.connector_scheduler: Optional[NixlConnectorScheduler] = \
144
+ NixlConnectorScheduler(vllm_config, self.engine_id)
145
+ self.connector_worker: Optional[NixlConnectorWorker] = None
146
+ elif role == KVConnectorRole.WORKER:
147
+ self.connector_scheduler = None
148
+ self.connector_worker = NixlConnectorWorker(
149
+ vllm_config, self.engine_id)
150
+
151
+ ############################################################
152
+ # Class Methods
153
+ ############################################################
154
+ @classmethod
155
+ def get_required_kvcache_layout(cls, vllm_config: VllmConfig):
156
+ if vllm_config.model_config is None:
157
+ logger.warning_once("Unable to detect current VLLM config. "
158
+ "Fallback to default kv cache layout.")
159
+ return None
160
+ use_mla = vllm_config.model_config.use_mla
161
+ if use_mla:
162
+ # return None when we have mla
163
+ # as the layout should not matter in that case,
164
+ # so we fall back to the default behavior.
165
+ return None
166
+ logger.info_once("NixlConnector setting KV cache "
167
+ "layout to HND for better xfer performance.")
168
+ return "HND"
169
+
170
+ ############################################################
171
+ # Scheduler Side Methods
172
+ ############################################################
173
+
174
+ def get_num_new_matched_tokens(
175
+ self, request: "Request",
176
+ num_computed_tokens: int) -> tuple[Optional[int], bool]:
177
+ assert self.connector_scheduler is not None
178
+ return self.connector_scheduler.get_num_new_matched_tokens(
179
+ request, num_computed_tokens)
180
+
181
+ def update_state_after_alloc(self, request: "Request",
182
+ blocks: "KVCacheBlocks",
183
+ num_external_tokens: int):
184
+ assert self.connector_scheduler is not None
185
+ return self.connector_scheduler.update_state_after_alloc(
186
+ request, blocks, num_external_tokens)
187
+
188
+ def build_connector_meta(
189
+ self,
190
+ scheduler_output: SchedulerOutput,
191
+ ) -> KVConnectorMetadata:
192
+ assert self.connector_scheduler is not None
193
+ return self.connector_scheduler.build_connector_meta(scheduler_output)
194
+
195
+ def request_finished(
196
+ self,
197
+ request: "Request",
198
+ block_ids: list[int],
199
+ ) -> tuple[bool, Optional[dict[str, Any]]]:
200
+ assert self.connector_scheduler is not None
201
+ return self.connector_scheduler.request_finished(request, block_ids)
202
+
203
+ ############################################################
204
+ # Worker Side Methods
205
+ ############################################################
206
+ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
207
+ assert self.connector_worker is not None
208
+ self.connector_worker.register_kv_caches(kv_caches)
209
+
210
+ def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp):
211
+ assert self.connector_worker is not None
212
+ self.connector_worker.set_host_xfer_buffer_ops(copy_operation)
213
+
214
+ def get_finished(self,
215
+ finished_req_ids: set[str]) -> tuple[set[str], set[str]]:
216
+ """Get the finished recving and sending requests."""
217
+ assert self.connector_worker is not None
218
+ return self.connector_worker.get_finished()
219
+
220
+ def get_kv_connector_stats(self) -> Optional[KVConnectorStats]:
221
+ assert self.connector_worker is not None
222
+ return self.connector_worker.get_kv_connector_stats()
223
+
224
+ @classmethod
225
+ def build_kv_connector_stats(
226
+ cls,
227
+ data: Optional[dict[str,
228
+ Any]] = None) -> Optional[KVConnectorStats]:
229
+ return NixlKVConnectorStats(data=data) if data is not None \
230
+ else NixlKVConnectorStats()
231
+
232
+ def start_load_kv(self, forward_context: "ForwardContext",
233
+ **kwargs) -> None:
234
+ assert self.connector_worker is not None
235
+ assert isinstance(self._connector_metadata, NixlConnectorMetadata)
236
+ self.connector_worker.start_load_kv(self._connector_metadata)
237
+
238
+ def wait_for_layer_load(self, layer_name: str) -> None:
239
+ """NixlConnector does not do layerwise saving."""
240
+ pass
241
+
242
+ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
243
+ attn_metadata: "AttentionMetadata", **kwargs) -> None:
244
+ """NixlConnector does not save explicitly."""
245
+ pass
246
+
247
+ def wait_for_save(self):
248
+ assert self.connector_worker is not None
249
+ assert isinstance(self._connector_metadata, NixlConnectorMetadata)
250
+ if self.connector_worker.use_host_buffer and \
251
+ self.connector_worker.copy_blocks:
252
+ self.connector_worker.save_kv_to_host(self._connector_metadata)
253
+
254
+ def shutdown(self):
255
+ if self.connector_worker is not None:
256
+ self.connector_worker.shutdown()
257
+
258
+
259
+ class NixlConnectorScheduler:
260
+ """Implementation of Scheduler side methods"""
261
+
262
+ def __init__(self, vllm_config: VllmConfig, engine_id: str):
263
+ self.vllm_config = vllm_config
264
+ self.block_size = vllm_config.cache_config.block_size
265
+ self.engine_id: EngineId = engine_id
266
+ self.side_channel_host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
267
+ self.side_channel_port = (
268
+ envs.VLLM_NIXL_SIDE_CHANNEL_PORT +
269
+ vllm_config.parallel_config.data_parallel_rank *
270
+ vllm_config.parallel_config.tensor_parallel_size)
271
+ self.use_host_buffer = \
272
+ vllm_config.kv_transfer_config.kv_buffer_device == "cpu"
273
+ logger.info("Initializing NIXL Scheduler %s", engine_id)
274
+
275
+ # Requests that need to start recv/send.
276
+ # New requests are added by update_state_after_alloc in
277
+ # the scheduler. Used to make metadata passed to Worker.
278
+ self._reqs_need_recv: dict[ReqId, tuple[Request, list[int]]] = {}
279
+ self._reqs_need_save: dict[ReqId, tuple[Request, list[int]]] = {}
280
+ # Reqs to send and their expiration time
281
+ self._reqs_need_send: dict[ReqId, float] = {}
282
+ self._reqs_in_batch: set[ReqId] = set()
283
+
284
+ def get_num_new_matched_tokens(
285
+ self, request: "Request",
286
+ num_computed_tokens: int) -> tuple[int, bool]:
287
+ """
288
+ For remote prefill, pull all prompt blocks from remote
289
+ asynchronously relative to engine execution.
290
+
291
+ Args:
292
+ request (Request): the request object.
293
+ num_computed_tokens (int): the number of locally
294
+ computed tokens for this request
295
+ Returns:
296
+ * the number of tokens that can be loaded from the
297
+ external KV cache beyond what is already computed.
298
+ * true if the external KV cache tokens will be loaded
299
+ asynchronously (between scheduler steps).
300
+ """
301
+
302
+ params = request.kv_transfer_params
303
+ logger.debug(
304
+ "NIXLConnector get_num_new_matched_tokens: "
305
+ "num_computed_tokens=%s, kv_transfer_params=%s",
306
+ num_computed_tokens, params)
307
+
308
+ if params is not None and params.get("do_remote_prefill"):
309
+ # Remote prefill: get all prompt blocks from remote.
310
+ count = len(request.prompt_token_ids) - num_computed_tokens
311
+ if count > 0:
312
+ return count, True
313
+
314
+ # No remote prefill for this request.
315
+ return 0, False
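As a quick check of the contract above, a standalone sketch (hypothetical helper name and token counts, not part of this file) mirroring the logic: the count is whatever remains of the prompt beyond the locally computed tokens, and the flag marks the load as asynchronous.

    def matched_tokens(prompt_len: int, num_computed: int,
                       do_remote_prefill: bool) -> tuple[int, bool]:
        # Mirrors get_num_new_matched_tokens for a single request.
        if do_remote_prefill:
            count = prompt_len - num_computed
            if count > 0:
                return count, True   # pull `count` tokens asynchronously
        return 0, False              # nothing to load from remote

    assert matched_tokens(96, 16, do_remote_prefill=True) == (80, True)
    assert matched_tokens(96, 16, do_remote_prefill=False) == (0, False)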
316
+
317
+ def update_state_after_alloc(self, request: "Request",
318
+ blocks: "KVCacheBlocks",
319
+ num_external_tokens: int):
320
+
321
+ params = request.kv_transfer_params
322
+ logger.debug(
323
+ "NIXLConnector update_state_after_alloc: "
324
+ "num_external_tokens=%s, kv_transfer_params=%s",
325
+ num_external_tokens, params)
326
+
327
+ if not params:
328
+ return
329
+
330
+ if params.get("do_remote_decode"):
331
+ self._reqs_in_batch.add(request.request_id)
332
+ if self.use_host_buffer and params.get("do_remote_decode"):
333
+ # NOTE: when accelerator is not directly supported by Nixl,
334
+ # prefilled blocks need to be saved to host memory before transfer.
335
+
336
+ # save all blocks
337
+ block_ids = blocks.get_block_ids()[0]
338
+ # TODO: skip the blocks that are already in the host xfer buffer.
339
+ # Currently, the host xfer buffer block is 1-to-1 mapped to device
340
+ # kv blocks, so host blocks won't be flushed as long as its device
341
+ # block is not overwritten; and it will be safe to skip saving them
342
+ # to host xfer buffer.
343
+ if block_ids:
344
+ self._reqs_need_save[request.request_id] = \
345
+ (request, block_ids)
346
+ elif params.get("do_remote_prefill"):
347
+ if params.get("remote_block_ids"):
348
+ if all(p in params for p in ("remote_engine_id", "remote_host",
349
+ "remote_port")):
350
+ # If remote_blocks and num_external_tokens = 0, we have
351
+ # a full prefix cache hit on the D worker. We need to call
352
+ # send_notif in _read_blocks to free the memory on the P.
353
+ local_block_ids = (blocks.get_unhashed_block_ids()
354
+ if num_external_tokens > 0 else [])
355
+ # Get unhashed blocks to pull from remote.
356
+ self._reqs_need_recv[request.request_id] = (
357
+ request, local_block_ids)
358
+
359
+ else:
360
+ logger.warning(
361
+ "Got invalid KVTransferParams: %s. This "
362
+ "request will not utilize KVTransfer", params)
363
+ else:
364
+ assert num_external_tokens == 0
365
+ # Only trigger 1 KV transfer per request.
366
+ params["do_remote_prefill"] = False
367
+
368
+ def build_connector_meta(
369
+ self,
370
+ scheduler_output: SchedulerOutput,
371
+ ) -> KVConnectorMetadata:
372
+ meta = NixlConnectorMetadata()
373
+
374
+ # Loop through scheduled reqs and convert to ReqMeta.
375
+ for req_id, (req, block_ids) in self._reqs_need_recv.items():
376
+ assert req.kv_transfer_params is not None
377
+ meta.add_new_req(
378
+ request_id=req_id,
379
+ local_block_ids=block_ids,
380
+ kv_transfer_params=req.kv_transfer_params,
381
+ load_remote_cache=True,
382
+ save_to_host=False,
383
+ )
384
+
385
+ for req_id, (req, block_ids) in self._reqs_need_save.items():
386
+ assert req.kv_transfer_params is not None
387
+ meta.add_new_req(
388
+ request_id=req_id,
389
+ local_block_ids=block_ids,
390
+ kv_transfer_params=req.kv_transfer_params,
391
+ load_remote_cache=False,
392
+ save_to_host=True,
393
+ )
394
+
395
+ meta.reqs_to_send = self._reqs_need_send
396
+ meta.reqs_in_batch = self._reqs_in_batch
397
+
398
+ # Clear the list once workers start the transfers
399
+ self._reqs_need_recv.clear()
400
+ self._reqs_need_save.clear()
401
+ self._reqs_in_batch = set()
402
+ self._reqs_need_send = {}
403
+
404
+ return meta
405
+
406
+ def request_finished(
407
+ self,
408
+ request: "Request",
409
+ block_ids: list[int],
410
+ ) -> tuple[bool, Optional[dict[str, Any]]]:
411
+ """
412
+ Once a request is finished, determine whether request blocks
413
+ should be freed now or will be sent asynchronously and freed later.
414
+ """
415
+ from vllm.v1.request import RequestStatus
416
+
417
+ params = request.kv_transfer_params
418
+ logger.debug(
419
+ "NIXLConnector request_finished, request_status=%s, "
420
+ "kv_transfer_params=%s", request.status, params)
421
+ if not params:
422
+ return False, None
423
+
424
+ if params.get("do_remote_prefill"):
425
+ # If do_remote_prefill is still True when the request is finished,
426
+ # update_state_after_alloc must not have been called (the request
427
+ # must have been aborted before it was scheduled).
428
+ # To avoid stranding the prefill blocks in the prefill instance,
429
+ # we must add empty block_ids to _reqs_need_recv so that our
430
+ # worker side will notify and free blocks in the prefill instance.
431
+ self._reqs_need_recv[request.request_id] = (request, [])
432
+ params["do_remote_prefill"] = False
433
+ return False, None
434
+
435
+ if (not params.get("do_remote_decode")
436
+ or request.status != RequestStatus.FINISHED_LENGTH_CAPPED):
437
+ return False, None
438
+
439
+ # TODO: check whether block_ids can actually ever be empty. If not, we could
440
+ # remove the conditional below
441
+ delay_free_blocks = len(block_ids) > 0
442
+
443
+ if delay_free_blocks:
444
+ # Prefill request on remote. It will be read from D upon completion
445
+ self._reqs_need_send[request.request_id] = time.perf_counter(
446
+ ) + envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT
447
+
448
+ return delay_free_blocks, dict(
449
+ do_remote_prefill=True,
450
+ do_remote_decode=False,
451
+ remote_block_ids=block_ids,
452
+ remote_engine_id=self.engine_id,
453
+ remote_host=self.side_channel_host,
454
+ remote_port=self.side_channel_port,
455
+ tp_size=self.vllm_config.parallel_config.tensor_parallel_size)
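For reference, a sketch of the kv_transfer_params payload returned here on the P side (all values hypothetical); the proxy forwards it to the D instance, where NixlConnectorMetadata.add_new_req consumes the same keys.

    example_kv_transfer_params = dict(
        do_remote_prefill=True,
        do_remote_decode=False,
        remote_block_ids=[0, 1, 2, 3],          # blocks to pull from P
        remote_engine_id="prefill-engine-0",
        remote_host="10.0.0.5",
        remote_port=5600,
        tp_size=2,                              # P tensor-parallel size
    )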
456
+
457
+
458
+ class NixlConnectorWorker:
459
+ """Implementation of Worker side methods"""
460
+
461
+ def __init__(self, vllm_config: VllmConfig, engine_id: str):
462
+ if NixlWrapper is None:
463
+ logger.error("NIXL is not available")
464
+ raise RuntimeError("NIXL is not available")
465
+ logger.info("Initializing NIXL wrapper")
466
+ logger.info("Initializing NIXL worker %s", engine_id)
467
+
468
+ # Config.
469
+ self.vllm_config = vllm_config
470
+ self.block_size = vllm_config.cache_config.block_size
471
+
472
+ self.nixl_backends = \
473
+ vllm_config.kv_transfer_config.get_from_extra_config(
474
+ "backends", ["UCX"])
475
+ # Agent.
476
+ non_ucx_backends = [b for b in self.nixl_backends if b != "UCX"]
477
+ if nixl_agent_config is None:
478
+ config = None
479
+ else:
480
+ config = nixl_agent_config(backends=self.nixl_backends) if len(
481
+ non_ucx_backends) > 0 else nixl_agent_config(num_threads=8)
482
+
483
+ self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), config)
484
+ # Map of engine_id -> {rank0: agent_name0, rank1: agent_name1..}.
485
+ self._remote_agents: dict[EngineId, dict[int, str]] = defaultdict(dict)
486
+
487
+ # NIXL handshake port.
488
+ # NOTE(rob): Within a DP group, each DP rank gets its own
489
+ # base port (which is sent in the KVTransferParams).
490
+ # Each TP rank listens/queries on the base_port + tp_rank.
491
+ self.side_channel_port: int = (
492
+ envs.VLLM_NIXL_SIDE_CHANNEL_PORT +
493
+ vllm_config.parallel_config.data_parallel_rank *
494
+ vllm_config.parallel_config.tensor_parallel_size)
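To make the port layout concrete, a small sketch with assumed values (a base port of 5600, data_parallel_size=2, tensor_parallel_size=2): each DP rank gets its own base port, and each TP rank within it listens one port above the previous.

    BASE_PORT = 5600   # assumed stand-in for VLLM_NIXL_SIDE_CHANNEL_PORT
    TP_SIZE, DP_SIZE = 2, 2

    for dp_rank in range(DP_SIZE):
        base = BASE_PORT + dp_rank * TP_SIZE
        ports = [base + tp_rank for tp_rank in range(TP_SIZE)]
        print(f"DP rank {dp_rank}: side-channel ports {ports}")
    # DP rank 0: side-channel ports [5600, 5601]
    # DP rank 1: side-channel ports [5602, 5603]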
495
+
496
+ # Metadata.
497
+ self.engine_id: EngineId = engine_id
498
+ self.tp_rank = get_tensor_model_parallel_rank()
499
+ self.world_size = get_tensor_model_parallel_world_size()
500
+ self.tp_group = get_tp_group()
501
+ self.num_blocks = 0
502
+
503
+ # KV Caches and nixl tracking data.
504
+ self.device_type = current_platform.device_type
505
+ self.kv_buffer_device: str = \
506
+ vllm_config.kv_transfer_config.kv_buffer_device
507
+ if self.device_type not in _NIXL_SUPPORTED_DEVICE:
508
+ raise RuntimeError(f"{self.device_type} is not supported.")
509
+ elif self.kv_buffer_device not in _NIXL_SUPPORTED_DEVICE[
510
+ self.device_type]:
511
+ raise RuntimeError(
512
+ f"{self.device_type} with {self.kv_buffer_device} kv_buffer "
513
+ "is not supported.")
514
+ self.device_kv_caches: dict[str, torch.Tensor] = {}
515
+
516
+ # cpu kv buffer for xfer
517
+ # used when device memory cannot be registered under nixl
518
+ self.host_xfer_buffers: dict[str, torch.Tensor] = {}
519
+ self.use_host_buffer = self.kv_buffer_device == "cpu"
520
+ # Support for out-of-tree platforms that can't register the nixl memory
521
+ # type based on kv_buffer_device
522
+ self.nixl_memory_type = current_platform.get_nixl_memory_type()
523
+ if self.nixl_memory_type is None:
524
+ if self.kv_buffer_device == "cuda":
525
+ self.nixl_memory_type = "VRAM"
526
+ elif self.kv_buffer_device == "cpu":
527
+ self.nixl_memory_type = "DRAM"
528
+ if self.nixl_memory_type is None:
529
+ raise RuntimeError(
530
+ f"{self.device_type} with {self.kv_buffer_device} kv_buffer "
531
+ "is not supported.")
532
+
533
+ # Note: host xfer buffer ops when use_host_buffer is True
534
+ self.copy_blocks: Optional[CopyBlocksOp] = None
535
+
536
+ # Map of engine_id -> kv_caches_base_addr. For TP case, each local
537
+ # rank will still only pull from a single remote TP worker.
538
+ self.kv_caches_base_addr: dict[EngineId, list[int]] = {}
539
+
540
+ # Number of NIXL regions. Currently one region per cache
541
+ # (so 1 per layer for MLA, otherwise 2 per layer)
542
+ self.num_regions = 0
543
+ self.num_layers = 0
544
+
545
+ # nixl_prepped_dlist_handle.
546
+ self.src_xfer_side_handle: int = 0
547
+ # Map of engine_id -> nixl_prepped_dlist_handle (int)].
548
+ self.dst_xfer_side_handles: dict[EngineId, int] = {}
549
+
550
+ # Map of engine_id -> num_blocks. All ranks in the same deployment will
551
+ # have the same number of blocks.
552
+ self.dst_num_blocks: dict[EngineId, int] = {}
553
+ self._registered_descs: list[Any] = []
554
+
555
+ # In progress transfers.
556
+ # [req_id -> list[handle]]
557
+ self._recving_metadata: dict[ReqId, ReqMeta] = {}
558
+ self._recving_transfers = defaultdict[ReqId, list[Transfer]](list)
559
+ # Track the expiration time of requests that are waiting to be sent.
560
+ self._reqs_to_send: dict[ReqId, float] = {}
561
+ # Set of requests that have been part of a batch, regardless of status.
562
+ self._reqs_to_process: set[ReqId] = set()
563
+
564
+ # Background thread for handling new handshake requests.
565
+ self._nixl_handshake_listener_t: Optional[threading.Thread] = None
566
+ # Background thread for initializing new NIXL handshakes.
567
+ self._handshake_initiation_executor = ThreadPoolExecutor(
568
+ # NIXL is not guaranteed to be thread-safe, limit 1 worker.
569
+ max_workers=1,
570
+ thread_name_prefix="vllm-nixl-handshake-initiator")
571
+ self._ready_requests = queue.Queue[tuple[ReqId, ReqMeta]]()
572
+ self._handshake_futures: dict[EngineId, Future[dict[int, str]]] = {}
573
+ # Protects _handshake_futures and _remote_agents.
574
+ self._handshake_lock = threading.RLock()
575
+
576
+ self.vllm_config = vllm_config
577
+ self.block_size = vllm_config.cache_config.block_size
578
+ self.model_config = vllm_config.model_config
579
+ self.cache_config = vllm_config.cache_config
580
+
581
+ # TODO(mgoin): remove this once we have hybrid memory allocator
582
+ # Optimization for models with local attention (Llama 4)
583
+ # List of block window sizes for each layer for local attention
584
+ self.block_window_per_layer: list[Optional[int]] = []
585
+ self.use_mla = self.model_config.use_mla
586
+
587
+ backend = get_attn_backend(self.model_config.get_head_size(),
588
+ self.model_config.dtype,
589
+ self.cache_config.cache_dtype,
590
+ self.block_size,
591
+ use_mla=self.use_mla)
592
+ self.backend_name = backend.get_name()
593
+ attn_backend = backend_name_to_enum(self.backend_name)
594
+ self._use_flashinfer = attn_backend == _Backend.FLASHINFER
595
+ self._use_pallas = attn_backend == _Backend.PALLAS
596
+ self.kv_cache_layout = get_kv_cache_layout()
597
+ logger.debug("Detected attention backend %s", self.backend_name)
598
+ logger.debug("Detected kv cache layout %s", self.kv_cache_layout)
599
+
600
+ self._tp_size: dict[EngineId, int] = {self.engine_id: self.world_size}
601
+ # With heterogeneous TP, P must wait for all assigned D TP workers to
602
+ # finish reading before safely freeing the blocks.
603
+ self.consumer_notification_counts_by_req = defaultdict[ReqId, int](int)
604
+ self.xfer_stats = NixlKVConnectorStats()
605
+
606
+ @staticmethod
607
+ def _nixl_handshake_listener(metadata: NixlAgentMetadata,
608
+ ready_event: threading.Event, base_port: int,
609
+ tp_rank: int):
610
+ """Background thread for getting new NIXL handshakes."""
611
+ # NOTE(rob): this is a simple implementation. We will move
612
+ # to a better approach via HTTP endpoint soon.
613
+
614
+ encoder = msgspec.msgpack.Encoder()
615
+ encoded_data = encoder.encode(metadata)
616
+ size_in_bytes = len(encoded_data)
617
+ logger.debug("Size of encoded NixlAgentMetadata: %s bytes",
618
+ str(size_in_bytes))
619
+
620
+ # Listen for new requests for metadata.
621
+ host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST
622
+ path = make_zmq_path("tcp", host, base_port + tp_rank)
623
+ logger.debug("Starting listening on path: %s", path)
624
+ with zmq_ctx(zmq.ROUTER, path) as sock:
625
+ ready_event.set()
626
+ while True:
627
+ identity, _, msg = sock.recv_multipart()
628
+ if msg != GET_META_MSG:
629
+ logger.warning(
630
+ "Connection listener got unexpected message %s", msg)
631
+ sock.send_multipart((identity, b"", encoded_data))
632
+
633
+ def _nixl_handshake(
634
+ self,
635
+ host: str,
636
+ port: int,
637
+ remote_tp_size: int,
638
+ expected_engine_id: str,
639
+ ) -> dict[int, str]:
640
+ """Do a NIXL handshake with a remote instance."""
641
+
642
+ start_time = time.perf_counter()
643
+
644
+ # NOTE(rob): we need each rank to have a unique port. This is
645
+ # a hack to keep us moving. We will switch when moving to etcd
646
+ # or where we have a single ZMQ socket in the scheduler.
647
+
648
+ # Handshake only with the remote TP rank that current local rank will
649
+ # pull from. With homogeneous TP it happens to be the same rank_i.
650
+ tp_ratio = self._tp_size[self.engine_id] // remote_tp_size
651
+ p_remote_rank = self.tp_rank // tp_ratio
652
+ path = make_zmq_path("tcp", host, port + p_remote_rank)
653
+ logger.debug("Querying metadata on path: %s at remote rank %s", path,
654
+ p_remote_rank)
655
+
656
+ # Send query for the request.
657
+ with zmq_ctx(zmq.REQ, path) as sock:
658
+ sock.send(GET_META_MSG)
659
+ metadata_bytes = sock.recv()
660
+ decoder = msgspec.msgpack.Decoder(NixlAgentMetadata)
661
+ metadata = decoder.decode(metadata_bytes)
662
+ got_metadata_time = time.perf_counter()
663
+ logger.debug("NIXL handshake: get metadata took: %s",
664
+ got_metadata_time - start_time)
665
+
666
+ # Ensure engine id matches.
667
+ if metadata.engine_id != expected_engine_id:
668
+ raise RuntimeError(f"Remote NIXL agent engine ID mismatch. "
669
+ f"Expected {expected_engine_id},"
670
+ f"received {metadata.engine_id}.")
671
+
672
+ # Register Remote agent.
673
+ remote_agent_name = self.add_remote_agent(metadata, p_remote_rank,
674
+ remote_tp_size)
675
+ setup_agent_time = time.perf_counter()
676
+ logger.debug("NIXL handshake: add agent took: %s",
677
+ setup_agent_time - got_metadata_time)
678
+
679
+ # Remote rank -> agent name.
680
+ return {p_remote_rank: remote_agent_name}
681
+
682
+ def initialize_host_xfer_buffer(
683
+ self, kv_caches: dict[str, torch.Tensor]) -> None:
684
+ """
685
+ Initialize transfer buffer in CPU mem for accelerators
686
+ NOT directly supported by NIXL (e.g., tpu)
687
+ """
688
+ xfer_buffers: dict[str, torch.Tensor] = {}
689
+ try:
690
+ for layer_name, kv_cache in kv_caches.items():
691
+ kv_shape = kv_cache.shape
692
+ kv_dtype = kv_cache.dtype
693
+ xfer_buffers[layer_name] = torch.empty(kv_shape,
694
+ dtype=kv_dtype,
695
+ device="cpu")
696
+ except MemoryError as e:
697
+ logger.error("NIXLConnectorWorker gets %s.", e)
698
+ raise
699
+
700
+ self.host_xfer_buffers = xfer_buffers
701
+
702
+ def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp):
703
+ """Assign copy (d2h, h2d) operations when host buffer is used."""
704
+ assert self.use_host_buffer
705
+ self.copy_blocks = copy_operation
706
+
707
+ def _background_nixl_handshake(self, req_id: str,
708
+ remote_engine_id: EngineId, meta: ReqMeta):
709
+ # Do NIXL handshake in background and add to _ready_requests when done.
710
+ fut = self._handshake_futures.get(remote_engine_id)
711
+ if fut is None:
712
+ fut = self._handshake_initiation_executor.submit(
713
+ self._nixl_handshake, meta.remote_host, meta.remote_port,
714
+ meta.tp_size, remote_engine_id)
715
+ self._handshake_futures[remote_engine_id] = fut
716
+
717
+ def done_callback(f: Future[dict[int, str]], eid=remote_engine_id):
718
+ with self._handshake_lock:
719
+ del self._handshake_futures[eid]
720
+ try:
721
+ self._remote_agents[eid] = f.result()
722
+ except Exception:
723
+ logger.exception("Handshake with %s failed", eid)
724
+
725
+ fut.add_done_callback(done_callback)
726
+
727
+ # TODO: handle failure state of future in the
728
+ # callback, we want to fail the request in this case.
729
+ def request_ready(_f: Future[Any], entry=(req_id, meta)):
730
+ self._ready_requests.put(entry)
731
+
732
+ fut.add_done_callback(request_ready)
733
+
734
+ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
735
+ """Register the KV Cache data in nixl."""
736
+
737
+ if self.use_host_buffer:
738
+ self.initialize_host_xfer_buffer(kv_caches=kv_caches)
739
+ assert len(self.host_xfer_buffers) == len(kv_caches), (
740
+ f"host_buffer: {len(self.host_xfer_buffers)}, "
741
+ f"kv_caches: {len(kv_caches)}")
742
+ xfer_buffers = self.host_xfer_buffers
743
+ else:
744
+ xfer_buffers = kv_caches
745
+ assert not self.host_xfer_buffers, (
746
+ "host_xfer_buffer should not be initialized when "
747
+ f"kv_buffer_device is {self.kv_buffer_device}")
748
+
749
+ logger.info(
750
+ "Registering KV_Caches. use_mla: %s, kv_buffer_device: %s, "
751
+ "use_host_buffer: %s", self.use_mla, self.kv_buffer_device,
752
+ self.use_host_buffer)
753
+
754
+ caches_data = []
755
+ # With hybrid allocator, layers can share a kv cache tensor
756
+ seen_base_addresses = []
757
+
758
+ # Note(tms): I modified this from the original region setup code.
759
+ # K and V are now in different regions. Advantage is that we can
760
+ # elegantly support MLA and any cases where the K and V tensors
761
+ # are non-contiguous (it's not locally guaranteed that they will be)
762
+ # Disadvantage is that the encoded NixlAgentMetadata is now larger
763
+ # (roughly 8KB vs 5KB).
764
+ # Conversely for FlashInfer, K and V are registered in the same region
765
+ # to better exploit the memory layout (ie num_blocks is the first dim).
766
+ split_k_and_v = not (self.use_mla or self._use_pallas
767
+ or self._use_flashinfer)
768
+ tensor_size_bytes = None
769
+ # Enable different block lengths for different layers when MLA is used.
770
+ self.block_len_per_layer = list[int]()
771
+ self.slot_size_per_layer = list[int]() # HD bytes in kv terms
772
+ for layer_name, cache_or_caches in xfer_buffers.items():
773
+ cache_list = cache_or_caches if split_k_and_v else [
774
+ cache_or_caches
775
+ ]
776
+
777
+ for cache in cache_list:
778
+ base_addr = cache.data_ptr()
779
+ if base_addr in seen_base_addresses:
780
+ continue
781
+
782
+ seen_base_addresses.append(base_addr)
783
+ curr_tensor_size_bytes = cache.numel() * cache.element_size()
784
+
785
+ if tensor_size_bytes is None:
786
+ tensor_size_bytes = curr_tensor_size_bytes
787
+ self.num_blocks = cache.shape[0]
788
+
789
+ assert cache.shape[0] == self.num_blocks, \
790
+ "All kv cache tensors must have the same number of blocks"
791
+
792
+ self.block_len_per_layer.append(curr_tensor_size_bytes //
793
+ self.num_blocks)
794
+ self.slot_size_per_layer.append(self.block_len_per_layer[-1] //
795
+ self.block_size)
796
+
797
+ if not self.use_mla:
798
+ # Different kv cache shape is not supported by HeteroTP
799
+ assert tensor_size_bytes == curr_tensor_size_bytes, \
800
+ "All kv cache tensors must have the same size"
801
+ caches_data.append(
802
+ (base_addr, curr_tensor_size_bytes, self.tp_rank, ""))
803
+
804
+ logger.debug("Different block lengths collected: %s",
805
+ set(self.block_len_per_layer))
806
+ assert len(self.block_len_per_layer) == len(seen_base_addresses)
807
+ assert self.num_blocks != 0
808
+
809
+ self.kv_caches_base_addr[self.engine_id] = seen_base_addresses
810
+ self.num_regions = len(caches_data)
811
+ self.num_layers = len(xfer_buffers.keys())
812
+
813
+ descs = self.nixl_wrapper.get_reg_descs(caches_data,
814
+ self.nixl_memory_type)
815
+ logger.debug("Registering descs: %s", caches_data)
816
+ self.nixl_wrapper.register_memory(descs, backends=self.nixl_backends)
817
+ logger.debug("Done registering descs")
818
+ self._registered_descs.append(descs)
819
+
820
+ self.device_kv_caches = kv_caches
821
+ self.dst_num_blocks[self.engine_id] = self.num_blocks
822
+ if self._use_flashinfer:
823
+ for i in range(len(self.slot_size_per_layer)):
824
+ assert self.slot_size_per_layer[i] % 2 == 0
825
+ self.slot_size_per_layer[i] //= 2
826
+
827
+ # NOTE (NickLucche) When FlashInfer is used, memory is registered
828
+ # with joint KV for each block. This minimizes the overhead in
829
+ # registerMem allowing faster descs queries. In order to be able to
830
+ # split on kv_heads dim as required by heterogeneous TP, one must
831
+ # be able to index K/V separately. Hence we double the number
832
+ # of 'virtual' regions here and halve `block_len` below.
833
+ self.num_regions *= 2
834
+
835
+ # Register local/src descr for NIXL xfer.
836
+ blocks_data = []
837
+ for i, base_addr in enumerate(seen_base_addresses):
838
+ kv_block_len = self.get_backend_aware_kv_block_len(layer_idx=i)
839
+ # NOTE With heter-TP, more blocks are prepared than what are
840
+ # needed as self.num_blocks >= nixl_agent_meta.num_blocks. We
841
+ # could create fewer, but then _get_block_descs_ids needs to
842
+ # select agent_meta.num_blocks instead of self.num_blocks for
843
+ # local descr, and that makes handling regular flow less clean.
844
+ for block_id in range(self.num_blocks):
845
+ block_offset = block_id * self.block_len_per_layer[i]
846
+ addr = base_addr + block_offset
847
+ # (addr, len, device id)
848
+ blocks_data.append((addr, kv_block_len, self.tp_rank))
849
+
850
+ if self._use_flashinfer:
851
+ # Separate and interleave K/V regions to maintain the same
852
+ # descs ordering. This is needed for selecting contiguous heads
853
+ # when split across TP ranks.
854
+ for block_id in range(self.num_blocks):
855
+ block_offset = block_id * self.block_len_per_layer[i]
856
+ addr = base_addr + block_offset
857
+ # Register addresses for V cache (K registered first).
858
+ v_addr = addr + kv_block_len
859
+ blocks_data.append((v_addr, kv_block_len, self.tp_rank))
860
+ logger.debug("Created %s blocks for src engine %s and rank %s",
861
+ len(blocks_data), self.engine_id, self.tp_rank)
862
+
863
+ descs = self.nixl_wrapper.get_xfer_descs(blocks_data,
864
+ self.nixl_memory_type)
865
+ # NIXL_INIT_AGENT to be used for preparations of local descs.
866
+ self.src_xfer_side_handle = self.nixl_wrapper.prep_xfer_dlist(
867
+ "NIXL_INIT_AGENT", descs)
868
+
869
+ # TODO(mgoin): Hybrid memory allocator is currently disabled for
870
+ # models with local attention (Llama 4). Can remove this once enabled.
871
+ if self.vllm_config.model_config.hf_config.model_type == "llama4":
872
+ from transformers import Llama4TextConfig
873
+ assert isinstance(self.vllm_config.model_config.hf_text_config,
874
+ Llama4TextConfig)
875
+ llama4_config = self.vllm_config.model_config.hf_text_config
876
+ no_rope_layers = llama4_config.no_rope_layers
877
+ chunk_size = llama4_config.attention_chunk_size
878
+ chunk_block_size = math.ceil(chunk_size / self.block_size)
879
+ for layer_idx in range(self.num_layers):
880
+ # no_rope_layers[layer_idx] == 0 means NoPE (global)
881
+ # Any other value means RoPE (local chunked)
882
+ is_local_attention = no_rope_layers[layer_idx] != 0
883
+ block_window = chunk_block_size if is_local_attention else None
884
+ self.block_window_per_layer.append(block_window)
885
+ logger.debug("Llama 4 block window per layer mapping: %s",
886
+ self.block_window_per_layer)
887
+ assert len(self.block_window_per_layer) == self.num_layers
888
+
889
+ # After KV Caches registered, listen for new connections.
890
+ metadata = NixlAgentMetadata(
891
+ engine_id=self.engine_id,
892
+ agent_metadata=self.nixl_wrapper.get_agent_metadata(),
893
+ kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id],
894
+ num_blocks=self.num_blocks,
895
+ block_lens=self.block_len_per_layer,
896
+ attn_backend_name=self.backend_name,
897
+ kv_cache_layout=self.kv_cache_layout)
898
+ ready_event = threading.Event()
899
+ self._nixl_handshake_listener_t = threading.Thread(
900
+ target=self._nixl_handshake_listener,
901
+ args=(metadata, ready_event, self.side_channel_port, self.tp_rank),
902
+ daemon=True,
903
+ name="nixl_handshake_listener")
904
+ self._nixl_handshake_listener_t.start()
905
+ ready_event.wait() # Wait for listener ZMQ socket to be ready.
906
+
907
+ def add_remote_agent(self,
908
+ nixl_agent_meta: NixlAgentMetadata,
909
+ remote_tp_rank: int = 0,
910
+ remote_tp_size: int = 1) -> str:
911
+ """
912
+ Add the remote NIXL agent and prepare the descriptors for reading cache
913
+ blocks from remote.
914
+
915
+ In particular, handle both homogeneous and heterogeneous TP. The former
916
+ requires local rank_i to read from remote rank_i.
917
+ The latter, assuming D.world_size > P.world_size, requires that two or
918
+ more local TP worker share the xfer from a single TP worker.
919
+
920
+ Here's an example (non-MLA case):
921
+
922
+ rank_offset p_remote_tp_rank
923
+ (kv split no)
924
+ --------------------------------
925
+ 0 0 Worker0 ---- 1st half of KV ----> Worker0 [ KV Cache ]
926
+ /
927
+ 1 0 Worker1 ---- 2nd half of KV -----/
928
+
929
+ 0 1 Worker2 ---- 1st half of KV ----> Worker1 [ KV Cache ]
930
+ /
931
+ 1 1 Worker3 ---- 2nd half of KV -----/
932
+
933
+
934
+ Decoder TP workers Prefix TP workers
935
+ (world_size=4) (world_size=2)
936
+ tp_ratio = 4 // 2 = 2
937
+
938
+ Considering the KV Caches, if P-Worker_i has cache size [2, num_blocksP, kv_heads, block_size, head_dim]
939
+ then D-Worker_j has [2, num_blocksD, kv_heads//tp_ratio, block_size, head_dim]. Mind the "HND" layout format.
940
+ Assuming num_blocksD >= num_blocksP, D-Worker0 reads from P-Worker0 by preparing the kv_heads//tp_ratio
941
+ first heads from all the slots of all the blocks. D-Worker1 will do the same, but reading the second split
942
+ along the kv_heads dimension, and so forth until "tp_ratio" D TP workers have pulled from P-Worker0.
943
+
944
+ Note that the above will also hold true for the homogeneous TP case, where tp_ratio evaluates to 1.
945
+
946
+ Regarding MLA case, the cache is replicated across TP workers so the rank_offset will just always be 0
947
+ so that the whole cache is shared by "tp_ratio" D TP workers.
948
+ """ # noqa: E501
949
+ engine_id = nixl_agent_meta.engine_id
950
+ # TODO re-evaluate refreshing for scaling/recovery
951
+ if remote_tp_rank in self._remote_agents.get(engine_id, {}):
952
+ return self._remote_agents[engine_id][remote_tp_rank]
953
+
954
+ if engine_id not in self._tp_size:
955
+ self._tp_size[engine_id] = remote_tp_size
956
+ else:
957
+ assert self._tp_size[engine_id] == remote_tp_size
958
+ # TODO We may eventually want to skip enforcing the same attn backend.
959
+ assert nixl_agent_meta.attn_backend_name == self.backend_name
960
+
961
+ remote_agent_name = self.nixl_wrapper.add_remote_agent(
962
+ nixl_agent_meta.agent_metadata)
963
+
964
+ # Number of D TP workers reading from a single P TP worker. This is
965
+ # 1 when P and D `--tensor-parallel-size` match.
966
+ tp_ratio = divide(self._tp_size[self.engine_id],
967
+ self._tp_size[engine_id])
968
+ assert tp_ratio > 0, "Decode TP cannot be smaller than prefill TP"
969
+ assert not self._use_pallas or tp_ratio == 1, \
970
+ "TPU (pallas_v1) DOES NOT support heterogeneous TP yet."
971
+
972
+ # Handle tp_size>num_kv_heads: replicate KV cache.
973
+ total_num_kv_heads = self.model_config.get_total_num_kv_heads()
974
+ is_kv_replicated = self._tp_size[engine_id] // total_num_kv_heads >= 1
975
+
976
+ remote_block_len = nixl_agent_meta.block_lens[0]
977
+ if self.use_mla or is_kv_replicated:
978
+ # With replicated KV cache, only the number of blocks can differ.
979
+ assert self.block_len_per_layer == nixl_agent_meta.block_lens, \
980
+ "KV cache sizes must match between P and D when replicated"
981
+ remote_block_size = remote_block_len // (
982
+ self.slot_size_per_layer[0])
983
+ else:
984
+ # When MLA is not used, all layers share the same block length
985
+ for block_len in nixl_agent_meta.block_lens:
986
+ assert block_len == remote_block_len, \
987
+ "All remote layers must have the same block size"
988
+ remote_block_size = remote_block_len // (
989
+ self.slot_size_per_layer[0] * tp_ratio)
990
+ if self._use_flashinfer:
991
+ # With flashinfer, KV are sent in the same message.
992
+ remote_block_size //= 2
993
+ if tp_ratio > 1:
994
+ # Heterogeneous TP expects same kv_cache_layout.
995
+ assert nixl_agent_meta.kv_cache_layout == self.kv_cache_layout
996
+ if self.device_type == "xpu":
997
+ raise ValueError(
998
+ "Heterogeneous TP is not supported on XPU")
999
+
1000
+ assert remote_block_len == self.block_len_per_layer[0] * tp_ratio, (
1001
+ "Remote P worker KV layer cache must be of shape [2, N, "
1002
+ "local_kv_heads*tp_ratio, block_size, head_dim] and same dtype."
1003
+ )
1004
+
1005
+ assert self.block_size == remote_block_size, (
1006
+ "Remote P worker with different page/block size is not supported "
1007
+ f"{self.block_size=}, {remote_block_size=}")
1008
+
1009
+ # Create dst descs and xfer side handles. TP workers have same #blocks.
1010
+ if engine_id in self.dst_num_blocks:
1011
+ assert self.dst_num_blocks[engine_id] == nixl_agent_meta.num_blocks
1012
+ else:
1013
+ self.dst_num_blocks[engine_id] = nixl_agent_meta.num_blocks
1014
+
1015
+ blocks_data = []
1016
+ # With homogeneous TP, D pulls the whole kv cache from corresponding
1017
+ # rank. With heterogeneous TP, prepare the descriptors by splitting the
1018
+ # P KV cache along kv_head dim, of D worker's kv_head size (D>P).
1019
+ # Eg. PTP1 DTP2 => P0 KV:[block0-KV_0 | block0-KV_1..].
1020
+ self.kv_caches_base_addr[
1021
+ engine_id] = nixl_agent_meta.kv_caches_base_addr
1022
+
1023
+ assert len(nixl_agent_meta.kv_caches_base_addr) == len(
1024
+ self.block_len_per_layer)
1025
+ # Register all remote blocks, but only the corresponding kv heads.
1026
+ for i, base_addr in enumerate(nixl_agent_meta.kv_caches_base_addr):
1027
+ kv_block_len = self.get_backend_aware_kv_block_len(layer_idx=i)
1028
+ rank_offset = self.tp_rank % tp_ratio * kv_block_len \
1029
+ if not (self.use_mla or is_kv_replicated) else 0
1030
+ for block_id in range(nixl_agent_meta.num_blocks):
1031
+ block_offset = block_id * nixl_agent_meta.block_lens[i]
1032
+ # For each block, grab the heads chunk belonging to rank_i
1033
+ # of size remote_nheads // tp_ratio, which correspond to
1034
+ # self.block_len == remote_block_len//tp_ratio bytes.
1035
+ addr = base_addr + block_offset + rank_offset
1036
+ # (addr, len, device id)
1037
+ blocks_data.append((addr, kv_block_len, remote_tp_rank))
1038
+
1039
+ if self._use_flashinfer:
1040
+ # With FlashInfer index V separately to allow head splitting.
1041
+ for block_id in range(nixl_agent_meta.num_blocks):
1042
+ block_offset = block_id * nixl_agent_meta.block_lens[i]
1043
+ addr = base_addr + block_offset + rank_offset
1044
+ v_addr = addr + nixl_agent_meta.block_lens[i] // 2
1045
+ blocks_data.append((v_addr, kv_block_len, remote_tp_rank))
1046
+
1047
+ logger.debug(
1048
+ "Created %s blocks for dst engine %s with remote rank %s and "
1049
+ "local rank %s", len(blocks_data), engine_id, remote_tp_rank,
1050
+ self.tp_rank)
1051
+
1052
+ # Register with NIXL.
1053
+ descs = self.nixl_wrapper.get_xfer_descs(blocks_data,
1054
+ self.nixl_memory_type)
1055
+ self.dst_xfer_side_handles[
1056
+ engine_id] = self.nixl_wrapper.prep_xfer_dlist(
1057
+ remote_agent_name, descs)
1058
+
1059
+ return remote_agent_name
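A worked sketch of the heterogeneous-TP geometry described in the docstring above, with assumed sizes (D tensor_parallel_size=4, P tensor_parallel_size=2, so tp_ratio=2): each D rank reads a kv_heads//tp_ratio slice of every remote block, offset by its position within the split. The helper below is hypothetical and only illustrates the address arithmetic.

    def dst_block_addrs(base_addr: int, remote_block_len: int, num_blocks: int,
                        tp_rank: int, tp_ratio: int) -> list[int]:
        # Hypothetical helper: one D rank's view of a single remote layer
        # region ("HND" layout, non-MLA). Each block contributes a chunk of
        # remote_block_len // tp_ratio bytes, offset by the head split this
        # rank is responsible for (the rank_offset computed above).
        kv_block_len = remote_block_len // tp_ratio
        rank_offset = (tp_rank % tp_ratio) * kv_block_len
        return [base_addr + b * remote_block_len + rank_offset
                for b in range(num_blocks)]

    # D rank 1 (second half of the heads) over 3 remote blocks of 4096 bytes:
    print(dst_block_addrs(0x1000, 4096, 3, tp_rank=1, tp_ratio=2))
    # [6144, 10240, 14336]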
1060
+
1061
+ def sync_recved_kv_to_device(self, req_id: str, meta: ReqMeta):
1062
+ """copy recved kv from host buffer to device."""
1063
+ assert self.use_host_buffer
1064
+ assert self.copy_blocks is not None
1065
+
1066
+ local_block_ids = meta.local_block_ids
1067
+ self.copy_blocks(self.host_xfer_buffers, self.device_kv_caches,
1068
+ local_block_ids, local_block_ids, "h2d")
1069
+ if logger.isEnabledFor(logging.DEBUG):
1070
+ logger.debug(
1071
+ "synced recved kv of request[%s] to device kv buffer,"
1072
+ "local_block_ids: %s. ", req_id,
1073
+ ",".join(map(str, meta.local_block_ids)))
1074
+
1075
+ def save_kv_to_host(self, metadata: NixlConnectorMetadata):
1076
+ """copy kv from device to host buffer."""
1077
+ assert self.use_host_buffer
1078
+ assert self.copy_blocks is not None
1079
+
1080
+ for req_id, meta in metadata.reqs_to_save.items():
1081
+ if logger.isEnabledFor(logging.DEBUG):
1082
+ logger.debug(
1083
+ "save_load_kv for request[%s] to host xfer buffer."
1084
+ "local_block_ids: %s. ", req_id,
1085
+ ",".join(map(str, meta.local_block_ids)))
1086
+ # blocking
1087
+ self.copy_blocks(self.device_kv_caches, self.host_xfer_buffers,
1088
+ meta.local_block_ids, meta.local_block_ids, "d2h")
1089
+
1090
+ def get_finished(self) -> tuple[set[str], set[str]]:
1091
+ """
1092
+ Get requests that are done sending or recving on this specific worker.
1093
+ The scheduler process (via the MultiprocExecutor) will use this output
1094
+ to track which workers are done.
1095
+ """
1096
+ done_sending = self._get_new_notifs()
1097
+ done_recving = self._pop_done_transfers(self._recving_transfers)
1098
+ if len(done_sending) > 0 or len(done_recving) > 0:
1099
+ logger.debug(
1100
+ "Rank %s, get_finished: %s requests done sending "
1101
+ "and %s requests done recving", self.tp_rank,
1102
+ len(done_sending), len(done_recving))
1103
+
1104
+ if self.use_host_buffer:
1105
+ for req_id in done_recving:
1106
+ meta = self._recving_metadata.pop(req_id)
1107
+ assert meta, f"{req_id} not found in recving_metadata list"
1108
+ self.sync_recved_kv_to_device(req_id, meta)
1109
+
1110
+ # Handle timeout to avoid stranding blocks on remote.
1111
+ now = time.perf_counter()
1112
+ while self._reqs_to_send:
1113
+ req_id, expires = next(iter(self._reqs_to_send.items()))
1114
+ # Insertion-ordered dict: oldest requests come first, so we can exit early.
1115
+ if now < expires:
1116
+ break
1117
+ count = self.consumer_notification_counts_by_req.pop(req_id, 0)
1118
+ logger.warning(
1119
+ "Releasing expired KV blocks for request %s which were "
1120
+ "retrieved by %d decode worker(s) within %d seconds.", req_id,
1121
+ count, envs.VLLM_NIXL_ABORT_REQUEST_TIMEOUT)
1122
+ self._reqs_to_process.remove(req_id)
1123
+ del self._reqs_to_send[req_id]
1124
+ done_sending.add(req_id)
1125
+
1126
+ return done_sending, done_recving
1127
+
1128
+ def _get_new_notifs(self) -> set[str]:
1129
+ """
1130
+ Get req_ids which got a remote xfer message. When multiple consumers
1131
+ are reading from the same producer (heterogeneous TP scenario), wait
1132
+ for all consumers to be done pulling.
1133
+ """
1134
+ notified_req_ids: set[str] = set()
1135
+ for notifs in self.nixl_wrapper.get_new_notifs().values():
1136
+ for notif in notifs:
1137
+ req_id, tp_ratio = notif.decode("utf-8").rsplit(":", 1)
1138
+ if (req_id not in self._reqs_to_send
1139
+ and req_id not in self._reqs_to_process):
1140
+ logger.error(
1141
+ "Potentially invalid KV blocks for "
1142
+ "unrecognized request %s were retrieved by "
1143
+ "a decode worker. They may have expired.", req_id)
1144
+ continue
1145
+
1146
+ self.consumer_notification_counts_by_req[req_id] += 1
1147
+ # Wait for all consumers (D) to finish reading before freeing.
1148
+ if self.consumer_notification_counts_by_req[req_id] == int(
1149
+ tp_ratio):
1150
+ notified_req_ids.add(req_id)
1151
+ del self.consumer_notification_counts_by_req[req_id]
1152
+ self._reqs_to_process.remove(req_id)
1153
+ self._reqs_to_send.pop(req_id, None)
1154
+ return notified_req_ids
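The notification wire format assumed by the loop above is a single bytes payload of the form "<req_id>:<tp_ratio>". A minimal parsing sketch with hypothetical values:

    notif = b"req-42:2"                  # sent by a D worker after its read
    req_id, tp_ratio = notif.decode("utf-8").rsplit(":", 1)
    # The producer frees the blocks only once int(tp_ratio) such notifications
    # for req_id have arrived, i.e. every consumer split has finished reading.
    assert (req_id, int(tp_ratio)) == ("req-42", 2)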
1155
+
1156
+ def _pop_done_transfers(
1157
+ self, transfers: dict[str, list[tuple[int, float]]]) -> set[str]:
1158
+ """
1159
+ Pop completed xfers by checking for DONE state.
1160
+ Args:
1161
+ transfers: dict of req_id -> list[running_xfer]
1162
+ Returns:
1163
+ set of req_ids that have all done xfers
1164
+ """
1165
+ done_req_ids: set[str] = set()
1166
+ for req_id, handles in list(transfers.items()):
1167
+ in_progress = False
1168
+ for handle, _xfer_stime in handles:
1169
+ xfer_state = self.nixl_wrapper.check_xfer_state(handle)
1170
+ if xfer_state == "DONE":
1171
+ self.nixl_wrapper.release_xfer_handle(handle)
1172
+ # TODO (NickLucche) Get from NIXL telemetry once integrated
1173
+ self.xfer_stats.record_transfer()
1174
+ elif xfer_state == "PROC":
1175
+ in_progress = True
1176
+ continue
1177
+ else:
1178
+ raise RuntimeError("Transfer failed with state %s",
1179
+ f"Transfer failed with state {xfer_state}")
1180
+ if not in_progress:
1181
+ done_req_ids.add(req_id)
1182
+ del transfers[req_id]
1183
+ return done_req_ids
1184
+
1185
+ def start_load_kv(self, metadata: NixlConnectorMetadata):
1186
+ """
1187
+ Start loading by triggering non-blocking nixl_xfer.
1188
+ We check for these transfers to complete in each step().
1189
+ """
1190
+ for req_id, meta in metadata.reqs_to_recv.items():
1191
+ remote_engine_id = meta.remote_engine_id
1192
+ logger.debug(
1193
+ "start_load_kv for request %s from remote engine %s. "
1194
+ "Num local_block_ids: %s. Num remote_block_ids: %s. ", req_id,
1195
+ remote_engine_id, len(meta.local_block_ids),
1196
+ len(meta.remote_block_ids))
1197
+ if self.use_host_buffer:
1198
+ self._recving_metadata[req_id] = meta
1199
+ if remote_engine_id not in self._remote_agents:
1200
+ # Initiate handshake with remote engine to exchange metadata.
1201
+ with self._handshake_lock:
1202
+ if remote_engine_id not in self._remote_agents:
1203
+ self._background_nixl_handshake(
1204
+ req_id, remote_engine_id, meta)
1205
+ continue
1206
+
1207
+ # Handshake already completed, start async read xfer.
1208
+ self._read_blocks_for_req(req_id, meta)
1209
+
1210
+ # Start transfers for requests whose handshakes have now finished.
1211
+ while not self._ready_requests.empty():
1212
+ self._read_blocks_for_req(*self._ready_requests.get_nowait())
1213
+
1214
+ # Keep around the requests that have been part of a batch. This is
1215
+ # needed because async scheduling widens the gap between the
1216
+ # moment at which a request's expiration is set (P side) and the moment at
1217
+ # which blocks are read from D. As P can now more easily lag behind D
1218
+ # while processing the next batch, we make sure to only set an
1219
+ # expiration for requests that have not been read from D yet.
1220
+ for req_id in metadata.reqs_in_batch:
1221
+ self._reqs_to_process.add(req_id)
1222
+
1223
+ # Add to requests that are waiting to be read and track expiration.
1224
+ for req_id, expiration_time in metadata.reqs_to_send.items():
1225
+ if req_id in self._reqs_to_process:
1226
+ self._reqs_to_send[req_id] = expiration_time
1227
+
1228
+ def _read_blocks_for_req(self, req_id: str, meta: ReqMeta):
1229
+ logger.debug(
1230
+ "Remote agent %s available, calling _read_blocks for req %s",
1231
+ meta.remote_engine_id, req_id)
1232
+ self._read_blocks(
1233
+ request_id=req_id,
1234
+ dst_engine_id=meta.remote_engine_id,
1235
+ local_block_ids=meta.local_block_ids,
1236
+ remote_block_ids=meta.remote_block_ids,
1237
+ )
1238
+
1239
+ def _read_blocks(self, local_block_ids: list[int],
1240
+ remote_block_ids: list[int], dst_engine_id: str,
1241
+ request_id: str):
1242
+ # NOTE(rob): having the staging blocks be on the READER side is
1243
+ # not going to work well, since we would have to rearrange tensors
1244
+ # after we detect the txn is complete (which means we cannot easily
1245
+ # make the read txn async). If we want to make "READ" happen cleanly,
1246
+ # then we will need to have the staging blocks on the remote side.
1247
+
1248
+ # NOTE(rob): according to nvidia the staging blocks are used to
1249
+        # saturate IB with heterogeneous TP sizes. We should remove the staging
+        # blocks until we are ready.
+
+        # Number of D TP workers that will read from dst P. Propagate tp_ratio
+        # on notification so that dst worker can wait before freeing blocks.
+        tp_ratio = self._tp_size[
+            self.engine_id] // self._tp_size[dst_engine_id]
+        notif_id = f"{request_id}:{tp_ratio}".encode()
+
+        # Full prefix cache hit: do not need to read remote blocks,
+        # just notify P worker that we have the blocks we need.
+        num_local_blocks = len(local_block_ids)
+        if num_local_blocks == 0:
+            remote_rank = self.tp_rank // tp_ratio
+            agent_name = self._remote_agents[dst_engine_id][remote_rank]
+            self.nixl_wrapper.send_notif(agent_name, notif_msg=notif_id)
+            return
+
+        # Partial prefix cache hit: just read uncomputed blocks.
+        num_remote_blocks = len(remote_block_ids)
+        assert num_local_blocks <= num_remote_blocks
+        if num_local_blocks < num_remote_blocks:
+            remote_block_ids = remote_block_ids[-num_local_blocks:]
+
+        # Get side handles.
+        local_xfer_side_handle = self.src_xfer_side_handle
+        remote_xfer_side_handle = self.dst_xfer_side_handles[dst_engine_id]
+
+        # NOTE (nicolo) With homogeneous TP, each TP worker loads KV from
+        # corresponding rank. With heterogeneous TP, fixing D>P, the D tp
+        # workers will issue xfers to parts of the P worker remote kv caches.
+
+        # Get descs ids.
+        local_block_descs_ids: np.ndarray
+        remote_block_descs_ids: np.ndarray
+        if not self.block_window_per_layer:
+            # Default case: assume global attention
+            remote_block_descs_ids = self._get_block_descs_ids(
+                dst_engine_id, remote_block_ids)
+            local_block_descs_ids = self._get_block_descs_ids(
+                self.engine_id, local_block_ids)
+        else:
+            # TODO(mgoin): remove this once we have hybrid memory allocator
+            # Optimization for models with local attention (Llama 4)
+            local_descs_list = []
+            remote_descs_list = []
+            for layer_idx, block_window in enumerate(
+                    self.block_window_per_layer):
+                # For each layer:
+                if block_window is None:
+                    # If not chunked, we just use the
+                    # full block lists (global attention)
+                    layer_local_block_ids = local_block_ids
+                    layer_remote_block_ids = remote_block_ids
+                else:
+                    # If chunked, get the last block_window blocks
+                    layer_local_block_ids = local_block_ids[-block_window:]
+                    layer_remote_block_ids = remote_block_ids[-block_window:]
+
+                # Get descs ids for the layer.
+                layer_local_desc_ids = self._get_block_descs_ids(
+                    self.engine_id, layer_local_block_ids, layer_idx)
+                layer_remote_desc_ids = self._get_block_descs_ids(
+                    dst_engine_id, layer_remote_block_ids, layer_idx)
+
+                local_descs_list.append(layer_local_desc_ids)
+                remote_descs_list.append(layer_remote_desc_ids)
+
+            local_block_descs_ids = np.concatenate(local_descs_list)
+            remote_block_descs_ids = np.concatenate(remote_descs_list)
+
+        assert len(local_block_descs_ids) == len(remote_block_descs_ids)
+
+        # Prepare transfer with Nixl.
+        handle = self.nixl_wrapper.make_prepped_xfer(
+            "READ",
+            local_xfer_side_handle,
+            local_block_descs_ids,
+            remote_xfer_side_handle,
+            remote_block_descs_ids,
+            notif_msg=notif_id,
+        )
+
+        # Begin async xfer.
+        self.nixl_wrapper.transfer(handle)
+
+        # Use handle to check completion in future step().
+        self._recving_transfers[request_id].append(
+            (handle, time.perf_counter()))
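For intuition, a small standalone sketch of the rank mapping used above (not part of the connector code; the TP sizes and request id are assumed): with a decode instance at TP=8 pulling from a prefill instance at TP=2, tp_ratio is 4, so decode ranks 0-3 read from prefill rank 0 and ranks 4-7 from prefill rank 1, and tp_ratio rides along in the notification so the prefill side knows how many readers to wait for before freeing blocks.

    # Illustrative only: assumed decode/prefill TP sizes.
    decode_tp, prefill_tp = 8, 2
    tp_ratio = decode_tp // prefill_tp            # -> 4
    notif_id = f"req-123:{tp_ratio}".encode()     # hypothetical request id
    for decode_rank in range(decode_tp):
        remote_rank = decode_rank // tp_ratio     # prefill rank this worker reads from
        print(decode_rank, "->", remote_rank, notif_id)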
+
+    def _get_block_descs_ids(self,
+                             engine_id: str,
+                             block_ids: list[int],
+                             layer_idx: Optional[int] = None) -> np.ndarray:
+        """
+        Get the descs ids for a set of block ids.
+        If layer_idx is provided, we use the region_ids for the given layer.
+        Otherwise, we use all regions.
+        """
+        if layer_idx is None:
+            region_ids = np.arange(self.num_regions)
+        else:
+            assert layer_idx < self.num_layers
+            if self.num_layers < self.num_regions:
+                # If we have more regions than layers, we assume that
+                # the regions are organized as [K0, V0, K1, V1, ...]
+                # and we select K_i and V_i
+                assert 2 * self.num_layers == self.num_regions
+                region_ids = np.arange(2 * layer_idx, 2 * layer_idx + 2)
+            else:
+                # Otherwise, we assume we have MLA and select i-th layer
+                assert self.num_layers == self.num_regions
+                region_ids = np.arange(layer_idx, layer_idx + 1)
+
+        num_blocks = self.dst_num_blocks[engine_id]
+
+        # Compute the desc ids for each block.
+        region_ids = region_ids[:, None]
+        block_ids = np.array(block_ids)[None, :]
+        descs_ids = region_ids * num_blocks + block_ids
+        return descs_ids.flatten()
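A worked example of the descriptor-id arithmetic above (illustrative only; the region and block counts are made up): with 4 regions, 10 blocks per region, and block ids [2, 5], broadcasting produces one flat id per (region, block) pair.

    import numpy as np

    num_regions, num_blocks = 4, 10                    # assumed sizes
    region_ids = np.arange(num_regions)[:, None]       # shape (4, 1)
    block_ids = np.array([2, 5])[None, :]              # shape (1, 2)
    descs_ids = region_ids * num_blocks + block_ids    # shape (4, 2)
    print(descs_ids.flatten())                         # [ 2  5 12 15 22 25 32 35]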
+
+    def get_backend_aware_kv_block_len(self, layer_idx: int):
+        """
+        Get the block length for one K/V element (K and V have the same size).
+
+        For FA and other backends, this is equal to the length of the whole
+        block, as K and V are in separate regions.
+        For FlashInfer, this is half the length of the whole block, as K and V
+        share the same region.
+        """
+        if self._use_flashinfer:
+            # For indexing only half (either just the K or V part).
+            block_len = self.block_len_per_layer[layer_idx] // 2
+        else:
+            block_len = self.block_len_per_layer[layer_idx]
+        return block_len
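For example (numbers assumed for illustration): if a layer's entry in block_len_per_layer is 32768 bytes, a FlashInfer layout that packs K and V into one region indexes 16384 bytes per K or V element, while backends with separate K and V regions use the full 32768.

    # Illustrative sketch with an assumed per-layer block length.
    block_len_per_layer = [32768]
    use_flashinfer = True
    kv_len = block_len_per_layer[0] // 2 if use_flashinfer else block_len_per_layer[0]
    print(kv_len)  # 16384 when K and V share one region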
+
+    def get_kv_connector_stats(self) -> Optional[KVConnectorStats]:
+        """
+        Get the KV transfer stats for the connector.
+        """
+        # Clear stats for next iteration
+        if not self.xfer_stats.is_empty():
+            return self.xfer_stats.clone_and_reset()
+        return None
+
+    def shutdown(self):
+        """Shutdown the connector worker."""
+        self._handshake_initiation_executor.shutdown(wait=False)
+        if self._nixl_handshake_listener_t is not None:
+            self._nixl_handshake_listener_t.join(timeout=0)
+            self._nixl_handshake_listener_t = None
+        for handles in self._recving_transfers.values():
+            for handle, _ in handles:
+                self.nixl_wrapper.release_xfer_handle(handle)
+        self._recving_transfers.clear()
+        if self.src_xfer_side_handle:
+            self.nixl_wrapper.release_dlist_handle(self.src_xfer_side_handle)
+            self.src_xfer_side_handle = 0
+        for dst_xfer_side_handle in self.dst_xfer_side_handles.values():
+            self.nixl_wrapper.release_dlist_handle(dst_xfer_side_handle)
+        self.dst_xfer_side_handles.clear()
+        for remote_agents in self._remote_agents.values():
+            for agent_name in remote_agents.values():
+                self.nixl_wrapper.remove_remote_agent(agent_name)
+        self._remote_agents.clear()
+        for desc in self._registered_descs:
+            self.nixl_wrapper.deregister_memory(desc)
+        self._registered_descs.clear()
+
+
+@contextlib.contextmanager
+def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]:
+    """Context manager for a ZMQ socket"""
+
+    if socket_type not in (zmq.ROUTER, zmq.REQ):
+        raise ValueError(f"Unexpected socket type: {socket_type}")
+
+    ctx: Optional[zmq.Context] = None
+    try:
+        ctx = zmq.Context()  # type: ignore[attr-defined]
+        yield make_zmq_socket(ctx=ctx,
+                              path=addr,
+                              socket_type=socket_type,
+                              bind=socket_type == zmq.ROUTER)
+    finally:
+        if ctx is not None:
+            ctx.destroy(linger=0)
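A minimal usage sketch for the helper above (the endpoint is hypothetical): the ROUTER side binds and serves handshake requests, a REQ side would connect instead, and the context is destroyed with linger=0 on exit.

    # Hypothetical endpoint; zmq_ctx binds for ROUTER and connects for REQ.
    with zmq_ctx(zmq.ROUTER, "tcp://0.0.0.0:5557") as sock:
        ...  # handle incoming handshake messages here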
+
+
+@dataclass
+class NixlKVConnectorStats(KVConnectorStats):
+    """Container for transfer performance metrics"""
+
+    def __post_init__(self):
+        if "num_successful_transfers" not in self.data:
+            self.data["num_successful_transfers"] = 0
+
+    def reset(self):
+        self.data = {"num_successful_transfers": 0}
+
+    def record_transfer(self):
+        # TODO: record actual transfer stats when available
+        self.data["num_successful_transfers"] += 1
+
+    def clone_and_reset(self) -> "NixlKVConnectorStats":
+        old = copy.copy(self)
+        self.reset()
+        return old
+
+    def is_empty(self) -> bool:
+        return self.data["num_successful_transfers"] == 0
+
+    def aggregate(self, other: KVConnectorStats) -> KVConnectorStats:
+        if not other.is_empty():
+            self.data["num_successful_transfers"] += other.data[
+                "num_successful_transfers"]
+        return self
+
+    def reduce(self) -> dict[str, Union[int, float]]:
+        # TODO: reduce stats to a single value, calculate latency/throughput
+        return {
+            "num_successful_transfers": self.data["num_successful_transfers"]
+        }
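A short sketch of how these stats compose (illustrative; the counts are invented and the snippet assumes the KVConnectorStats base dataclass exposes a data dict field): each worker counts transfers, clone_and_reset hands off a snapshot while zeroing the live counter, and per-rank snapshots can be folded together with aggregate before reduce yields the reported dict.

    # Two hypothetical per-rank stat objects.
    rank0 = NixlKVConnectorStats(data={})   # __post_init__ seeds the counter
    rank1 = NixlKVConnectorStats(data={})
    for _ in range(3):
        rank0.record_transfer()
    rank1.record_transfer()

    snapshot = rank0.clone_and_reset()      # rank0 goes back to zero
    snapshot.aggregate(rank1)
    print(snapshot.reduce())                # {'num_successful_transfers': 4}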