vllm-cpu 0.9.2.post2__cp311-cp311-manylinux_2_17_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1236) hide show
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +214 -0
  3. vllm/_custom_ops.py +1915 -0
  4. vllm/_ipex_ops.py +350 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +16 -0
  8. vllm/adapter_commons/models.py +106 -0
  9. vllm/adapter_commons/request.py +26 -0
  10. vllm/adapter_commons/utils.py +93 -0
  11. vllm/adapter_commons/worker_manager.py +39 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +45 -0
  14. vllm/assets/base.py +41 -0
  15. vllm/assets/image.py +34 -0
  16. vllm/assets/video.py +139 -0
  17. vllm/attention/__init__.py +20 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +325 -0
  20. vllm/attention/backends/blocksparse_attn.py +465 -0
  21. vllm/attention/backends/cpu_mla.py +307 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1506 -0
  23. vllm/attention/backends/flash_attn.py +1008 -0
  24. vllm/attention/backends/flashinfer.py +1107 -0
  25. vllm/attention/backends/flashmla.py +244 -0
  26. vllm/attention/backends/hpu_attn.py +318 -0
  27. vllm/attention/backends/ipex_attn.py +403 -0
  28. vllm/attention/backends/mla/__init__.py +0 -0
  29. vllm/attention/backends/mla/common.py +1391 -0
  30. vllm/attention/backends/pallas.py +356 -0
  31. vllm/attention/backends/placeholder_attn.py +400 -0
  32. vllm/attention/backends/rocm_aiter_mla.py +435 -0
  33. vllm/attention/backends/rocm_flash_attn.py +1015 -0
  34. vllm/attention/backends/torch_sdpa.py +707 -0
  35. vllm/attention/backends/triton_mla.py +115 -0
  36. vllm/attention/backends/utils.py +610 -0
  37. vllm/attention/backends/xformers.py +807 -0
  38. vllm/attention/layer.py +481 -0
  39. vllm/attention/ops/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  41. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +433 -0
  42. vllm/attention/ops/blocksparse_attention/interface.py +239 -0
  43. vllm/attention/ops/blocksparse_attention/utils.py +246 -0
  44. vllm/attention/ops/chunked_prefill_paged_decode.py +368 -0
  45. vllm/attention/ops/flashmla.py +116 -0
  46. vllm/attention/ops/hpu_paged_attn.py +88 -0
  47. vllm/attention/ops/ipex_attn.py +195 -0
  48. vllm/attention/ops/merge_attn_states.py +43 -0
  49. vllm/attention/ops/nki_flash_attn.py +903 -0
  50. vllm/attention/ops/paged_attn.py +256 -0
  51. vllm/attention/ops/pallas_kv_cache_update.py +120 -0
  52. vllm/attention/ops/prefix_prefill.py +902 -0
  53. vllm/attention/ops/rocm_aiter_mla.py +100 -0
  54. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  55. vllm/attention/ops/triton_decode_attention.py +674 -0
  56. vllm/attention/ops/triton_flash_attention.py +984 -0
  57. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  58. vllm/attention/ops/triton_unified_attention.py +738 -0
  59. vllm/attention/selector.py +214 -0
  60. vllm/attention/utils/fa_utils.py +72 -0
  61. vllm/beam_search.py +87 -0
  62. vllm/benchmarks/__init__.py +0 -0
  63. vllm/benchmarks/datasets.py +1441 -0
  64. vllm/benchmarks/endpoint_request_func.py +393 -0
  65. vllm/benchmarks/latency.py +168 -0
  66. vllm/benchmarks/serve.py +1063 -0
  67. vllm/benchmarks/throughput.py +609 -0
  68. vllm/benchmarks/utils.py +70 -0
  69. vllm/collect_env.py +820 -0
  70. vllm/compilation/__init__.py +0 -0
  71. vllm/compilation/activation_quant_fusion.py +89 -0
  72. vllm/compilation/backends.py +610 -0
  73. vllm/compilation/base_piecewise_backend.py +72 -0
  74. vllm/compilation/collective_fusion.py +127 -0
  75. vllm/compilation/compiler_interface.py +564 -0
  76. vllm/compilation/counter.py +41 -0
  77. vllm/compilation/cuda_piecewise_backend.py +218 -0
  78. vllm/compilation/decorators.py +250 -0
  79. vllm/compilation/fix_functionalization.py +191 -0
  80. vllm/compilation/fusion.py +645 -0
  81. vllm/compilation/fusion_attn.py +166 -0
  82. vllm/compilation/fx_utils.py +84 -0
  83. vllm/compilation/inductor_pass.py +115 -0
  84. vllm/compilation/monitor.py +39 -0
  85. vllm/compilation/multi_output_match.py +109 -0
  86. vllm/compilation/noop_elimination.py +165 -0
  87. vllm/compilation/pass_manager.py +82 -0
  88. vllm/compilation/sequence_parallelism.py +482 -0
  89. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  90. vllm/compilation/vllm_inductor_pass.py +70 -0
  91. vllm/compilation/wrapper.py +135 -0
  92. vllm/config.py +4913 -0
  93. vllm/connections.py +174 -0
  94. vllm/core/__init__.py +0 -0
  95. vllm/core/block/__init__.py +0 -0
  96. vllm/core/block/block_table.py +399 -0
  97. vllm/core/block/common.py +371 -0
  98. vllm/core/block/cpu_gpu_block_allocator.py +441 -0
  99. vllm/core/block/interfaces.py +319 -0
  100. vllm/core/block/naive_block.py +466 -0
  101. vllm/core/block/prefix_caching_block.py +1135 -0
  102. vllm/core/block/utils.py +28 -0
  103. vllm/core/block_manager.py +525 -0
  104. vllm/core/evictor.py +157 -0
  105. vllm/core/interfaces.py +139 -0
  106. vllm/core/placeholder_block_space_manager.py +103 -0
  107. vllm/core/scheduler.py +2126 -0
  108. vllm/device_allocator/__init__.py +0 -0
  109. vllm/device_allocator/cumem.py +281 -0
  110. vllm/distributed/__init__.py +6 -0
  111. vllm/distributed/communication_op.py +41 -0
  112. vllm/distributed/device_communicators/__init__.py +0 -0
  113. vllm/distributed/device_communicators/all2all.py +264 -0
  114. vllm/distributed/device_communicators/base_device_communicator.py +260 -0
  115. vllm/distributed/device_communicators/cpu_communicator.py +145 -0
  116. vllm/distributed/device_communicators/cuda_communicator.py +194 -0
  117. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  118. vllm/distributed/device_communicators/custom_all_reduce.py +304 -0
  119. vllm/distributed/device_communicators/custom_all_reduce_utils.py +259 -0
  120. vllm/distributed/device_communicators/hpu_communicator.py +46 -0
  121. vllm/distributed/device_communicators/neuron_communicator.py +20 -0
  122. vllm/distributed/device_communicators/pynccl.py +218 -0
  123. vllm/distributed/device_communicators/pynccl_wrapper.py +349 -0
  124. vllm/distributed/device_communicators/quick_all_reduce.py +278 -0
  125. vllm/distributed/device_communicators/shm_broadcast.py +585 -0
  126. vllm/distributed/device_communicators/tpu_communicator.py +103 -0
  127. vllm/distributed/device_communicators/xpu_communicator.py +55 -0
  128. vllm/distributed/eplb/__init__.py +8 -0
  129. vllm/distributed/eplb/eplb_state.py +432 -0
  130. vllm/distributed/eplb/rebalance_algo.py +234 -0
  131. vllm/distributed/eplb/rebalance_execute.py +307 -0
  132. vllm/distributed/kv_events.py +356 -0
  133. vllm/distributed/kv_transfer/README.md +29 -0
  134. vllm/distributed/kv_transfer/__init__.py +12 -0
  135. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  136. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  137. vllm/distributed/kv_transfer/kv_connector/base.py +128 -0
  138. vllm/distributed/kv_transfer/kv_connector/factory.py +133 -0
  139. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +99 -0
  140. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +203 -0
  141. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +329 -0
  142. vllm/distributed/kv_transfer/kv_connector/utils.py +109 -0
  143. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  144. vllm/distributed/kv_transfer/kv_connector/v1/base.py +283 -0
  145. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +167 -0
  146. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +201 -0
  147. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1103 -0
  148. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  149. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +485 -0
  150. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +533 -0
  151. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +265 -0
  152. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +389 -0
  153. vllm/distributed/kv_transfer/kv_connector_agent.py +77 -0
  154. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  155. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  156. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  157. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  158. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  159. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  160. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +290 -0
  161. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  162. vllm/distributed/kv_transfer/kv_transfer_state.py +71 -0
  163. vllm/distributed/parallel_state.py +1385 -0
  164. vllm/distributed/tpu_distributed_utils.py +178 -0
  165. vllm/distributed/utils.py +536 -0
  166. vllm/engine/__init__.py +0 -0
  167. vllm/engine/arg_utils.py +1801 -0
  168. vllm/engine/async_llm_engine.py +1200 -0
  169. vllm/engine/async_timeout.py +173 -0
  170. vllm/engine/llm_engine.py +2101 -0
  171. vllm/engine/metrics.py +629 -0
  172. vllm/engine/metrics_types.py +94 -0
  173. vllm/engine/multiprocessing/__init__.py +148 -0
  174. vllm/engine/multiprocessing/client.py +681 -0
  175. vllm/engine/multiprocessing/engine.py +460 -0
  176. vllm/engine/output_processor/__init__.py +0 -0
  177. vllm/engine/output_processor/interfaces.py +75 -0
  178. vllm/engine/output_processor/multi_step.py +216 -0
  179. vllm/engine/output_processor/single_step.py +145 -0
  180. vllm/engine/output_processor/stop_checker.py +131 -0
  181. vllm/engine/output_processor/util.py +28 -0
  182. vllm/engine/protocol.py +326 -0
  183. vllm/entrypoints/__init__.py +0 -0
  184. vllm/entrypoints/api_server.py +178 -0
  185. vllm/entrypoints/chat_utils.py +1278 -0
  186. vllm/entrypoints/cli/__init__.py +12 -0
  187. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  188. vllm/entrypoints/cli/benchmark/base.py +25 -0
  189. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  190. vllm/entrypoints/cli/benchmark/main.py +58 -0
  191. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  192. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  193. vllm/entrypoints/cli/collect_env.py +36 -0
  194. vllm/entrypoints/cli/main.py +71 -0
  195. vllm/entrypoints/cli/openai.py +201 -0
  196. vllm/entrypoints/cli/run_batch.py +69 -0
  197. vllm/entrypoints/cli/serve.py +265 -0
  198. vllm/entrypoints/cli/types.py +29 -0
  199. vllm/entrypoints/launcher.py +147 -0
  200. vllm/entrypoints/llm.py +1599 -0
  201. vllm/entrypoints/logger.py +50 -0
  202. vllm/entrypoints/openai/__init__.py +0 -0
  203. vllm/entrypoints/openai/api_server.py +1495 -0
  204. vllm/entrypoints/openai/cli_args.py +331 -0
  205. vllm/entrypoints/openai/logits_processors.py +90 -0
  206. vllm/entrypoints/openai/protocol.py +2096 -0
  207. vllm/entrypoints/openai/run_batch.py +473 -0
  208. vllm/entrypoints/openai/serving_chat.py +1258 -0
  209. vllm/entrypoints/openai/serving_classification.py +160 -0
  210. vllm/entrypoints/openai/serving_completion.py +618 -0
  211. vllm/entrypoints/openai/serving_embedding.py +201 -0
  212. vllm/entrypoints/openai/serving_engine.py +988 -0
  213. vllm/entrypoints/openai/serving_models.py +315 -0
  214. vllm/entrypoints/openai/serving_pooling.py +234 -0
  215. vllm/entrypoints/openai/serving_score.py +431 -0
  216. vllm/entrypoints/openai/serving_tokenization.py +157 -0
  217. vllm/entrypoints/openai/serving_transcription.py +132 -0
  218. vllm/entrypoints/openai/speech_to_text.py +395 -0
  219. vllm/entrypoints/openai/tool_parsers/__init__.py +25 -0
  220. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  221. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  222. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  223. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  224. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +371 -0
  225. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  226. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  227. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  228. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +267 -0
  229. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +369 -0
  230. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  231. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  232. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  233. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  234. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +466 -0
  235. vllm/entrypoints/score_utils.py +50 -0
  236. vllm/entrypoints/ssl.py +75 -0
  237. vllm/entrypoints/utils.py +262 -0
  238. vllm/env_override.py +41 -0
  239. vllm/envs.py +1029 -0
  240. vllm/executor/__init__.py +0 -0
  241. vllm/executor/executor_base.py +401 -0
  242. vllm/executor/mp_distributed_executor.py +244 -0
  243. vllm/executor/msgspec_utils.py +30 -0
  244. vllm/executor/multiproc_worker_utils.py +313 -0
  245. vllm/executor/ray_distributed_executor.py +701 -0
  246. vllm/executor/ray_utils.py +399 -0
  247. vllm/executor/uniproc_executor.py +139 -0
  248. vllm/forward_context.py +185 -0
  249. vllm/inputs/__init__.py +41 -0
  250. vllm/inputs/data.py +331 -0
  251. vllm/inputs/parse.py +151 -0
  252. vllm/inputs/preprocess.py +924 -0
  253. vllm/inputs/registry.py +245 -0
  254. vllm/jsontree.py +80 -0
  255. vllm/logger.py +212 -0
  256. vllm/logging_utils/__init__.py +8 -0
  257. vllm/logging_utils/dump_input.py +81 -0
  258. vllm/logging_utils/formatter.py +18 -0
  259. vllm/logits_process.py +119 -0
  260. vllm/lora/__init__.py +0 -0
  261. vllm/lora/fully_sharded_layers.py +355 -0
  262. vllm/lora/layers.py +1285 -0
  263. vllm/lora/lora.py +199 -0
  264. vllm/lora/models.py +818 -0
  265. vllm/lora/ops/__init__.py +0 -0
  266. vllm/lora/ops/torch_ops/__init__.py +16 -0
  267. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  268. vllm/lora/ops/triton_ops/__init__.py +12 -0
  269. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  270. vllm/lora/ops/triton_ops/lora_expand_op.py +290 -0
  271. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  272. vllm/lora/ops/triton_ops/lora_shrink_op.py +244 -0
  273. vllm/lora/ops/triton_ops/utils.py +120 -0
  274. vllm/lora/ops/xla_ops/__init__.py +7 -0
  275. vllm/lora/ops/xla_ops/lora_ops.py +145 -0
  276. vllm/lora/peft_helper.py +136 -0
  277. vllm/lora/punica_wrapper/__init__.py +10 -0
  278. vllm/lora/punica_wrapper/punica_base.py +485 -0
  279. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  280. vllm/lora/punica_wrapper/punica_gpu.py +290 -0
  281. vllm/lora/punica_wrapper/punica_hpu.py +145 -0
  282. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  283. vllm/lora/punica_wrapper/punica_tpu.py +405 -0
  284. vllm/lora/punica_wrapper/utils.py +164 -0
  285. vllm/lora/request.py +99 -0
  286. vllm/lora/resolver.py +85 -0
  287. vllm/lora/utils.py +240 -0
  288. vllm/lora/worker_manager.py +256 -0
  289. vllm/model_executor/__init__.py +16 -0
  290. vllm/model_executor/custom_op.py +208 -0
  291. vllm/model_executor/guided_decoding/__init__.py +181 -0
  292. vllm/model_executor/guided_decoding/guidance_decoding.py +63 -0
  293. vllm/model_executor/guided_decoding/guidance_logits_processors.py +104 -0
  294. vllm/model_executor/guided_decoding/guided_fields.py +41 -0
  295. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +67 -0
  296. vllm/model_executor/guided_decoding/outlines_decoding.py +155 -0
  297. vllm/model_executor/guided_decoding/outlines_logits_processors.py +284 -0
  298. vllm/model_executor/guided_decoding/utils.py +242 -0
  299. vllm/model_executor/guided_decoding/xgrammar_decoding.py +426 -0
  300. vllm/model_executor/layers/__init__.py +0 -0
  301. vllm/model_executor/layers/activation.py +420 -0
  302. vllm/model_executor/layers/fused_moe/__init__.py +78 -0
  303. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +298 -0
  304. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +140 -0
  305. vllm/model_executor/layers/fused_moe/config.py +456 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  475. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +215 -0
  476. vllm/model_executor/layers/fused_moe/cutlass_moe.py +645 -0
  477. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +250 -0
  478. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +231 -0
  479. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +183 -0
  480. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1021 -0
  481. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +234 -0
  482. vllm/model_executor/layers/fused_moe/fused_moe.py +1734 -0
  483. vllm/model_executor/layers/fused_moe/layer.py +1528 -0
  484. vllm/model_executor/layers/fused_moe/modular_kernel.py +598 -0
  485. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +224 -0
  486. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  487. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +190 -0
  488. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  489. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +233 -0
  490. vllm/model_executor/layers/fused_moe/prepare_finalize.py +66 -0
  491. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +429 -0
  492. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +136 -0
  493. vllm/model_executor/layers/fused_moe/utils.py +144 -0
  494. vllm/model_executor/layers/layernorm.py +287 -0
  495. vllm/model_executor/layers/lightning_attn.py +652 -0
  496. vllm/model_executor/layers/linear.py +1547 -0
  497. vllm/model_executor/layers/logits_processor.py +197 -0
  498. vllm/model_executor/layers/mamba/__init__.py +0 -0
  499. vllm/model_executor/layers/mamba/mamba2_metadata.py +125 -0
  500. vllm/model_executor/layers/mamba/mamba_mixer.py +245 -0
  501. vllm/model_executor/layers/mamba/mamba_mixer2.py +731 -0
  502. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  503. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +105 -0
  504. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  505. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +262 -0
  506. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +589 -0
  507. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +751 -0
  508. vllm/model_executor/layers/mamba/ops/ssd_combined.py +232 -0
  509. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +206 -0
  510. vllm/model_executor/layers/pooler.py +473 -0
  511. vllm/model_executor/layers/quantization/__init__.py +160 -0
  512. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  513. vllm/model_executor/layers/quantization/auto_round.py +310 -0
  514. vllm/model_executor/layers/quantization/awq.py +228 -0
  515. vllm/model_executor/layers/quantization/awq_marlin.py +523 -0
  516. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  517. vllm/model_executor/layers/quantization/base_config.py +164 -0
  518. vllm/model_executor/layers/quantization/bitblas.py +462 -0
  519. vllm/model_executor/layers/quantization/bitsandbytes.py +396 -0
  520. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  521. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +694 -0
  522. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1613 -0
  523. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +24 -0
  524. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +358 -0
  525. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  526. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  527. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +105 -0
  528. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +149 -0
  529. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  530. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +150 -0
  531. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  532. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  533. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  534. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  535. vllm/model_executor/layers/quantization/deepgemm.py +83 -0
  536. vllm/model_executor/layers/quantization/deepspeedfp.py +195 -0
  537. vllm/model_executor/layers/quantization/experts_int8.py +204 -0
  538. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  539. vllm/model_executor/layers/quantization/fp8.py +950 -0
  540. vllm/model_executor/layers/quantization/gguf.py +577 -0
  541. vllm/model_executor/layers/quantization/gptq.py +278 -0
  542. vllm/model_executor/layers/quantization/gptq_bitblas.py +446 -0
  543. vllm/model_executor/layers/quantization/gptq_marlin.py +679 -0
  544. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  545. vllm/model_executor/layers/quantization/hqq_marlin.py +332 -0
  546. vllm/model_executor/layers/quantization/ipex_quant.py +250 -0
  547. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  548. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +90 -0
  549. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +83 -0
  550. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  551. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +300 -0
  552. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  553. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +132 -0
  554. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +131 -0
  555. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  556. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +87 -0
  557. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +120 -0
  558. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  559. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  560. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +105 -0
  561. vllm/model_executor/layers/quantization/kv_cache.py +139 -0
  562. vllm/model_executor/layers/quantization/marlin.py +263 -0
  563. vllm/model_executor/layers/quantization/modelopt.py +747 -0
  564. vllm/model_executor/layers/quantization/moe_wna16.py +457 -0
  565. vllm/model_executor/layers/quantization/neuron_quant.py +76 -0
  566. vllm/model_executor/layers/quantization/ptpc_fp8.py +127 -0
  567. vllm/model_executor/layers/quantization/qqq.py +275 -0
  568. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  569. vllm/model_executor/layers/quantization/quark/quark.py +437 -0
  570. vllm/model_executor/layers/quantization/quark/quark_moe.py +245 -0
  571. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  572. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  573. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +126 -0
  574. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +157 -0
  575. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  576. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  577. vllm/model_executor/layers/quantization/rtn.py +289 -0
  578. vllm/model_executor/layers/quantization/schema.py +86 -0
  579. vllm/model_executor/layers/quantization/torchao.py +212 -0
  580. vllm/model_executor/layers/quantization/tpu_int8.py +121 -0
  581. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  582. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  583. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +208 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  763. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  764. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  765. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  766. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  767. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  768. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  769. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  770. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  771. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  772. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  773. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  774. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  775. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  776. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  777. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  778. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  779. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  780. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  781. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  786. vllm/model_executor/layers/quantization/utils/fp8_utils.py +653 -0
  787. vllm/model_executor/layers/quantization/utils/gptq_utils.py +95 -0
  788. vllm/model_executor/layers/quantization/utils/int8_utils.py +485 -0
  789. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  790. vllm/model_executor/layers/quantization/utils/machete_utils.py +50 -0
  791. vllm/model_executor/layers/quantization/utils/marlin_utils.py +476 -0
  792. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +283 -0
  793. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +325 -0
  794. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  795. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  796. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +126 -0
  797. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +45 -0
  798. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +146 -0
  799. vllm/model_executor/layers/quantization/utils/quant_utils.py +573 -0
  800. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +405 -0
  801. vllm/model_executor/layers/rejection_sampler.py +406 -0
  802. vllm/model_executor/layers/resampler.py +270 -0
  803. vllm/model_executor/layers/rotary_embedding.py +2025 -0
  804. vllm/model_executor/layers/sampler.py +1204 -0
  805. vllm/model_executor/layers/spec_decode_base_sampler.py +259 -0
  806. vllm/model_executor/layers/typical_acceptance_sampler.py +166 -0
  807. vllm/model_executor/layers/utils.py +116 -0
  808. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  809. vllm/model_executor/model_loader/__init__.py +77 -0
  810. vllm/model_executor/model_loader/base_loader.py +43 -0
  811. vllm/model_executor/model_loader/bitsandbytes_loader.py +613 -0
  812. vllm/model_executor/model_loader/default_loader.py +282 -0
  813. vllm/model_executor/model_loader/dummy_loader.py +27 -0
  814. vllm/model_executor/model_loader/gguf_loader.py +120 -0
  815. vllm/model_executor/model_loader/neuron.py +476 -0
  816. vllm/model_executor/model_loader/neuronx_distributed.py +685 -0
  817. vllm/model_executor/model_loader/runai_streamer_loader.py +109 -0
  818. vllm/model_executor/model_loader/sharded_state_loader.py +201 -0
  819. vllm/model_executor/model_loader/tensorizer.py +602 -0
  820. vllm/model_executor/model_loader/tensorizer_loader.py +127 -0
  821. vllm/model_executor/model_loader/tpu.py +113 -0
  822. vllm/model_executor/model_loader/utils.py +315 -0
  823. vllm/model_executor/model_loader/weight_utils.py +782 -0
  824. vllm/model_executor/models/__init__.py +30 -0
  825. vllm/model_executor/models/adapters.py +375 -0
  826. vllm/model_executor/models/aimv2.py +246 -0
  827. vllm/model_executor/models/arctic.py +559 -0
  828. vllm/model_executor/models/aria.py +670 -0
  829. vllm/model_executor/models/aya_vision.py +486 -0
  830. vllm/model_executor/models/baichuan.py +474 -0
  831. vllm/model_executor/models/bamba.py +558 -0
  832. vllm/model_executor/models/bart.py +938 -0
  833. vllm/model_executor/models/bert.py +513 -0
  834. vllm/model_executor/models/bert_with_rope.py +617 -0
  835. vllm/model_executor/models/blip.py +339 -0
  836. vllm/model_executor/models/blip2.py +728 -0
  837. vllm/model_executor/models/bloom.py +373 -0
  838. vllm/model_executor/models/chameleon.py +1146 -0
  839. vllm/model_executor/models/chatglm.py +478 -0
  840. vllm/model_executor/models/clip.py +407 -0
  841. vllm/model_executor/models/commandr.py +471 -0
  842. vllm/model_executor/models/config.py +200 -0
  843. vllm/model_executor/models/constant_size_cache.py +137 -0
  844. vllm/model_executor/models/dbrx.py +472 -0
  845. vllm/model_executor/models/deepseek.py +486 -0
  846. vllm/model_executor/models/deepseek_mtp.py +281 -0
  847. vllm/model_executor/models/deepseek_v2.py +935 -0
  848. vllm/model_executor/models/deepseek_vl2.py +660 -0
  849. vllm/model_executor/models/dots1.py +536 -0
  850. vllm/model_executor/models/eagle.py +261 -0
  851. vllm/model_executor/models/ernie45.py +43 -0
  852. vllm/model_executor/models/ernie45_moe.py +583 -0
  853. vllm/model_executor/models/exaone.py +551 -0
  854. vllm/model_executor/models/fairseq2_llama.py +154 -0
  855. vllm/model_executor/models/falcon.py +510 -0
  856. vllm/model_executor/models/falcon_h1.py +708 -0
  857. vllm/model_executor/models/florence2.py +1113 -0
  858. vllm/model_executor/models/fuyu.py +406 -0
  859. vllm/model_executor/models/gemma.py +427 -0
  860. vllm/model_executor/models/gemma2.py +427 -0
  861. vllm/model_executor/models/gemma3.py +535 -0
  862. vllm/model_executor/models/gemma3_mm.py +729 -0
  863. vllm/model_executor/models/gemma3n.py +811 -0
  864. vllm/model_executor/models/glm.py +23 -0
  865. vllm/model_executor/models/glm4.py +305 -0
  866. vllm/model_executor/models/glm4_1v.py +1590 -0
  867. vllm/model_executor/models/glm4v.py +657 -0
  868. vllm/model_executor/models/gpt2.py +382 -0
  869. vllm/model_executor/models/gpt_bigcode.py +335 -0
  870. vllm/model_executor/models/gpt_j.py +339 -0
  871. vllm/model_executor/models/gpt_neox.py +332 -0
  872. vllm/model_executor/models/granite.py +493 -0
  873. vllm/model_executor/models/granite_speech.py +790 -0
  874. vllm/model_executor/models/granitemoe.py +437 -0
  875. vllm/model_executor/models/granitemoehybrid.py +653 -0
  876. vllm/model_executor/models/granitemoeshared.py +341 -0
  877. vllm/model_executor/models/gritlm.py +224 -0
  878. vllm/model_executor/models/grok1.py +546 -0
  879. vllm/model_executor/models/h2ovl.py +549 -0
  880. vllm/model_executor/models/hunyuan_v1_moe.py +897 -0
  881. vllm/model_executor/models/idefics2_vision_model.py +389 -0
  882. vllm/model_executor/models/idefics3.py +786 -0
  883. vllm/model_executor/models/interfaces.py +681 -0
  884. vllm/model_executor/models/interfaces_base.py +164 -0
  885. vllm/model_executor/models/intern_vit.py +480 -0
  886. vllm/model_executor/models/internlm2.py +455 -0
  887. vllm/model_executor/models/internlm2_ve.py +147 -0
  888. vllm/model_executor/models/internvl.py +1432 -0
  889. vllm/model_executor/models/jais.py +373 -0
  890. vllm/model_executor/models/jamba.py +592 -0
  891. vllm/model_executor/models/keye.py +1736 -0
  892. vllm/model_executor/models/kimi_vl.py +585 -0
  893. vllm/model_executor/models/llama.py +644 -0
  894. vllm/model_executor/models/llama4.py +531 -0
  895. vllm/model_executor/models/llama_eagle.py +165 -0
  896. vllm/model_executor/models/llama_eagle3.py +263 -0
  897. vllm/model_executor/models/llava.py +887 -0
  898. vllm/model_executor/models/llava_next.py +604 -0
  899. vllm/model_executor/models/llava_next_video.py +492 -0
  900. vllm/model_executor/models/llava_onevision.py +985 -0
  901. vllm/model_executor/models/mamba.py +273 -0
  902. vllm/model_executor/models/mamba2.py +320 -0
  903. vllm/model_executor/models/mamba_cache.py +76 -0
  904. vllm/model_executor/models/medusa.py +219 -0
  905. vllm/model_executor/models/mimo.py +192 -0
  906. vllm/model_executor/models/mimo_mtp.py +285 -0
  907. vllm/model_executor/models/minicpm.py +592 -0
  908. vllm/model_executor/models/minicpm3.py +230 -0
  909. vllm/model_executor/models/minicpm_eagle.py +391 -0
  910. vllm/model_executor/models/minicpmo.py +772 -0
  911. vllm/model_executor/models/minicpmv.py +1307 -0
  912. vllm/model_executor/models/minimax_cache.py +36 -0
  913. vllm/model_executor/models/minimax_text_01.py +1301 -0
  914. vllm/model_executor/models/minimax_vl_01.py +374 -0
  915. vllm/model_executor/models/mistral3.py +624 -0
  916. vllm/model_executor/models/mixtral.py +488 -0
  917. vllm/model_executor/models/mixtral_quant.py +453 -0
  918. vllm/model_executor/models/mllama.py +1682 -0
  919. vllm/model_executor/models/mllama4.py +947 -0
  920. vllm/model_executor/models/mlp_speculator.py +206 -0
  921. vllm/model_executor/models/modernbert.py +339 -0
  922. vllm/model_executor/models/module_mapping.py +72 -0
  923. vllm/model_executor/models/molmo.py +1576 -0
  924. vllm/model_executor/models/moonvit.py +630 -0
  925. vllm/model_executor/models/mpt.py +331 -0
  926. vllm/model_executor/models/nemotron.py +508 -0
  927. vllm/model_executor/models/nemotron_h.py +588 -0
  928. vllm/model_executor/models/nemotron_nas.py +484 -0
  929. vllm/model_executor/models/nvlm_d.py +216 -0
  930. vllm/model_executor/models/olmo.py +389 -0
  931. vllm/model_executor/models/olmo2.py +414 -0
  932. vllm/model_executor/models/olmoe.py +468 -0
  933. vllm/model_executor/models/opt.py +412 -0
  934. vllm/model_executor/models/orion.py +349 -0
  935. vllm/model_executor/models/ovis.py +577 -0
  936. vllm/model_executor/models/paligemma.py +419 -0
  937. vllm/model_executor/models/persimmon.py +344 -0
  938. vllm/model_executor/models/phi.py +356 -0
  939. vllm/model_executor/models/phi3.py +19 -0
  940. vllm/model_executor/models/phi3_small.py +465 -0
  941. vllm/model_executor/models/phi3v.py +733 -0
  942. vllm/model_executor/models/phi4mm.py +1258 -0
  943. vllm/model_executor/models/phi4mm_audio.py +1233 -0
  944. vllm/model_executor/models/phi4mm_utils.py +1884 -0
  945. vllm/model_executor/models/phimoe.py +674 -0
  946. vllm/model_executor/models/pixtral.py +1329 -0
  947. vllm/model_executor/models/plamo2.py +738 -0
  948. vllm/model_executor/models/prithvi_geospatial_mae.py +240 -0
  949. vllm/model_executor/models/qwen.py +362 -0
  950. vllm/model_executor/models/qwen2.py +501 -0
  951. vllm/model_executor/models/qwen2_5_omni_thinker.py +923 -0
  952. vllm/model_executor/models/qwen2_5_vl.py +1175 -0
  953. vllm/model_executor/models/qwen2_audio.py +420 -0
  954. vllm/model_executor/models/qwen2_moe.py +540 -0
  955. vllm/model_executor/models/qwen2_rm.py +122 -0
  956. vllm/model_executor/models/qwen2_vl.py +1513 -0
  957. vllm/model_executor/models/qwen3.py +325 -0
  958. vllm/model_executor/models/qwen3_moe.py +541 -0
  959. vllm/model_executor/models/qwen_vl.py +796 -0
  960. vllm/model_executor/models/registry.py +634 -0
  961. vllm/model_executor/models/roberta.py +271 -0
  962. vllm/model_executor/models/siglip.py +524 -0
  963. vllm/model_executor/models/skyworkr1v.py +961 -0
  964. vllm/model_executor/models/smolvlm.py +52 -0
  965. vllm/model_executor/models/solar.py +506 -0
  966. vllm/model_executor/models/stablelm.py +343 -0
  967. vllm/model_executor/models/starcoder2.py +356 -0
  968. vllm/model_executor/models/tarsier.py +652 -0
  969. vllm/model_executor/models/telechat2.py +140 -0
  970. vllm/model_executor/models/teleflm.py +79 -0
  971. vllm/model_executor/models/transformers.py +509 -0
  972. vllm/model_executor/models/ultravox.py +670 -0
  973. vllm/model_executor/models/utils.py +744 -0
  974. vllm/model_executor/models/vision.py +147 -0
  975. vllm/model_executor/models/whisper.py +886 -0
  976. vllm/model_executor/models/zamba2.py +1036 -0
  977. vllm/model_executor/parameter.py +459 -0
  978. vllm/model_executor/pooling_metadata.py +72 -0
  979. vllm/model_executor/sampling_metadata.py +597 -0
  980. vllm/model_executor/utils.py +80 -0
  981. vllm/multimodal/__init__.py +33 -0
  982. vllm/multimodal/audio.py +116 -0
  983. vllm/multimodal/base.py +219 -0
  984. vllm/multimodal/hasher.py +91 -0
  985. vllm/multimodal/image.py +103 -0
  986. vllm/multimodal/inputs.py +878 -0
  987. vllm/multimodal/parse.py +499 -0
  988. vllm/multimodal/processing.py +1948 -0
  989. vllm/multimodal/profiling.py +283 -0
  990. vllm/multimodal/registry.py +331 -0
  991. vllm/multimodal/utils.py +492 -0
  992. vllm/multimodal/video.py +227 -0
  993. vllm/outputs.py +516 -0
  994. vllm/platforms/__init__.py +291 -0
  995. vllm/platforms/cpu.py +281 -0
  996. vllm/platforms/cuda.py +568 -0
  997. vllm/platforms/hpu.py +106 -0
  998. vllm/platforms/interface.py +551 -0
  999. vllm/platforms/neuron.py +150 -0
  1000. vllm/platforms/rocm.py +453 -0
  1001. vllm/platforms/tpu.py +206 -0
  1002. vllm/platforms/xpu.py +192 -0
  1003. vllm/plugins/__init__.py +94 -0
  1004. vllm/plugins/lora_resolvers/README.md +15 -0
  1005. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1006. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  1007. vllm/pooling_params.py +64 -0
  1008. vllm/profiler/__init__.py +0 -0
  1009. vllm/profiler/layerwise_profile.py +375 -0
  1010. vllm/profiler/utils.py +148 -0
  1011. vllm/prompt_adapter/__init__.py +0 -0
  1012. vllm/prompt_adapter/layers.py +83 -0
  1013. vllm/prompt_adapter/models.py +358 -0
  1014. vllm/prompt_adapter/request.py +37 -0
  1015. vllm/prompt_adapter/utils.py +98 -0
  1016. vllm/prompt_adapter/worker_manager.py +179 -0
  1017. vllm/py.typed +2 -0
  1018. vllm/reasoning/__init__.py +15 -0
  1019. vllm/reasoning/abs_reasoning_parsers.py +192 -0
  1020. vllm/reasoning/deepseek_r1_reasoning_parser.py +173 -0
  1021. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1022. vllm/reasoning/qwen3_reasoning_parser.py +151 -0
  1023. vllm/sampling_params.py +602 -0
  1024. vllm/scalar_type.py +347 -0
  1025. vllm/scripts.py +15 -0
  1026. vllm/sequence.py +1568 -0
  1027. vllm/spec_decode/__init__.py +0 -0
  1028. vllm/spec_decode/batch_expansion.py +506 -0
  1029. vllm/spec_decode/draft_model_runner.py +349 -0
  1030. vllm/spec_decode/interfaces.py +99 -0
  1031. vllm/spec_decode/medusa_worker.py +138 -0
  1032. vllm/spec_decode/metrics.py +213 -0
  1033. vllm/spec_decode/mlp_speculator_worker.py +94 -0
  1034. vllm/spec_decode/mqa_scorer.py +160 -0
  1035. vllm/spec_decode/multi_step_worker.py +423 -0
  1036. vllm/spec_decode/ngram_worker.py +196 -0
  1037. vllm/spec_decode/proposer_worker_base.py +59 -0
  1038. vllm/spec_decode/smaller_tp_proposer_worker.py +196 -0
  1039. vllm/spec_decode/spec_decode_worker.py +1326 -0
  1040. vllm/spec_decode/target_model_runner.py +45 -0
  1041. vllm/spec_decode/top1_proposer.py +275 -0
  1042. vllm/spec_decode/util.py +277 -0
  1043. vllm/test_utils.py +130 -0
  1044. vllm/third_party/__init__.py +0 -0
  1045. vllm/third_party/pynvml.py +6140 -0
  1046. vllm/tracing.py +131 -0
  1047. vllm/transformers_utils/__init__.py +24 -0
  1048. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1049. vllm/transformers_utils/chat_templates/registry.py +60 -0
  1050. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1051. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1052. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1053. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1054. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1055. vllm/transformers_utils/config.py +922 -0
  1056. vllm/transformers_utils/configs/__init__.py +57 -0
  1057. vllm/transformers_utils/configs/arctic.py +207 -0
  1058. vllm/transformers_utils/configs/chatglm.py +72 -0
  1059. vllm/transformers_utils/configs/cohere2.py +195 -0
  1060. vllm/transformers_utils/configs/dbrx.py +280 -0
  1061. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1062. vllm/transformers_utils/configs/eagle.py +85 -0
  1063. vllm/transformers_utils/configs/exaone.py +190 -0
  1064. vllm/transformers_utils/configs/falcon.py +90 -0
  1065. vllm/transformers_utils/configs/jais.py +238 -0
  1066. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1067. vllm/transformers_utils/configs/medusa.py +63 -0
  1068. vllm/transformers_utils/configs/minimax_text_01.py +70 -0
  1069. vllm/transformers_utils/configs/minimax_vl_01.py +71 -0
  1070. vllm/transformers_utils/configs/mllama.py +31 -0
  1071. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1072. vllm/transformers_utils/configs/moonvit.py +33 -0
  1073. vllm/transformers_utils/configs/mpt.py +180 -0
  1074. vllm/transformers_utils/configs/nemotron.py +205 -0
  1075. vllm/transformers_utils/configs/nemotron_h.py +259 -0
  1076. vllm/transformers_utils/configs/nvlm_d.py +31 -0
  1077. vllm/transformers_utils/configs/ovis.py +184 -0
  1078. vllm/transformers_utils/configs/skyworkr1v.py +54 -0
  1079. vllm/transformers_utils/configs/solar.py +247 -0
  1080. vllm/transformers_utils/configs/telechat2.py +64 -0
  1081. vllm/transformers_utils/configs/ultravox.py +108 -0
  1082. vllm/transformers_utils/detokenizer.py +168 -0
  1083. vllm/transformers_utils/detokenizer_utils.py +189 -0
  1084. vllm/transformers_utils/processor.py +221 -0
  1085. vllm/transformers_utils/processors/__init__.py +8 -0
  1086. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  1087. vllm/transformers_utils/processors/ovis.py +420 -0
  1088. vllm/transformers_utils/s3_utils.py +162 -0
  1089. vllm/transformers_utils/tokenizer.py +302 -0
  1090. vllm/transformers_utils/tokenizer_base.py +149 -0
  1091. vllm/transformers_utils/tokenizer_group.py +120 -0
  1092. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1093. vllm/transformers_utils/tokenizers/mistral.py +493 -0
  1094. vllm/transformers_utils/utils.py +99 -0
  1095. vllm/triton_utils/__init__.py +14 -0
  1096. vllm/triton_utils/importing.py +94 -0
  1097. vllm/usage/__init__.py +0 -0
  1098. vllm/usage/usage_lib.py +259 -0
  1099. vllm/utils/__init__.py +3008 -0
  1100. vllm/v1/__init__.py +0 -0
  1101. vllm/v1/attention/__init__.py +0 -0
  1102. vllm/v1/attention/backends/__init__.py +0 -0
  1103. vllm/v1/attention/backends/cpu_attn.py +184 -0
  1104. vllm/v1/attention/backends/flash_attn.py +757 -0
  1105. vllm/v1/attention/backends/flashinfer.py +680 -0
  1106. vllm/v1/attention/backends/flex_attention.py +491 -0
  1107. vllm/v1/attention/backends/mamba_attn.py +192 -0
  1108. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1109. vllm/v1/attention/backends/mla/common.py +978 -0
  1110. vllm/v1/attention/backends/mla/cutlass_mla.py +98 -0
  1111. vllm/v1/attention/backends/mla/flashmla.py +180 -0
  1112. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +241 -0
  1113. vllm/v1/attention/backends/mla/triton_mla.py +177 -0
  1114. vllm/v1/attention/backends/pallas.py +320 -0
  1115. vllm/v1/attention/backends/rocm_aiter_fa.py +609 -0
  1116. vllm/v1/attention/backends/triton_attn.py +449 -0
  1117. vllm/v1/attention/backends/utils.py +310 -0
  1118. vllm/v1/core/__init__.py +0 -0
  1119. vllm/v1/core/block_pool.py +349 -0
  1120. vllm/v1/core/encoder_cache_manager.py +254 -0
  1121. vllm/v1/core/kv_cache_coordinator.py +369 -0
  1122. vllm/v1/core/kv_cache_manager.py +398 -0
  1123. vllm/v1/core/kv_cache_utils.py +999 -0
  1124. vllm/v1/core/sched/__init__.py +0 -0
  1125. vllm/v1/core/sched/interface.py +150 -0
  1126. vllm/v1/core/sched/output.py +157 -0
  1127. vllm/v1/core/sched/request_queue.py +224 -0
  1128. vllm/v1/core/sched/scheduler.py +1115 -0
  1129. vllm/v1/core/sched/utils.py +36 -0
  1130. vllm/v1/core/single_type_kv_cache_manager.py +444 -0
  1131. vllm/v1/engine/__init__.py +179 -0
  1132. vllm/v1/engine/async_llm.py +626 -0
  1133. vllm/v1/engine/coordinator.py +278 -0
  1134. vllm/v1/engine/core.py +1046 -0
  1135. vllm/v1/engine/core_client.py +1049 -0
  1136. vllm/v1/engine/detokenizer.py +292 -0
  1137. vllm/v1/engine/exceptions.py +17 -0
  1138. vllm/v1/engine/llm_engine.py +322 -0
  1139. vllm/v1/engine/logprobs.py +200 -0
  1140. vllm/v1/engine/mm_input_cache.py +91 -0
  1141. vllm/v1/engine/output_processor.py +477 -0
  1142. vllm/v1/engine/parallel_sampling.py +133 -0
  1143. vllm/v1/engine/processor.py +422 -0
  1144. vllm/v1/engine/utils.py +546 -0
  1145. vllm/v1/executor/__init__.py +0 -0
  1146. vllm/v1/executor/abstract.py +113 -0
  1147. vllm/v1/executor/multiproc_executor.py +532 -0
  1148. vllm/v1/executor/ray_distributed_executor.py +62 -0
  1149. vllm/v1/kv_cache_interface.py +223 -0
  1150. vllm/v1/metrics/__init__.py +0 -0
  1151. vllm/v1/metrics/loggers.py +557 -0
  1152. vllm/v1/metrics/prometheus.py +82 -0
  1153. vllm/v1/metrics/ray_wrappers.py +131 -0
  1154. vllm/v1/metrics/reader.py +246 -0
  1155. vllm/v1/metrics/stats.py +240 -0
  1156. vllm/v1/outputs.py +124 -0
  1157. vllm/v1/pool/__init__.py +0 -0
  1158. vllm/v1/pool/metadata.py +17 -0
  1159. vllm/v1/request.py +229 -0
  1160. vllm/v1/sample/__init__.py +0 -0
  1161. vllm/v1/sample/logits_processor.py +517 -0
  1162. vllm/v1/sample/metadata.py +43 -0
  1163. vllm/v1/sample/ops/__init__.py +0 -0
  1164. vllm/v1/sample/ops/bad_words.py +39 -0
  1165. vllm/v1/sample/ops/penalties.py +43 -0
  1166. vllm/v1/sample/ops/topk_topp_sampler.py +296 -0
  1167. vllm/v1/sample/rejection_sampler.py +631 -0
  1168. vllm/v1/sample/sampler.py +226 -0
  1169. vllm/v1/sample/tpu/__init__.py +0 -0
  1170. vllm/v1/sample/tpu/metadata.py +124 -0
  1171. vllm/v1/sample/tpu/sampler.py +145 -0
  1172. vllm/v1/serial_utils.py +315 -0
  1173. vllm/v1/spec_decode/__init__.py +0 -0
  1174. vllm/v1/spec_decode/eagle.py +441 -0
  1175. vllm/v1/spec_decode/medusa.py +64 -0
  1176. vllm/v1/spec_decode/metadata.py +62 -0
  1177. vllm/v1/spec_decode/metrics.py +178 -0
  1178. vllm/v1/spec_decode/ngram_proposer.py +132 -0
  1179. vllm/v1/spec_decode/utils.py +41 -0
  1180. vllm/v1/structured_output/__init__.py +227 -0
  1181. vllm/v1/structured_output/backend_guidance.py +245 -0
  1182. vllm/v1/structured_output/backend_types.py +134 -0
  1183. vllm/v1/structured_output/backend_xgrammar.py +318 -0
  1184. vllm/v1/structured_output/request.py +86 -0
  1185. vllm/v1/structured_output/utils.py +175 -0
  1186. vllm/v1/utils.py +377 -0
  1187. vllm/v1/worker/__init__.py +0 -0
  1188. vllm/v1/worker/block_table.py +142 -0
  1189. vllm/v1/worker/cpu_model_runner.py +91 -0
  1190. vllm/v1/worker/cpu_worker.py +153 -0
  1191. vllm/v1/worker/gpu_input_batch.py +757 -0
  1192. vllm/v1/worker/gpu_model_runner.py +2739 -0
  1193. vllm/v1/worker/gpu_worker.py +408 -0
  1194. vllm/v1/worker/lora_model_runner_mixin.py +177 -0
  1195. vllm/v1/worker/tpu_input_batch.py +585 -0
  1196. vllm/v1/worker/tpu_model_runner.py +1849 -0
  1197. vllm/v1/worker/tpu_worker.py +315 -0
  1198. vllm/v1/worker/utils.py +112 -0
  1199. vllm/v1/worker/worker_base.py +65 -0
  1200. vllm/v1/worker/xpu_model_runner.py +33 -0
  1201. vllm/v1/worker/xpu_worker.py +165 -0
  1202. vllm/version.py +41 -0
  1203. vllm/vllm_flash_attn/.gitkeep +0 -0
  1204. vllm/worker/__init__.py +0 -0
  1205. vllm/worker/cache_engine.py +145 -0
  1206. vllm/worker/cpu_enc_dec_model_runner.py +326 -0
  1207. vllm/worker/cpu_model_runner.py +671 -0
  1208. vllm/worker/cpu_pooling_model_runner.py +125 -0
  1209. vllm/worker/cpu_worker.py +452 -0
  1210. vllm/worker/enc_dec_model_runner.py +555 -0
  1211. vllm/worker/hpu_model_runner.py +2320 -0
  1212. vllm/worker/hpu_worker.py +484 -0
  1213. vllm/worker/model_runner.py +2178 -0
  1214. vllm/worker/model_runner_base.py +282 -0
  1215. vllm/worker/multi_step_hpu_worker.py +123 -0
  1216. vllm/worker/multi_step_model_runner.py +911 -0
  1217. vllm/worker/multi_step_neuron_model_runner.py +84 -0
  1218. vllm/worker/multi_step_neuronx_distributed_model_runner.py +63 -0
  1219. vllm/worker/multi_step_tpu_worker.py +108 -0
  1220. vllm/worker/multi_step_worker.py +197 -0
  1221. vllm/worker/neuron_model_runner.py +460 -0
  1222. vllm/worker/neuron_worker.py +193 -0
  1223. vllm/worker/neuronx_distributed_model_runner.py +294 -0
  1224. vllm/worker/pooling_model_runner.py +211 -0
  1225. vllm/worker/tpu_model_runner.py +909 -0
  1226. vllm/worker/tpu_worker.py +337 -0
  1227. vllm/worker/utils.py +53 -0
  1228. vllm/worker/worker.py +577 -0
  1229. vllm/worker/worker_base.py +646 -0
  1230. vllm/worker/xpu_model_runner.py +606 -0
  1231. vllm/worker/xpu_worker.py +186 -0
  1232. vllm_cpu-0.9.2.post2.dist-info/METADATA +339 -0
  1233. vllm_cpu-0.9.2.post2.dist-info/RECORD +1236 -0
  1234. vllm_cpu-0.9.2.post2.dist-info/WHEEL +5 -0
  1235. vllm_cpu-0.9.2.post2.dist-info/entry_points.txt +5 -0
  1236. vllm_cpu-0.9.2.post2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1278 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+
4
+ import asyncio
5
+ import json
6
+ from abc import ABC, abstractmethod
7
+ from collections import defaultdict, deque
8
+ from collections.abc import Awaitable, Iterable
9
+ from functools import cached_property, lru_cache, partial
10
+ from pathlib import Path
11
+ from typing import (Any, Callable, Generic, Literal, Optional, TypeVar, Union,
12
+ cast)
13
+
14
+ import jinja2.nodes
15
+ import transformers.utils.chat_template_utils as hf_chat_utils
16
+ # yapf conflicts with isort for this block
17
+ # yapf: disable
18
+ from openai.types.chat import (ChatCompletionAssistantMessageParam,
19
+ ChatCompletionContentPartImageParam,
20
+ ChatCompletionContentPartInputAudioParam)
21
+ from openai.types.chat import (
22
+ ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam)
23
+ from openai.types.chat import (ChatCompletionContentPartRefusalParam,
24
+ ChatCompletionContentPartTextParam)
25
+ from openai.types.chat import (
26
+ ChatCompletionMessageParam as OpenAIChatCompletionMessageParam)
27
+ from openai.types.chat import (ChatCompletionMessageToolCallParam,
28
+ ChatCompletionToolMessageParam)
29
+ from openai.types.chat.chat_completion_content_part_input_audio_param import (
30
+ InputAudio)
31
+ from PIL import Image
32
+ from pydantic import BaseModel, ConfigDict, TypeAdapter
33
+ # yapf: enable
34
+ from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast,
35
+ ProcessorMixin)
36
+ # pydantic needs the TypedDict from typing_extensions
37
+ from typing_extensions import Required, TypeAlias, TypedDict
38
+
39
+ from vllm.config import ModelConfig
40
+ from vllm.logger import init_logger
41
+ from vllm.model_executor.model_loader import get_model_cls
42
+ from vllm.model_executor.models import SupportsMultiModal
43
+ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
44
+ from vllm.multimodal.utils import MediaConnector
45
+ # yapf: disable
46
+ from vllm.transformers_utils.chat_templates import (
47
+ get_chat_template_fallback_path)
48
+ # yapf: enable
49
+ from vllm.transformers_utils.processor import cached_get_processor
50
+ from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
51
+ from vllm.utils import deprecate_kwargs, random_uuid
52
+
53
+ logger = init_logger(__name__)
54
+
55
+
56
+ class AudioURL(TypedDict, total=False):
57
+ url: Required[str]
58
+ """
59
+ Either a URL of the audio or a data URL with base64 encoded audio data.
60
+ """
61
+
62
+
63
class ChatCompletionContentPartAudioParam(TypedDict, total=False):
    """Content part that attaches audio to a message by URL."""

    audio_url: Required[AudioURL]

    type: Required[Literal["audio_url"]]
    """Discriminator of this content part; always ``"audio_url"``."""
68
+
69
+
70
+ class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False):
71
+ image_embeds: Required[Union[str, dict[str, str]]]
72
+ """
73
+ The image embeddings. It can be either:
74
+ - A single base64 string.
75
+ - A dictionary where each value is a base64 string.
76
+ """
77
+ type: Required[Literal["image_embeds"]]
78
+ """The type of the content part."""
79
+
80
+
81
+ class VideoURL(TypedDict, total=False):
82
+ url: Required[str]
83
+ """
84
+ Either a URL of the video or a data URL with base64 encoded video data.
85
+ """
86
+
87
+
88
class ChatCompletionContentPartVideoParam(TypedDict, total=False):
    """Content part that attaches a video to a message by URL."""

    video_url: Required[VideoURL]

    type: Required[Literal["video_url"]]
    """Discriminator of this content part; always ``"video_url"``."""
93
+
94
+
95
class PILImage(BaseModel):
    """
    Pydantic wrapper holding a single ``PIL.Image.Image`` instance.
    """
    # PIL images are not a pydantic-native type, so arbitrary types
    # have to be allowed explicitly.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    image_pil: Image.Image
101
+
102
+
103
class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
    """Shorthand content part whose only key is an in-memory PIL image.

    Example:
    {
        "image_pil": ImageAsset('cherry_blossom').pil_image
    }
    """

    image_pil: Required[PILImage]
112
+
113
+
114
+ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
115
+ """A simpler version of the param that only accepts a plain image_url.
116
+ This is supported by OpenAI API, although it is not documented.
117
+
118
+ Example:
119
+ {
120
+ "image_url": "https://example.com/image.jpg"
121
+ }
122
+ """
123
+ image_url: Required[str]
124
+
125
+
126
+ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False):
127
+ """A simpler version of the param that only accepts a plain audio_url.
128
+
129
+ Example:
130
+ {
131
+ "audio_url": "https://example.com/audio.mp3"
132
+ }
133
+ """
134
+ audio_url: Required[str]
135
+
136
+
137
+ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
138
+ """A simpler version of the param that only accepts a plain audio_url.
139
+
140
+ Example:
141
+ {
142
+ "video_url": "https://example.com/video.mp4"
143
+ }
144
+ """
145
+ video_url: Required[str]
146
+
147
+
148
# Every shape a single content part of a chat message may take: the parts
# defined by the OpenAI client, the typed parts declared in this module,
# their shorthand variants, and a plain string.
ChatCompletionContentPartParam: TypeAlias = Union[
    OpenAIChatCompletionContentPartParam,
    ChatCompletionContentPartAudioParam,
    ChatCompletionContentPartInputAudioParam,
    ChatCompletionContentPartVideoParam,
    ChatCompletionContentPartRefusalParam,
    CustomChatCompletionContentPILImageParam,
    CustomChatCompletionContentSimpleImageParam,
    ChatCompletionContentPartImageEmbedsParam,
    CustomChatCompletionContentSimpleAudioParam,
    CustomChatCompletionContentSimpleVideoParam,
    str,
]
157
+
158
+
159
class CustomChatCompletionMessageParam(TypedDict, total=False):
    """Chat Completion message whose ``role`` may be any string."""

    role: Required[str]
    """Role of the author of this message."""

    content: Union[str, list[ChatCompletionContentPartParam]]
    """Message body: either plain text or a list of content parts."""

    name: str
    """Optional participant name.

    Lets the model tell apart several participants that share one role.
    """

    tool_call_id: Optional[str]
    """Identifier of the tool call this message answers."""

    tool_calls: Optional[Iterable[ChatCompletionMessageToolCallParam]]
    """Tool calls (for example function calls) produced by the model."""
179
+
180
+
181
# A chat message is either one of the OpenAI-defined message shapes or the
# custom-role message declared in this module.
ChatCompletionMessageParam = Union[
    OpenAIChatCompletionMessageParam,
    CustomChatCompletionMessageParam,
]
183
+
184
+
185
# TODO: Make fields ReadOnly once mypy supports it
class ConversationMessage(TypedDict, total=False):
    """One message of a conversation; only ``role`` is mandatory."""

    role: Required[str]
    """Role of the author of this message."""

    content: Union[Optional[str], list[dict[str, str]]]
    """Message body: text, ``None``, or a list of string-valued dicts."""

    tool_call_id: Optional[str]
    """Identifier of the tool call this message answers."""

    name: Optional[str]
    """Name of the function to call."""

    tool_calls: Optional[Iterable[ChatCompletionMessageToolCallParam]]
    """Tool calls (for example function calls) produced by the model."""
201
+
202
+
203
+ # Passed in by user
204
+ ChatTemplateContentFormatOption = Literal["auto", "string", "openai"]
205
+
206
+ # Used internally
207
+ _ChatTemplateContentFormat = Literal["string", "openai"]
208
+
209
+
210
def _is_var_access(node: jinja2.nodes.Node, varname: str) -> bool:
    """Tell whether ``node`` is a read of the variable called ``varname``."""
    if not isinstance(node, jinja2.nodes.Name):
        return False

    reads_variable = node.ctx == "load"
    return reads_variable and node.name == varname
215
+
216
+
217
def _is_attr_access(node: jinja2.nodes.Node, varname: str, key: str) -> bool:
    """Tell whether ``node`` reads ``key`` from the variable ``varname``.

    Both spellings are recognised: ``varname.key`` (attribute lookup) and
    ``varname['key']`` (subscript with a constant).
    """
    # varname.key
    if isinstance(node, jinja2.nodes.Getattr):
        return _is_var_access(node.node, varname) and node.attr == key

    # varname['key']
    if isinstance(node, jinja2.nodes.Getitem):
        subscript = node.arg
        return (_is_var_access(node.node, varname)
                and isinstance(subscript, jinja2.nodes.Const)
                and subscript.value == key)

    return False
227
+
228
+
229
def _is_var_or_elems_access(
    node: jinja2.nodes.Node,
    varname: str,
    key: Optional[str] = None,
) -> bool:
    """Tell whether ``node`` reads ``varname`` (or ``varname[key]``).

    Filters, tests and slice subscripts wrapped around the access are
    looked through, so e.g. a filtered or sliced variable still counts.
    When ``key`` is given, the innermost expression must be an access to
    that key of the variable; otherwise it must be the variable itself.
    """
    current = node

    # Strip the wrappers one layer at a time until the core expression
    # is reached.
    while True:
        if isinstance(current, jinja2.nodes.Filter):
            if current.node is None:
                return False
            current = current.node
        elif isinstance(current, jinja2.nodes.Test):
            current = current.node
        elif (isinstance(current, jinja2.nodes.Getitem)
              and isinstance(current.arg, jinja2.nodes.Slice)):
            current = current.node
        else:
            break

    if key:
        return _is_attr_access(current, varname, key)

    return _is_var_access(current, varname)
249
+
250
+
251
def _iter_nodes_assign_var_or_elems(root: jinja2.nodes.Node, varname: str):
    """Yield every node that binds ``varname`` or an alias derived from it.

    The first pair is always ``(root, varname)``, standing for the implicit
    global definition. Afterwards, every ``{% set x = <access> %}`` whose
    right-hand side reads ``varname`` (or a previously found alias) yields
    ``(assign_node, "x")``, and ``x`` is searched for in turn.

    Args:
        root: Root of the template AST to search.
        varname: Name of the variable whose aliases are wanted.

    Yields:
        ``(node, name)`` pairs, where ``name`` is the bound variable name.
    """
    # Global variable that is implicitly defined at the root
    yield root, varname

    # The assignments do not change during the search, so collect them once
    # instead of re-walking the tree for every alias.
    assignments = list(root.find_all(jinja2.nodes.Assign))

    # Breadth-first search over the alias graph. Each name is expanded at
    # most once: this covers plain self-assignment as well as longer cycles
    # (``a = messages`` together with ``messages = a``), which would
    # otherwise keep re-queueing each other forever.
    visited = {varname}
    pending = deque([varname])
    while pending:
        related_varname = pending.popleft()

        for assign_ast in assignments:
            lhs = assign_ast.target
            rhs = assign_ast.node

            if _is_var_or_elems_access(rhs, related_varname):
                assert isinstance(lhs, jinja2.nodes.Name)
                yield assign_ast, lhs.name

                if lhs.name not in visited:
                    visited.add(lhs.name)
                    pending.append(lhs.name)
271
+
272
+
273
# NOTE: The proper way to handle this is to build a CFG so that we can handle
# the scope in which each variable is defined, but that is too complicated
def _iter_nodes_assign_messages_item(root: jinja2.nodes.Node):
    """Yield ``(for_node, target_name)`` for loops iterating over messages.

    A loop qualifies when its iterable reads ``messages`` or any alias of
    it, e.g. ``{%- for message in messages -%}``.
    """
    aliases = [
        alias
        for _, alias in _iter_nodes_assign_var_or_elems(root, "messages")
    ]

    for loop_ast in root.find_all(jinja2.nodes.For):
        iterable = loop_ast.iter

        # One matching alias is enough; each loop is reported at most once
        if any(_is_var_or_elems_access(iterable, alias) for alias in aliases):
            target = loop_ast.target
            assert isinstance(target, jinja2.nodes.Name)
            yield loop_ast, target.name
291
+
292
+
293
def _iter_nodes_assign_content_item(root: jinja2.nodes.Node):
    """Yield ``(for_node, target_name)`` for loops over a message's content.

    A loop qualifies when its iterable reads the ``content`` key of any
    per-message loop variable, e.g.
    ``{%- for content in message['content'] -%}``.
    """
    per_message_names = [
        name for _, name in _iter_nodes_assign_messages_item(root)
    ]

    for loop_ast in root.find_all(jinja2.nodes.For):
        iterable = loop_ast.iter

        # One matching variable is enough; each loop is reported at most once
        if any(
                _is_var_or_elems_access(iterable, name, "content")
                for name in per_message_names):
            target = loop_ast.target
            assert isinstance(target, jinja2.nodes.Name)
            yield loop_ast, target.name
308
+
309
+
310
def _try_extract_ast(chat_template: str) -> Optional[jinja2.nodes.Template]:
    """Parse ``chat_template`` into a Jinja AST.

    Returns:
        The parsed template, or ``None`` when compiling or parsing raised;
        the failure is logged together with its traceback.
    """
    try:
        compiled = hf_chat_utils._compile_jinja_template(chat_template)
        ast = compiled.environment.parse(chat_template)
    except Exception:
        logger.exception("Error when compiling Jinja template")
        return None

    return ast
317
+
318
+
319
@lru_cache(maxsize=32)
def _detect_content_format(
    chat_template: str,
    *,
    default: _ChatTemplateContentFormat,
) -> _ChatTemplateContentFormat:
    """Detect which content format a chat template expects.

    Returns ``"openai"`` when the template contains a loop over a message's
    ``content``, ``"string"`` when it contains none, and ``default`` when
    the template cannot be parsed or the AST inspection raises.
    """
    jinja_ast = _try_extract_ast(chat_template)
    if jinja_ast is None:
        return default

    # Sentinel marking "no content loop found" without using an exception
    nothing_found = object()

    try:
        first_hit = next(_iter_nodes_assign_content_item(jinja_ast),
                         nothing_found)
    except Exception:
        logger.exception("Error when parsing AST of Jinja template")
        return default

    if first_hit is nothing_found:
        return "string"

    return "openai"
338
+
339
+
340
def resolve_mistral_chat_template(
    chat_template: Optional[str],
    **kwargs: Any,
) -> Optional[str]:
    """Resolve the chat template for a mistral tokenizer.

    Always returns ``None``. A one-time warning is logged for each option
    that was supplied but is ignored: an explicit ``chat_template``,
    ``add_generation_prompt`` and ``continue_final_message``.
    """
    if chat_template is not None:
        logger.warning_once(
            "'chat_template' cannot be overridden for mistral tokenizer.")

    ignored_options = (
        ("add_generation_prompt",
         "'add_generation_prompt' is not supported for mistral tokenizer, "
         "so it will be ignored."),
        ("continue_final_message",
         "'continue_final_message' is not supported for mistral tokenizer, "
         "so it will be ignored."),
    )
    for option, message in ignored_options:
        if option in kwargs:
            logger.warning_once(message)

    return None
356
+
357
@deprecate_kwargs(
    "trust_remote_code",
    additional_message="Please use `model_config.trust_remote_code` instead.",
)
def resolve_hf_chat_template(
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    chat_template: Optional[str],
    tools: Optional[list[dict[str, Any]]],
    *,
    model_config: ModelConfig,
    trust_remote_code: Optional[bool] = None,
) -> Optional[str]:
    """Pick the chat template to use with a HF tokenizer.

    Sources are tried in order of priority:

    1. the explicitly given ``chat_template``;
    2. the processor's chat template (skipped when ``tools`` is given);
    3. the tokenizer's own chat template;
    4. a predefined fallback template file.

    Failures in steps 2 and 3 are logged at debug level and the next source
    is tried. ``trust_remote_code`` is deprecated and not read here.

    Returns:
        The resolved template, or ``None`` when no source provides one.
    """
    # 1st priority: The given chat template
    if chat_template is not None:
        return chat_template

    # 2nd priority: AutoProcessor chat template, unless tool calling is enabled
    if tools is None:
        try:
            processor = cached_get_processor(
                tokenizer.name_or_path,
                processor_cls=(PreTrainedTokenizer, PreTrainedTokenizerFast,
                               ProcessorMixin),
                trust_remote_code=model_config.trust_remote_code,
            )
            if isinstance(processor, ProcessorMixin):
                processor_template = getattr(processor, "chat_template", None)
                if processor_template is not None:
                    return processor_template
        except Exception:
            logger.debug("Failed to load AutoProcessor chat template for %s", tokenizer.name_or_path, exc_info=True)  # noqa: E501

    # 3rd priority: AutoTokenizer chat template
    try:
        return tokenizer.get_chat_template(chat_template, tools=tools)
    except Exception:
        logger.debug("Failed to load AutoTokenizer chat template for %s",
                     tokenizer.name_or_path, exc_info=True)

    # 4th priority: Predefined fallbacks
    fallback_path = get_chat_template_fallback_path(
        model_type=model_config.hf_config.model_type,
        tokenizer_name_or_path=model_config.tokenizer,
    )
    if fallback_path is None:
        logger.debug("There is no chat template fallback for %s",
                     tokenizer.name_or_path)
        return None

    logger.info("Loading chat template fallback for %s as there isn't one "
                "defined on HF Hub.", tokenizer.name_or_path)
    return load_chat_template(fallback_path)
410
+
411
+
412
def _resolve_chat_template_content_format(
    chat_template: Optional[str],
    tools: Optional[list[dict[str, Any]]],
    tokenizer: AnyTokenizer,
    *,
    model_config: ModelConfig,
) -> _ChatTemplateContentFormat:
    """Detect the content format of the template that will actually be used.

    For HF tokenizers the template is resolved first; for any other
    tokenizer, or when resolution yields no string, the given
    ``chat_template`` is taken literally. Without any template the format
    is ``"string"``.
    """
    hf_chat_template = None
    if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
        hf_chat_template = resolve_hf_chat_template(
            tokenizer,
            chat_template=chat_template,
            tools=tools,
            model_config=model_config,
        )

    if isinstance(hf_chat_template, str):
        jinja_text = hf_chat_template
    else:
        jinja_text = load_chat_template(chat_template, is_literal=True)

    if jinja_text is None:
        return "string"

    return _detect_content_format(jinja_text, default="string")
436
+
437
+
438
@lru_cache
def _log_chat_template_content_format(
    chat_template: Optional[str],
    given_format: ChatTemplateContentFormatOption,
    detected_format: ChatTemplateContentFormatOption,
):
    """Log the detected content format and warn when it contradicts the user.

    The function is cached, so each distinct combination of arguments is
    logged only while it stays in the cache.
    """
    logger.info(
        "Detected the chat template content format to be '%s'. "
        "You can set `--chat-template-content-format` to override this.",
        detected_format,
    )

    # Nothing more to say when the user left the choice to us, or agreed
    if given_format == "auto" or given_format == detected_format:
        return

    logger.warning(
        "You specified `--chat-template-content-format %s` "
        "which is different from the detected format '%s'. "
        "If our automatic detection is incorrect, please consider "
        "opening a GitHub issue so that we can improve it: "
        "https://github.com/vllm-project/vllm/issues/new/choose",
        given_format,
        detected_format,
    )
460
+
461
+
462
@deprecate_kwargs(
    "trust_remote_code",
    additional_message="Please use `model_config.trust_remote_code` instead.",
)
def resolve_chat_template_content_format(
    chat_template: Optional[str],
    tools: Optional[list[dict[str, Any]]],
    given_format: ChatTemplateContentFormatOption,
    tokenizer: AnyTokenizer,
    *,
    model_config: ModelConfig,
    trust_remote_code: Optional[bool] = None,
) -> _ChatTemplateContentFormat:
    """Return the content format to apply to chat messages.

    An explicit ``given_format`` is returned unchanged. Only ``"auto"``
    triggers detection from the resolved chat template; the detected
    format is then logged and returned. ``trust_remote_code`` is
    deprecated and not read here.
    """
    if given_format == "auto":
        detected_format = _resolve_chat_template_content_format(
            chat_template,
            tools,
            tokenizer,
            model_config=model_config,
        )

        _log_chat_template_content_format(
            chat_template,
            given_format=given_format,
            detected_format=detected_format,
        )

        return detected_format

    return given_format
492
+
493
+
494
+
495
+ ModalityStr = Literal["image", "audio", "video", "image_embeds"]
496
+ _T = TypeVar("_T")
497
+
498
+
499
class BaseMultiModalItemTracker(ABC, Generic[_T]):
    """
    Collects the multi-modal items of one request, grouped by modality, and
    refuses any item that would push a modality past its configured
    per-prompt maximum.
    """

    def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer):
        super().__init__()

        self._model_config = model_config
        self._tokenizer = tokenizer

        # modality -> items added so far, in insertion order
        self._items_by_modality = defaultdict[str, list[_T]](list)

    @property
    def model_config(self) -> ModelConfig:
        return self._model_config

    @cached_property
    def model_cls(self):
        return get_model_cls(self.model_config)

    @property
    def allowed_local_media_path(self):
        return self._model_config.allowed_local_media_path

    @property
    def mm_registry(self):
        return MULTIMODAL_REGISTRY

    def add(self, modality: ModalityStr, item: _T) -> Optional[str]:
        """
        Register one multi-modal item for the current prompt.

        Returns the placeholder string to use for the item, if any.
        Raises ValueError when the model takes no multi-modal input or
        when the item would exceed the limit of its modality.
        """
        registry = self.mm_registry
        config = self.model_config
        model_cls = cast(SupportsMultiModal, self.model_cls)

        # "<x>_embeds" inputs count against the limit of modality "<x>"
        limit_key = modality.replace("_embeds", "")

        if not registry.has_processor(config):
            mm_config = config.multimodal_config
            if mm_config is None:
                msg = "This model does not support multi-modal inputs"
                raise ValueError(msg)

            allowed_count = mm_config.get_limit_per_prompt(limit_key)
        else:
            processor = registry.create_processor(config)
            limits = processor.info.get_allowed_mm_limits()
            allowed_count = limits.get(limit_key, 0)

        bucket = self._items_by_modality[modality]
        current_count = len(bucket) + 1
        if current_count > allowed_count:
            raise ValueError(
                f"At most {allowed_count} {modality}(s) may be provided in "
                "one request. You can set `--limit-mm-per-prompt` to "
                "increase this limit if the model supports it.")

        bucket.append(item)

        return model_cls.get_placeholder_str(modality, current_count)

    @abstractmethod
    def create_parser(self) -> "BaseMultiModalContentParser":
        raise NotImplementedError
567
+
568
+
569
class MultiModalItemTracker(BaseMultiModalItemTracker[object]):
    """Tracker whose items are already-loaded media objects."""

    def all_mm_data(self) -> Optional[MultiModalDataDict]:
        """Return the collected items keyed by modality, or ``None`` if empty.

        Image embeddings are returned under the ``"image"`` key. Raises
        ValueError when raw images and image embeddings are mixed, or when
        more than one image-embeds item was added.
        """
        if not self._items_by_modality:
            return None

        items_by_modality = dict(self._items_by_modality)
        has_embeds = "image_embeds" in items_by_modality

        if "image" in items_by_modality and has_embeds:
            raise ValueError(
                "Mixing raw image and embedding inputs is not allowed")

        mm_inputs = {}
        if has_embeds:
            image_embeds_lst = items_by_modality["image_embeds"]
            if len(image_embeds_lst) > 1:
                raise ValueError(
                    "Only one message can have {'type': 'image_embeds'}")
            mm_inputs["image"] = image_embeds_lst[0]

        # The remaining modalities are passed on as lists of items
        for modality in ("image", "audio", "video"):
            if modality in items_by_modality:
                mm_inputs[modality] = items_by_modality[modality]

        return mm_inputs

    def create_parser(self) -> "BaseMultiModalContentParser":
        return MultiModalContentParser(self)
596
+
597
+
598
class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]):
    """Tracker whose items are awaitables resolving to media objects."""

    async def all_mm_data(self) -> Optional[MultiModalDataDict]:
        """Await all items and return them keyed by modality, or ``None``.

        Image embeddings are returned under the ``"image"`` key. Raises
        ValueError when raw images and image embeddings are mixed, or when
        more than one image-embeds item was added.
        """
        if not self._items_by_modality:
            return None

        # Items of one modality are awaited together; modalities are
        # handled one after another.
        items_by_modality = {}
        for modality, pending in self._items_by_modality.items():
            items_by_modality[modality] = await asyncio.gather(*pending)

        has_embeds = "image_embeds" in items_by_modality

        if "image" in items_by_modality and has_embeds:
            raise ValueError(
                "Mixing raw image and embedding inputs is not allowed")

        mm_inputs = {}
        if has_embeds:
            image_embeds_lst = items_by_modality["image_embeds"]
            if len(image_embeds_lst) > 1:
                raise ValueError(
                    "Only one message can have {'type': 'image_embeds'}")
            mm_inputs["image"] = image_embeds_lst[0]

        # The remaining modalities are passed on as lists of items
        for modality in ("image", "audio", "video"):
            if modality in items_by_modality:
                mm_inputs[modality] = items_by_modality[modality]

        return mm_inputs

    def create_parser(self) -> "BaseMultiModalContentParser":
        return AsyncMultiModalContentParser(self)
629
+
630
+
631
class BaseMultiModalContentParser(ABC):
    """Interface for parsing multi-modal content parts.

    Also keeps a count of how often each placeholder string was handed out.
    """

    def __init__(self) -> None:
        super().__init__()

        # multimodal placeholder_string : count
        self._placeholder_counts: dict[str, int] = defaultdict(int)

    def _add_placeholder(self, placeholder: Optional[str]):
        # ``None`` and the empty string are not counted
        if not placeholder:
            return

        self._placeholder_counts[placeholder] += 1

    def mm_placeholder_counts(self) -> dict[str, int]:
        """Return a plain-dict copy of the placeholder counts."""
        return dict(self._placeholder_counts)

    @abstractmethod
    def parse_image(self, image_url: str) -> None:
        raise NotImplementedError

    @abstractmethod
    def parse_image_embeds(self,
                           image_embeds: Union[str, dict[str, str]]) -> None:
        raise NotImplementedError

    @abstractmethod
    def parse_image_pil(self, image_pil: Image.Image) -> None:
        raise NotImplementedError

    @abstractmethod
    def parse_audio(self, audio_url: str) -> None:
        raise NotImplementedError

    @abstractmethod
    def parse_input_audio(self, input_audio: InputAudio) -> None:
        raise NotImplementedError

    @abstractmethod
    def parse_video(self, video_url: str) -> None:
        raise NotImplementedError
670
+
671
+
672
class MultiModalContentParser(BaseMultiModalContentParser):
    """Parser that fetches media synchronously and registers it on a tracker.

    Each ``parse_*`` method loads the referenced item through the media
    connector, adds it to the tracker and counts the placeholder returned
    by the tracker.
    """

    def __init__(self, tracker: MultiModalItemTracker) -> None:
        super().__init__()

        self._tracker = tracker

        self._connector = MediaConnector(
            media_io_kwargs=self._tracker._model_config.media_io_kwargs,
            allowed_local_media_path=tracker.allowed_local_media_path,
        )

    def parse_image(self, image_url: str) -> None:
        image = self._connector.fetch_image(image_url)

        placeholder = self._tracker.add("image", image)
        self._add_placeholder(placeholder)

    def parse_image_embeds(self,
                           image_embeds: Union[str, dict[str, str]]) -> None:
        """Load image embeddings given as one string or a dict of strings.

        Raises:
            TypeError: If ``image_embeds`` is neither a ``str`` nor a
                ``dict``. Previously this case failed with an
                ``UnboundLocalError`` on the placeholder variable.
        """
        if isinstance(image_embeds, dict):
            embeds = {
                k: self._connector.fetch_image_embedding(v)
                for k, v in image_embeds.items()
            }
        elif isinstance(image_embeds, str):
            embeds = self._connector.fetch_image_embedding(image_embeds)
        else:
            raise TypeError(
                "image_embeds must be a str or a dict of str, "
                f"but got {type(image_embeds)}")

        placeholder = self._tracker.add("image_embeds", embeds)
        self._add_placeholder(placeholder)

    def parse_image_pil(self, image_pil: Image.Image) -> None:
        placeholder = self._tracker.add("image", image_pil)
        self._add_placeholder(placeholder)

    def parse_audio(self, audio_url: str) -> None:
        audio = self._connector.fetch_audio(audio_url)

        placeholder = self._tracker.add("audio", audio)
        self._add_placeholder(placeholder)

    def parse_input_audio(self, input_audio: InputAudio) -> None:
        # Re-express the inline audio as a data URL so that it goes through
        # the same path as audio given by URL.
        audio_data = input_audio.get("data", "")
        audio_format = input_audio.get("format", "")
        audio_url = f"data:audio/{audio_format};base64,{audio_data}"

        return self.parse_audio(audio_url)

    def parse_video(self, video_url: str) -> None:
        video = self._connector.fetch_video(video_url=video_url)

        placeholder = self._tracker.add("video", video)
        self._add_placeholder(placeholder)
727
+
728
+
729
class AsyncMultiModalContentParser(BaseMultiModalContentParser):
    """Parser that registers awaitable media items on an async tracker.

    URL-based items are registered as the awaitables returned by the media
    connector's ``*_async`` methods. Items that are available immediately
    are wrapped in an already-resolved future so that the tracker only
    ever holds awaitables.
    """

    def __init__(self, tracker: AsyncMultiModalItemTracker) -> None:
        super().__init__()

        self._tracker = tracker
        self._connector = MediaConnector(
            media_io_kwargs=self._tracker._model_config.media_io_kwargs,
            allowed_local_media_path=tracker.allowed_local_media_path
        )

    def parse_image(self, image_url: str) -> None:
        image_coro = self._connector.fetch_image_async(image_url)

        placeholder = self._tracker.add("image", image_coro)
        self._add_placeholder(placeholder)

    def parse_image_embeds(self,
                           image_embeds: Union[str, dict[str, str]]) -> None:
        """Load image embeddings given as one string or a dict of strings.

        Raises:
            TypeError: If ``image_embeds`` is neither a ``str`` nor a
                ``dict``. Previously a future that was never resolved got
                registered in this case, so awaiting the tracked items
                would never finish.
        """
        if isinstance(image_embeds, dict):
            embeds = {
                k: self._connector.fetch_image_embedding(v)
                for k, v in image_embeds.items()
            }
        elif isinstance(image_embeds, str):
            embeds = self._connector.fetch_image_embedding(image_embeds)
        else:
            raise TypeError(
                "image_embeds must be a str or a dict of str, "
                f"but got {type(image_embeds)}")

        # The embeddings are loaded eagerly; the future is resolved right
        # away and only adapts them to the awaitable-based tracker.
        future: asyncio.Future = asyncio.Future()
        future.set_result(embeds)

        placeholder = self._tracker.add("image_embeds", future)
        self._add_placeholder(placeholder)

    def parse_image_pil(self, image_pil: Image.Image) -> None:
        future: asyncio.Future[Image.Image] = asyncio.Future()
        future.set_result(image_pil)

        placeholder = self._tracker.add("image", future)
        self._add_placeholder(placeholder)

    def parse_audio(self, audio_url: str) -> None:
        audio_coro = self._connector.fetch_audio_async(audio_url)

        placeholder = self._tracker.add("audio", audio_coro)
        self._add_placeholder(placeholder)

    def parse_input_audio(self, input_audio: InputAudio) -> None:
        # Re-express the inline audio as a data URL so that it goes through
        # the same path as audio given by URL.
        audio_data = input_audio.get("data", "")
        audio_format = input_audio.get("format", "")
        audio_url = f"data:audio/{audio_format};base64,{audio_data}"

        return self.parse_audio(audio_url)

    def parse_video(self, video_url: str) -> None:
        video = self._connector.fetch_video_async(video_url=video_url)

        placeholder = self._tracker.add("video", video)
        self._add_placeholder(placeholder)
790
+
791
+
792
def validate_chat_template(chat_template: Optional[Union[Path, str]]):
    """Raises if the provided chat template appears invalid.

    Args:
        chat_template: ``None``, a path to a template file, or a string
            holding either a Jinja template or a path.

    Raises:
        FileNotFoundError: If a ``Path`` is given that does not exist.
        ValueError: If a string contains none of the characters typical
            for a Jinja template and does not name an existing path.
        TypeError: If the argument is of any other type.
    """
    if chat_template is None:
        return

    if isinstance(chat_template, Path):
        # An existing path is valid. It must return here rather than fall
        # through to the TypeError below, which is meant for foreign types.
        if not chat_template.exists():
            raise FileNotFoundError(
                "the supplied chat template path doesn't exist")
        return

    if isinstance(chat_template, str):
        # A string without any of these characters is assumed to be a path
        JINJA_CHARS = "{}\n"
        if not any(c in chat_template
                   for c in JINJA_CHARS) and not Path(chat_template).exists():
            raise ValueError(
                f"The supplied chat template string ({chat_template}) "
                f"appears path-like, but doesn't exist!")
        return

    raise TypeError(
        f"{type(chat_template)} is not a valid chat template type")
812
+
813
+
814
+ def _load_chat_template(
815
+ chat_template: Optional[Union[Path, str]],
816
+ *,
817
+ is_literal: bool = False,
818
+ ) -> Optional[str]:
819
+ if chat_template is None:
820
+ return None
821
+
822
+ if is_literal:
823
+ if isinstance(chat_template, Path):
824
+ raise TypeError("chat_template is expected to be read directly "
825
+ "from its value")
826
+
827
+ return chat_template
828
+
829
+ try:
830
+ with open(chat_template) as f:
831
+ return f.read()
832
+ except OSError as e:
833
+ if isinstance(chat_template, Path):
834
+ raise
835
+
836
+ JINJA_CHARS = "{}\n"
837
+ if not any(c in chat_template for c in JINJA_CHARS):
838
+ msg = (f"The supplied chat template ({chat_template}) "
839
+ f"looks like a file path, but it failed to be "
840
+ f"opened. Reason: {e}")
841
+ raise ValueError(msg) from e
842
+
843
+ # If opening a file fails, set chat template to be args to
844
+ # ensure we decode so our escape are interpreted correctly
845
+ return _load_chat_template(chat_template, is_literal=True)
846
+
847
+
848
# Memoized variant of the loader, using the default cache size of lru_cache
_cached_load_chat_template = lru_cache(maxsize=128)(_load_chat_template)


def load_chat_template(
    chat_template: Optional[Union[Path, str]],
    *,
    is_literal: bool = False,
) -> Optional[str]:
    """Cached front end of ``_load_chat_template``; same arguments and result."""
    template_text = _cached_load_chat_template(chat_template,
                                               is_literal=is_literal)
    return template_text
857
+
858
+
859
+ # TODO: Let user specify how to insert multimodal tokens into prompt
860
+ # (similar to chat template)
861
+ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int],
862
+ text_prompt: str) -> str:
863
+ """Combine multimodal prompts for a multimodal language model."""
864
+
865
+ # Look through the text prompt to check for missing placeholders
866
+ missing_placeholders: list[str] = []
867
+ for placeholder in placeholder_counts:
868
+
869
+ # For any existing placeholder in the text prompt, we leave it as is
870
+ placeholder_counts[placeholder] -= text_prompt.count(placeholder)
871
+
872
+ if placeholder_counts[placeholder] < 0:
873
+ raise ValueError(
874
+ f"Found more '{placeholder}' placeholders in input prompt than "
875
+ "actual multimodal data items.")
876
+
877
+ missing_placeholders.extend([placeholder] *
878
+ placeholder_counts[placeholder])
879
+
880
+ # NOTE: For now we always add missing placeholders at the front of
881
+ # the prompt. This may change to be customizable in the future.
882
+ return "\n".join(missing_placeholders + [text_prompt])
883
+
884
+
885
# These parts are only cast to their type, without running Pydantic
# validation again.
_TextParser = partial(cast, ChatCompletionContentPartTextParam)
_ImageEmbedsParser = partial(cast, ChatCompletionContentPartImageEmbedsParam)
_InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
_RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
_PILImageParser = partial(cast, CustomChatCompletionContentPILImageParam)

# Parts that carry a URL object are validated with Pydantic.
_ImageParser = TypeAdapter(ChatCompletionContentPartImageParam).validate_python
_AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python
_VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python

# What a part parser hands back for one content part.
_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage]
897
+
898
# Maps the "type" of a content part to the function that pulls the payload
# out of such a part. URL-style parts yield the inner "url" value; every
# other part yields the value stored under its own key.
MM_PARSER_MAP: dict[
    str,
    Callable[[ChatCompletionContentPartParam], _ContentPart],
] = {
    "text": lambda p: _TextParser(p).get("text", None),
    "image_url": lambda p: (
        _ImageParser(p).get("image_url", {}).get("url", None)),
    "image_embeds": lambda p: (
        _ImageEmbedsParser(p).get("image_embeds", None)),
    "image_pil": lambda p: _PILImageParser(p).get("image_pil", None),
    "audio_url": lambda p: (
        _AudioParser(p).get("audio_url", {}).get("url", None)),
    "input_audio": lambda p: (
        _InputAudioParser(p).get("input_audio", None)),
    "refusal": lambda p: _RefusalParser(p).get("refusal", None),
    "video_url": lambda p: (
        _VideoParser(p).get("video_url", {}).get("url", None)),
}
919
+
920
+
921
def _parse_chat_message_content_mm_part(
        part: ChatCompletionContentPartParam) -> tuple[str, _ContentPart]:
    """
    Parses a given multi-modal content part based on its type.

    Args:
        part: A dict containing the content part, with a potential 'type' field.

    Returns:
        A tuple (part_type, content) where:
        - part_type: Type of the part (e.g., 'text', 'image_url').
        - content: Parsed content (e.g., text, image URL). For a string
          'type' that has no entry in MM_PARSER_MAP this is a fixed
          placeholder string.

    Raises:
        ValueError: If the 'type' field is missing and no direct URL is
            found, or if 'type' is present but is not a string.
    """
    assert isinstance(
        part, dict)  # This is needed to avoid mypy errors: part.get() from str
    part_type = part.get("type", None)

    if isinstance(part_type, str) and part_type in MM_PARSER_MAP:
        content = MM_PARSER_MAP[part_type](part)

        # Special case for 'image_url.detail'
        # We only support 'auto', which is the default
        if part_type == "image_url":
            # In the OpenAI schema 'detail' lives inside the 'image_url'
            # object, so look there first. The top-level lookup is kept as
            # a fallback for callers that pass it alongside 'type'.
            image_url = part.get("image_url")
            detail = (image_url.get("detail", "auto") if isinstance(
                image_url, dict) else "auto")
            if detail == "auto":
                detail = part.get("detail", "auto")
            if detail != "auto":
                logger.warning("'image_url.detail' is currently not supported "
                               "and will be ignored.")

        return part_type, content

    # Handle missing 'type' but provided direct URL fields.
    # 'type' is required field by pydantic
    if part_type is None:
        if part.get("image_url") is not None:
            image_params = cast(CustomChatCompletionContentSimpleImageParam,
                                part)
            return "image_url", image_params.get("image_url", "")
        if part.get("audio_url") is not None:
            audio_params = cast(CustomChatCompletionContentSimpleAudioParam,
                                part)
            return "audio_url", audio_params.get("audio_url", "")
        if part.get("input_audio") is not None:
            # Unlike the URL cases, the whole part is returned here.
            input_audio_params = cast(dict[str, str], part)
            return "input_audio", input_audio_params
        if part.get("video_url") is not None:
            video_params = cast(CustomChatCompletionContentSimpleVideoParam,
                                part)
            return "video_url", video_params.get("video_url", "")
        # Raise an error if no 'type' or direct URL is found.
        raise ValueError("Missing 'type' field in multimodal part.")

    if not isinstance(part_type, str):
        raise ValueError("Invalid 'type' field in multimodal part.")
    # A string type with no registered parser: return a placeholder and
    # leave it to the caller to decide how to handle the unknown type.
    return part_type, "unknown part_type content"
976
+
977
+
978
# Part types recognised in chat message content.
VALID_MESSAGE_CONTENT_MM_PART_TYPES = (
    "text",
    "refusal",
    "image_url",
    "image_embeds",
    "image_pil",
    "audio_url",
    "input_audio",
    "video_url",
)
981
+
982
+
983
def _parse_chat_message_content_parts(
    role: str,
    parts: Iterable[ChatCompletionContentPartParam],
    mm_tracker: BaseMultiModalItemTracker,
    *,
    wrap_dicts: bool,
) -> list[ConversationMessage]:
    """Parse all content parts of one message into a conversation message.

    Args:
        role: Role to set on the resulting message.
        parts: Content parts of the message, parsed in order.
        mm_tracker: Tracker from which a parser for this message is created.
        wrap_dicts: If True, the parsed parts are kept as a list of dicts.
            Otherwise the text parts are joined with newlines and any
            missing multimodal placeholders are added to the prompt.

    Returns:
        A single-element list holding the parsed message.
    """
    mm_parser = mm_tracker.create_parser()

    parsed_parts = (_parse_chat_message_content_part(
        item,
        mm_parser,
        wrap_dicts=wrap_dicts,
    ) for item in parts)
    # Falsy results (None, empty strings) are dropped.
    content: list[_ContentPart] = [res for res in parsed_parts if res]

    if wrap_dicts:
        # Parsing wraps images and texts as interleaved dictionaries
        return [ConversationMessage(role=role,
                                    content=content)]  # type: ignore

    text_prompt = "\n".join(cast(list[str], content))
    placeholder_counts = mm_parser.mm_placeholder_counts()
    if placeholder_counts:
        text_prompt = _get_full_multimodal_text_prompt(placeholder_counts,
                                                       text_prompt)
    return [ConversationMessage(role=role, content=text_prompt)]
1014
+
1015
+
1016
def _parse_chat_message_content_part(
    part: ChatCompletionContentPartParam,
    mm_parser: BaseMultiModalContentParser,
    *,
    wrap_dicts: bool,
) -> Optional[_ContentPart]:
    """Parse a single content part of a conversation message.

    Plain strings are returned unchanged. For structured parts, text and
    refusal content is returned as a string, or as
    {"type": "text", "text": ...} when wrap_dicts is True. Multimodal
    content is handed to mm_parser; the return value is then
    {"type": <modality>} when wrap_dicts is True and None otherwise, so
    that the texts can later be joined with multimodal placeholders.

    Raises:
        NotImplementedError: If the part type is not handled here.
    """
    # Handle plain text parts
    if isinstance(part, str):
        return part

    # Handle structured dictionary parts
    part_type, content = _parse_chat_message_content_mm_part(part)

    # A known part type whose content could not be extracted is skipped
    # with a warning instead of failing the request.
    if content is None and part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES:
        logger.warning(
            "Skipping multimodal part '%s' (type: '%s') "
            "with empty / unparsable content.", part, part_type)
        return None

    if part_type in ("text", "refusal"):
        text = cast(str, content)
        return {'type': 'text', 'text': text} if wrap_dicts else text

    # part type -> (mm_parser method to call, modality of the wrapped dict)
    mm_handlers = {
        "image_pil": ("parse_image_pil", "image"),
        "image_url": ("parse_image", "image"),
        "image_embeds": ("parse_image_embeds", "image"),
        "audio_url": ("parse_audio", "audio"),
        "input_audio": ("parse_input_audio", "audio"),
        "video_url": ("parse_video", "video"),
    }
    handler = mm_handlers.get(part_type)
    if handler is None:
        raise NotImplementedError(f"Unknown part type: {part_type}")

    method_name, modality = handler
    getattr(mm_parser, method_name)(content)
    return {'type': modality} if wrap_dicts else None
1078
+
1079
+
1080
# These messages need no second Pydantic validation pass; `cast` only
# narrows the static type.
_AssistantParser = partial(cast, ChatCompletionAssistantMessageParam)
_ToolParser = partial(cast, ChatCompletionToolMessageParam)


def _parse_chat_message_content(
    message: ChatCompletionMessageParam,
    mm_tracker: BaseMultiModalItemTracker,
    content_format: _ChatTemplateContentFormat,
) -> list[ConversationMessage]:
    """Parse one chat message into conversation message(s).

    The content is normalised to a list of parts (None becomes an empty
    list, a plain string becomes a single text part) and parsed. The
    role-specific fields 'tool_calls' (assistant) and 'tool_call_id'
    (tool), and a string 'name', are then copied onto every result.
    """
    role = message["role"]
    raw_content = message.get("content")

    if raw_content is None:
        parts = []
    elif isinstance(raw_content, str):
        parts = [
            ChatCompletionContentPartTextParam(type="text", text=raw_content)
        ]
    else:
        parts = raw_content

    result = _parse_chat_message_content_parts(
        role,
        parts,  # type: ignore
        mm_tracker,
        wrap_dicts=(content_format == "openai"),
    )

    name = message.get("name")

    for parsed in result:
        if role == 'assistant':
            # The 'tool_calls' is not None check ensures compatibility.
            # It's needed only if downstream code doesn't strictly
            # follow the OpenAI spec.
            tool_calls = _AssistantParser(message).get("tool_calls")
            if tool_calls is not None:
                parsed["tool_calls"] = list(tool_calls)
        elif role == "tool":
            tool_msg = _ToolParser(message)
            if "tool_call_id" in tool_msg:
                parsed["tool_call_id"] = tool_msg["tool_call_id"]

        if isinstance(name, str):
            parsed["name"] = name

    return result
1125
+
1126
+
1127
def _postprocess_messages(messages: list[ConversationMessage]) -> None:
    """Convert assistant tool-call arguments from JSON strings to dicts.

    Per the Transformers docs & maintainers, tool call arguments in
    assistant-role messages with tool_calls need to be dicts, not JSON
    strings - this is how tool-use chat templates will expect them moving
    forwards. The OpenAI format delivers them as strings, so they are
    decoded here.

    The tool-call items are modified in place. Arguments that are missing,
    None, or an empty/blank string become an empty dict. Arguments that are
    not a string (e.g. already decoded by an earlier pass over the same
    objects) are left untouched, which makes the function safe to call
    more than once.

    Args:
        messages: Conversation messages to update in place.

    Raises:
        json.JSONDecodeError: If a non-empty arguments string is not
            valid JSON.
    """
    for message in messages:
        if message["role"] != "assistant":
            continue

        tool_calls = message.get("tool_calls")
        if not isinstance(tool_calls, list):
            continue

        for item in tool_calls:
            function = item["function"]
            arguments = function.get("arguments")

            if arguments is None:
                function["arguments"] = {}
            elif isinstance(arguments, (str, bytes, bytearray)):
                # A blank payload means "no arguments"; json.loads would
                # reject it.
                function["arguments"] = (json.loads(arguments)
                                         if arguments.strip() else {})
            # Anything else is assumed to be decoded already.
1140
+
1141
+
1142
def parse_chat_messages(
    messages: list[ChatCompletionMessageParam],
    model_config: ModelConfig,
    tokenizer: AnyTokenizer,
    content_format: _ChatTemplateContentFormat,
) -> tuple[list[ConversationMessage], Optional[MultiModalDataDict]]:
    """Parse chat messages into a conversation plus multimodal data.

    Each message is parsed in order with a shared MultiModalItemTracker,
    then the whole conversation is post-processed.

    Returns:
        A tuple of the parsed conversation and the tracker's
        `all_mm_data()` result.
    """
    mm_tracker = MultiModalItemTracker(model_config, tokenizer)
    conversation: list[ConversationMessage] = []

    for message in messages:
        conversation.extend(
            _parse_chat_message_content(message, mm_tracker, content_format))

    _postprocess_messages(conversation)

    mm_data = mm_tracker.all_mm_data()
    return conversation, mm_data
1163
+
1164
+
1165
def parse_chat_messages_futures(
    messages: list[ChatCompletionMessageParam],
    model_config: ModelConfig,
    tokenizer: AnyTokenizer,
    content_format: _ChatTemplateContentFormat,
) -> tuple[list[ConversationMessage], Awaitable[Optional[MultiModalDataDict]]]:
    """Parse chat messages into a conversation plus awaitable multimodal data.

    Each message is parsed in order with a shared
    AsyncMultiModalItemTracker, then the whole conversation is
    post-processed.

    Returns:
        A tuple of the parsed conversation and the tracker's
        `all_mm_data()` result.
    """
    mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer)
    conversation: list[ConversationMessage] = []

    for message in messages:
        conversation.extend(
            _parse_chat_message_content(message, mm_tracker, content_format))

    _postprocess_messages(conversation)

    mm_data_future = mm_tracker.all_mm_data()
    return conversation, mm_data_future
1186
+
1187
+
1188
@deprecate_kwargs(
    "trust_remote_code",
    additional_message="Please use `model_config.trust_remote_code` instead.",
)
def apply_hf_chat_template(
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    conversation: list[ConversationMessage],
    chat_template: Optional[str],
    tools: Optional[list[dict[str, Any]]],
    *,
    model_config: ModelConfig,
    tokenize: bool = False,  # Different from HF's default
    # Deprecated; captured explicitly so it doesn't slip into kwargs.
    trust_remote_code: Optional[bool] = None,
    **kwargs: Any,
) -> str:
    """Render a conversation with a Hugging Face tokenizer's chat template.

    The template is resolved from the explicit `chat_template`, the
    tokenizer, the tools and the model config via
    `resolve_hf_chat_template`. Extra keyword arguments are forwarded to
    `tokenizer.apply_chat_template`.

    Raises:
        ValueError: If no chat template can be resolved, or if applying
            the template raises any exception (the original exception is
            chained as the cause).
    """
    resolved_template = resolve_hf_chat_template(
        tokenizer,
        chat_template=chat_template,
        tools=tools,
        model_config=model_config,
    )

    if resolved_template is None:
        raise ValueError(
            "As of transformers v4.44, default chat template is no longer "
            "allowed, so you must provide a chat template if the tokenizer "
            "does not define one.")

    try:
        rendered = tokenizer.apply_chat_template(
            conversation=conversation,  # type: ignore[arg-type]
            tools=tools,  # type: ignore[arg-type]
            chat_template=resolved_template,
            tokenize=tokenize,
            **kwargs,
        )
    # The external library can raise arbitrary exceptions; log them for
    # investigation and surface them uniformly as ValueError.
    except Exception as exc:
        logger.exception(
            "An error occurred in `transformers` while applying chat template")
        raise ValueError(str(exc)) from exc

    return rendered
1236
+
1237
def apply_mistral_chat_template(
    tokenizer: MistralTokenizer,
    messages: list[ChatCompletionMessageParam],
    chat_template: Optional[str],
    tools: Optional[list[dict[str, Any]]],
    **kwargs: Any,
) -> list[int]:
    """Apply the Mistral tokenizer's chat template to the messages.

    Extra keyword arguments are forwarded both to
    `resolve_mistral_chat_template` and to `tokenizer.apply_chat_template`.

    Raises:
        ValueError: If applying the template raises any exception (the
            original exception is chained as the cause).
    """
    from mistral_common.exceptions import MistralCommonException

    # The return value of resolve_mistral_chat_template is always None,
    # so the call is made without using its result.
    resolve_mistral_chat_template(
        chat_template=chat_template,
        **kwargs,
    )

    try:
        token_ids = tokenizer.apply_chat_template(
            messages=messages,
            tools=tools,
            **kwargs,
        )
    # mistral-common uses assert statements to stop processing of input
    # that does not comply with the expected format. Those assertion
    # errors are converted to ValueErrors so that they are properly caught
    # in the preprocessing_input step.
    except (AssertionError, MistralCommonException) as exc:
        raise ValueError(str(exc)) from exc
    # Any other exception from the external library is logged for
    # investigation and surfaced uniformly as ValueError.
    except Exception as exc:
        logger.exception(
            "An error occurred in `mistral_common` while applying chat "
            "template")
        raise ValueError(str(exc)) from exc

    return token_ids
1276
+
1277
def random_tool_call_id() -> str:
    """Return a new tool-call id: 'chatcmpl-tool-' followed by a random UUID."""
    return "chatcmpl-tool-{}".format(random_uuid())