vllm_cpu-0.9.2.post2-cp311-cp311-manylinux_2_17_aarch64.whl

This diff shows the changes between publicly released versions of a package, as they appear in one of the supported public registries. It is provided for informational purposes only.
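
Since a wheel is an ordinary zip archive, a listing like the one below can be reproduced locally with Python's built-in zipfile module. A minimal sketch, assuming the wheel from the header has already been downloaded into the working directory (the filename is the one above; it is not fetched by this snippet):

    import zipfile

    # Wheels are standard zip archives; namelist() returns every file
    # path inside the archive, comparable to the "Files changed" index
    # below (without the per-file +/- line counts, which come from
    # diffing two versions).
    WHEEL = "vllm_cpu-0.9.2.post2-cp311-cp311-manylinux_2_17_aarch64.whl"

    with zipfile.ZipFile(WHEEL) as wf:
        for name in sorted(wf.namelist()):
            print(name)
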
Files changed (1236)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +214 -0
  3. vllm/_custom_ops.py +1915 -0
  4. vllm/_ipex_ops.py +350 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +16 -0
  8. vllm/adapter_commons/models.py +106 -0
  9. vllm/adapter_commons/request.py +26 -0
  10. vllm/adapter_commons/utils.py +93 -0
  11. vllm/adapter_commons/worker_manager.py +39 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +45 -0
  14. vllm/assets/base.py +41 -0
  15. vllm/assets/image.py +34 -0
  16. vllm/assets/video.py +139 -0
  17. vllm/attention/__init__.py +20 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +325 -0
  20. vllm/attention/backends/blocksparse_attn.py +465 -0
  21. vllm/attention/backends/cpu_mla.py +307 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1506 -0
  23. vllm/attention/backends/flash_attn.py +1008 -0
  24. vllm/attention/backends/flashinfer.py +1107 -0
  25. vllm/attention/backends/flashmla.py +244 -0
  26. vllm/attention/backends/hpu_attn.py +318 -0
  27. vllm/attention/backends/ipex_attn.py +403 -0
  28. vllm/attention/backends/mla/__init__.py +0 -0
  29. vllm/attention/backends/mla/common.py +1391 -0
  30. vllm/attention/backends/pallas.py +356 -0
  31. vllm/attention/backends/placeholder_attn.py +400 -0
  32. vllm/attention/backends/rocm_aiter_mla.py +435 -0
  33. vllm/attention/backends/rocm_flash_attn.py +1015 -0
  34. vllm/attention/backends/torch_sdpa.py +707 -0
  35. vllm/attention/backends/triton_mla.py +115 -0
  36. vllm/attention/backends/utils.py +610 -0
  37. vllm/attention/backends/xformers.py +807 -0
  38. vllm/attention/layer.py +481 -0
  39. vllm/attention/ops/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  41. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +433 -0
  42. vllm/attention/ops/blocksparse_attention/interface.py +239 -0
  43. vllm/attention/ops/blocksparse_attention/utils.py +246 -0
  44. vllm/attention/ops/chunked_prefill_paged_decode.py +368 -0
  45. vllm/attention/ops/flashmla.py +116 -0
  46. vllm/attention/ops/hpu_paged_attn.py +88 -0
  47. vllm/attention/ops/ipex_attn.py +195 -0
  48. vllm/attention/ops/merge_attn_states.py +43 -0
  49. vllm/attention/ops/nki_flash_attn.py +903 -0
  50. vllm/attention/ops/paged_attn.py +256 -0
  51. vllm/attention/ops/pallas_kv_cache_update.py +120 -0
  52. vllm/attention/ops/prefix_prefill.py +902 -0
  53. vllm/attention/ops/rocm_aiter_mla.py +100 -0
  54. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  55. vllm/attention/ops/triton_decode_attention.py +674 -0
  56. vllm/attention/ops/triton_flash_attention.py +984 -0
  57. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  58. vllm/attention/ops/triton_unified_attention.py +738 -0
  59. vllm/attention/selector.py +214 -0
  60. vllm/attention/utils/fa_utils.py +72 -0
  61. vllm/beam_search.py +87 -0
  62. vllm/benchmarks/__init__.py +0 -0
  63. vllm/benchmarks/datasets.py +1441 -0
  64. vllm/benchmarks/endpoint_request_func.py +393 -0
  65. vllm/benchmarks/latency.py +168 -0
  66. vllm/benchmarks/serve.py +1063 -0
  67. vllm/benchmarks/throughput.py +609 -0
  68. vllm/benchmarks/utils.py +70 -0
  69. vllm/collect_env.py +820 -0
  70. vllm/compilation/__init__.py +0 -0
  71. vllm/compilation/activation_quant_fusion.py +89 -0
  72. vllm/compilation/backends.py +610 -0
  73. vllm/compilation/base_piecewise_backend.py +72 -0
  74. vllm/compilation/collective_fusion.py +127 -0
  75. vllm/compilation/compiler_interface.py +564 -0
  76. vllm/compilation/counter.py +41 -0
  77. vllm/compilation/cuda_piecewise_backend.py +218 -0
  78. vllm/compilation/decorators.py +250 -0
  79. vllm/compilation/fix_functionalization.py +191 -0
  80. vllm/compilation/fusion.py +645 -0
  81. vllm/compilation/fusion_attn.py +166 -0
  82. vllm/compilation/fx_utils.py +84 -0
  83. vllm/compilation/inductor_pass.py +115 -0
  84. vllm/compilation/monitor.py +39 -0
  85. vllm/compilation/multi_output_match.py +109 -0
  86. vllm/compilation/noop_elimination.py +165 -0
  87. vllm/compilation/pass_manager.py +82 -0
  88. vllm/compilation/sequence_parallelism.py +482 -0
  89. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  90. vllm/compilation/vllm_inductor_pass.py +70 -0
  91. vllm/compilation/wrapper.py +135 -0
  92. vllm/config.py +4913 -0
  93. vllm/connections.py +174 -0
  94. vllm/core/__init__.py +0 -0
  95. vllm/core/block/__init__.py +0 -0
  96. vllm/core/block/block_table.py +399 -0
  97. vllm/core/block/common.py +371 -0
  98. vllm/core/block/cpu_gpu_block_allocator.py +441 -0
  99. vllm/core/block/interfaces.py +319 -0
  100. vllm/core/block/naive_block.py +466 -0
  101. vllm/core/block/prefix_caching_block.py +1135 -0
  102. vllm/core/block/utils.py +28 -0
  103. vllm/core/block_manager.py +525 -0
  104. vllm/core/evictor.py +157 -0
  105. vllm/core/interfaces.py +139 -0
  106. vllm/core/placeholder_block_space_manager.py +103 -0
  107. vllm/core/scheduler.py +2126 -0
  108. vllm/device_allocator/__init__.py +0 -0
  109. vllm/device_allocator/cumem.py +281 -0
  110. vllm/distributed/__init__.py +6 -0
  111. vllm/distributed/communication_op.py +41 -0
  112. vllm/distributed/device_communicators/__init__.py +0 -0
  113. vllm/distributed/device_communicators/all2all.py +264 -0
  114. vllm/distributed/device_communicators/base_device_communicator.py +260 -0
  115. vllm/distributed/device_communicators/cpu_communicator.py +145 -0
  116. vllm/distributed/device_communicators/cuda_communicator.py +194 -0
  117. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  118. vllm/distributed/device_communicators/custom_all_reduce.py +304 -0
  119. vllm/distributed/device_communicators/custom_all_reduce_utils.py +259 -0
  120. vllm/distributed/device_communicators/hpu_communicator.py +46 -0
  121. vllm/distributed/device_communicators/neuron_communicator.py +20 -0
  122. vllm/distributed/device_communicators/pynccl.py +218 -0
  123. vllm/distributed/device_communicators/pynccl_wrapper.py +349 -0
  124. vllm/distributed/device_communicators/quick_all_reduce.py +278 -0
  125. vllm/distributed/device_communicators/shm_broadcast.py +585 -0
  126. vllm/distributed/device_communicators/tpu_communicator.py +103 -0
  127. vllm/distributed/device_communicators/xpu_communicator.py +55 -0
  128. vllm/distributed/eplb/__init__.py +8 -0
  129. vllm/distributed/eplb/eplb_state.py +432 -0
  130. vllm/distributed/eplb/rebalance_algo.py +234 -0
  131. vllm/distributed/eplb/rebalance_execute.py +307 -0
  132. vllm/distributed/kv_events.py +356 -0
  133. vllm/distributed/kv_transfer/README.md +29 -0
  134. vllm/distributed/kv_transfer/__init__.py +12 -0
  135. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  136. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  137. vllm/distributed/kv_transfer/kv_connector/base.py +128 -0
  138. vllm/distributed/kv_transfer/kv_connector/factory.py +133 -0
  139. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +99 -0
  140. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +203 -0
  141. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +329 -0
  142. vllm/distributed/kv_transfer/kv_connector/utils.py +109 -0
  143. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  144. vllm/distributed/kv_transfer/kv_connector/v1/base.py +283 -0
  145. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +167 -0
  146. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +201 -0
  147. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1103 -0
  148. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  149. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +485 -0
  150. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +533 -0
  151. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +265 -0
  152. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +389 -0
  153. vllm/distributed/kv_transfer/kv_connector_agent.py +77 -0
  154. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  155. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  156. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  157. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  158. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  159. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  160. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +290 -0
  161. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  162. vllm/distributed/kv_transfer/kv_transfer_state.py +71 -0
  163. vllm/distributed/parallel_state.py +1385 -0
  164. vllm/distributed/tpu_distributed_utils.py +178 -0
  165. vllm/distributed/utils.py +536 -0
  166. vllm/engine/__init__.py +0 -0
  167. vllm/engine/arg_utils.py +1801 -0
  168. vllm/engine/async_llm_engine.py +1200 -0
  169. vllm/engine/async_timeout.py +173 -0
  170. vllm/engine/llm_engine.py +2101 -0
  171. vllm/engine/metrics.py +629 -0
  172. vllm/engine/metrics_types.py +94 -0
  173. vllm/engine/multiprocessing/__init__.py +148 -0
  174. vllm/engine/multiprocessing/client.py +681 -0
  175. vllm/engine/multiprocessing/engine.py +460 -0
  176. vllm/engine/output_processor/__init__.py +0 -0
  177. vllm/engine/output_processor/interfaces.py +75 -0
  178. vllm/engine/output_processor/multi_step.py +216 -0
  179. vllm/engine/output_processor/single_step.py +145 -0
  180. vllm/engine/output_processor/stop_checker.py +131 -0
  181. vllm/engine/output_processor/util.py +28 -0
  182. vllm/engine/protocol.py +326 -0
  183. vllm/entrypoints/__init__.py +0 -0
  184. vllm/entrypoints/api_server.py +178 -0
  185. vllm/entrypoints/chat_utils.py +1278 -0
  186. vllm/entrypoints/cli/__init__.py +12 -0
  187. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  188. vllm/entrypoints/cli/benchmark/base.py +25 -0
  189. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  190. vllm/entrypoints/cli/benchmark/main.py +58 -0
  191. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  192. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  193. vllm/entrypoints/cli/collect_env.py +36 -0
  194. vllm/entrypoints/cli/main.py +71 -0
  195. vllm/entrypoints/cli/openai.py +201 -0
  196. vllm/entrypoints/cli/run_batch.py +69 -0
  197. vllm/entrypoints/cli/serve.py +265 -0
  198. vllm/entrypoints/cli/types.py +29 -0
  199. vllm/entrypoints/launcher.py +147 -0
  200. vllm/entrypoints/llm.py +1599 -0
  201. vllm/entrypoints/logger.py +50 -0
  202. vllm/entrypoints/openai/__init__.py +0 -0
  203. vllm/entrypoints/openai/api_server.py +1495 -0
  204. vllm/entrypoints/openai/cli_args.py +331 -0
  205. vllm/entrypoints/openai/logits_processors.py +90 -0
  206. vllm/entrypoints/openai/protocol.py +2096 -0
  207. vllm/entrypoints/openai/run_batch.py +473 -0
  208. vllm/entrypoints/openai/serving_chat.py +1258 -0
  209. vllm/entrypoints/openai/serving_classification.py +160 -0
  210. vllm/entrypoints/openai/serving_completion.py +618 -0
  211. vllm/entrypoints/openai/serving_embedding.py +201 -0
  212. vllm/entrypoints/openai/serving_engine.py +988 -0
  213. vllm/entrypoints/openai/serving_models.py +315 -0
  214. vllm/entrypoints/openai/serving_pooling.py +234 -0
  215. vllm/entrypoints/openai/serving_score.py +431 -0
  216. vllm/entrypoints/openai/serving_tokenization.py +157 -0
  217. vllm/entrypoints/openai/serving_transcription.py +132 -0
  218. vllm/entrypoints/openai/speech_to_text.py +395 -0
  219. vllm/entrypoints/openai/tool_parsers/__init__.py +25 -0
  220. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  221. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  222. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  223. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  224. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +371 -0
  225. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  226. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  227. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  228. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +267 -0
  229. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +369 -0
  230. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  231. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  232. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  233. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  234. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +466 -0
  235. vllm/entrypoints/score_utils.py +50 -0
  236. vllm/entrypoints/ssl.py +75 -0
  237. vllm/entrypoints/utils.py +262 -0
  238. vllm/env_override.py +41 -0
  239. vllm/envs.py +1029 -0
  240. vllm/executor/__init__.py +0 -0
  241. vllm/executor/executor_base.py +401 -0
  242. vllm/executor/mp_distributed_executor.py +244 -0
  243. vllm/executor/msgspec_utils.py +30 -0
  244. vllm/executor/multiproc_worker_utils.py +313 -0
  245. vllm/executor/ray_distributed_executor.py +701 -0
  246. vllm/executor/ray_utils.py +399 -0
  247. vllm/executor/uniproc_executor.py +139 -0
  248. vllm/forward_context.py +185 -0
  249. vllm/inputs/__init__.py +41 -0
  250. vllm/inputs/data.py +331 -0
  251. vllm/inputs/parse.py +151 -0
  252. vllm/inputs/preprocess.py +924 -0
  253. vllm/inputs/registry.py +245 -0
  254. vllm/jsontree.py +80 -0
  255. vllm/logger.py +212 -0
  256. vllm/logging_utils/__init__.py +8 -0
  257. vllm/logging_utils/dump_input.py +81 -0
  258. vllm/logging_utils/formatter.py +18 -0
  259. vllm/logits_process.py +119 -0
  260. vllm/lora/__init__.py +0 -0
  261. vllm/lora/fully_sharded_layers.py +355 -0
  262. vllm/lora/layers.py +1285 -0
  263. vllm/lora/lora.py +199 -0
  264. vllm/lora/models.py +818 -0
  265. vllm/lora/ops/__init__.py +0 -0
  266. vllm/lora/ops/torch_ops/__init__.py +16 -0
  267. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  268. vllm/lora/ops/triton_ops/__init__.py +12 -0
  269. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  270. vllm/lora/ops/triton_ops/lora_expand_op.py +290 -0
  271. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  272. vllm/lora/ops/triton_ops/lora_shrink_op.py +244 -0
  273. vllm/lora/ops/triton_ops/utils.py +120 -0
  274. vllm/lora/ops/xla_ops/__init__.py +7 -0
  275. vllm/lora/ops/xla_ops/lora_ops.py +145 -0
  276. vllm/lora/peft_helper.py +136 -0
  277. vllm/lora/punica_wrapper/__init__.py +10 -0
  278. vllm/lora/punica_wrapper/punica_base.py +485 -0
  279. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  280. vllm/lora/punica_wrapper/punica_gpu.py +290 -0
  281. vllm/lora/punica_wrapper/punica_hpu.py +145 -0
  282. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  283. vllm/lora/punica_wrapper/punica_tpu.py +405 -0
  284. vllm/lora/punica_wrapper/utils.py +164 -0
  285. vllm/lora/request.py +99 -0
  286. vllm/lora/resolver.py +85 -0
  287. vllm/lora/utils.py +240 -0
  288. vllm/lora/worker_manager.py +256 -0
  289. vllm/model_executor/__init__.py +16 -0
  290. vllm/model_executor/custom_op.py +208 -0
  291. vllm/model_executor/guided_decoding/__init__.py +181 -0
  292. vllm/model_executor/guided_decoding/guidance_decoding.py +63 -0
  293. vllm/model_executor/guided_decoding/guidance_logits_processors.py +104 -0
  294. vllm/model_executor/guided_decoding/guided_fields.py +41 -0
  295. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +67 -0
  296. vllm/model_executor/guided_decoding/outlines_decoding.py +155 -0
  297. vllm/model_executor/guided_decoding/outlines_logits_processors.py +284 -0
  298. vllm/model_executor/guided_decoding/utils.py +242 -0
  299. vllm/model_executor/guided_decoding/xgrammar_decoding.py +426 -0
  300. vllm/model_executor/layers/__init__.py +0 -0
  301. vllm/model_executor/layers/activation.py +420 -0
  302. vllm/model_executor/layers/fused_moe/__init__.py +78 -0
  303. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +298 -0
  304. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +140 -0
  305. vllm/model_executor/layers/fused_moe/config.py +456 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  475. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +215 -0
  476. vllm/model_executor/layers/fused_moe/cutlass_moe.py +645 -0
  477. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +250 -0
  478. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +231 -0
  479. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +183 -0
  480. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1021 -0
  481. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +234 -0
  482. vllm/model_executor/layers/fused_moe/fused_moe.py +1734 -0
  483. vllm/model_executor/layers/fused_moe/layer.py +1528 -0
  484. vllm/model_executor/layers/fused_moe/modular_kernel.py +598 -0
  485. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +224 -0
  486. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  487. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +190 -0
  488. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  489. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +233 -0
  490. vllm/model_executor/layers/fused_moe/prepare_finalize.py +66 -0
  491. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +429 -0
  492. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +136 -0
  493. vllm/model_executor/layers/fused_moe/utils.py +144 -0
  494. vllm/model_executor/layers/layernorm.py +287 -0
  495. vllm/model_executor/layers/lightning_attn.py +652 -0
  496. vllm/model_executor/layers/linear.py +1547 -0
  497. vllm/model_executor/layers/logits_processor.py +197 -0
  498. vllm/model_executor/layers/mamba/__init__.py +0 -0
  499. vllm/model_executor/layers/mamba/mamba2_metadata.py +125 -0
  500. vllm/model_executor/layers/mamba/mamba_mixer.py +245 -0
  501. vllm/model_executor/layers/mamba/mamba_mixer2.py +731 -0
  502. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  503. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +105 -0
  504. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  505. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +262 -0
  506. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +589 -0
  507. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +751 -0
  508. vllm/model_executor/layers/mamba/ops/ssd_combined.py +232 -0
  509. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +206 -0
  510. vllm/model_executor/layers/pooler.py +473 -0
  511. vllm/model_executor/layers/quantization/__init__.py +160 -0
  512. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  513. vllm/model_executor/layers/quantization/auto_round.py +310 -0
  514. vllm/model_executor/layers/quantization/awq.py +228 -0
  515. vllm/model_executor/layers/quantization/awq_marlin.py +523 -0
  516. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  517. vllm/model_executor/layers/quantization/base_config.py +164 -0
  518. vllm/model_executor/layers/quantization/bitblas.py +462 -0
  519. vllm/model_executor/layers/quantization/bitsandbytes.py +396 -0
  520. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  521. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +694 -0
  522. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1613 -0
  523. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +24 -0
  524. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +358 -0
  525. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  526. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  527. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +105 -0
  528. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +149 -0
  529. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  530. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +150 -0
  531. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  532. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  533. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  534. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  535. vllm/model_executor/layers/quantization/deepgemm.py +83 -0
  536. vllm/model_executor/layers/quantization/deepspeedfp.py +195 -0
  537. vllm/model_executor/layers/quantization/experts_int8.py +204 -0
  538. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  539. vllm/model_executor/layers/quantization/fp8.py +950 -0
  540. vllm/model_executor/layers/quantization/gguf.py +577 -0
  541. vllm/model_executor/layers/quantization/gptq.py +278 -0
  542. vllm/model_executor/layers/quantization/gptq_bitblas.py +446 -0
  543. vllm/model_executor/layers/quantization/gptq_marlin.py +679 -0
  544. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  545. vllm/model_executor/layers/quantization/hqq_marlin.py +332 -0
  546. vllm/model_executor/layers/quantization/ipex_quant.py +250 -0
  547. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  548. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +90 -0
  549. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +83 -0
  550. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  551. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +300 -0
  552. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  553. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +132 -0
  554. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +131 -0
  555. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  556. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +87 -0
  557. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +120 -0
  558. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  559. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  560. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +105 -0
  561. vllm/model_executor/layers/quantization/kv_cache.py +139 -0
  562. vllm/model_executor/layers/quantization/marlin.py +263 -0
  563. vllm/model_executor/layers/quantization/modelopt.py +747 -0
  564. vllm/model_executor/layers/quantization/moe_wna16.py +457 -0
  565. vllm/model_executor/layers/quantization/neuron_quant.py +76 -0
  566. vllm/model_executor/layers/quantization/ptpc_fp8.py +127 -0
  567. vllm/model_executor/layers/quantization/qqq.py +275 -0
  568. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  569. vllm/model_executor/layers/quantization/quark/quark.py +437 -0
  570. vllm/model_executor/layers/quantization/quark/quark_moe.py +245 -0
  571. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  572. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  573. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +126 -0
  574. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +157 -0
  575. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  576. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  577. vllm/model_executor/layers/quantization/rtn.py +289 -0
  578. vllm/model_executor/layers/quantization/schema.py +86 -0
  579. vllm/model_executor/layers/quantization/torchao.py +212 -0
  580. vllm/model_executor/layers/quantization/tpu_int8.py +121 -0
  581. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  582. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  583. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +208 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  763. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  764. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  765. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  766. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  767. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  768. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  769. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  770. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  771. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  772. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  773. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  774. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  775. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  776. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  777. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  778. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  779. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  780. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  781. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  786. vllm/model_executor/layers/quantization/utils/fp8_utils.py +653 -0
  787. vllm/model_executor/layers/quantization/utils/gptq_utils.py +95 -0
  788. vllm/model_executor/layers/quantization/utils/int8_utils.py +485 -0
  789. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  790. vllm/model_executor/layers/quantization/utils/machete_utils.py +50 -0
  791. vllm/model_executor/layers/quantization/utils/marlin_utils.py +476 -0
  792. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +283 -0
  793. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +325 -0
  794. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  795. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  796. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +126 -0
  797. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +45 -0
  798. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +146 -0
  799. vllm/model_executor/layers/quantization/utils/quant_utils.py +573 -0
  800. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +405 -0
  801. vllm/model_executor/layers/rejection_sampler.py +406 -0
  802. vllm/model_executor/layers/resampler.py +270 -0
  803. vllm/model_executor/layers/rotary_embedding.py +2025 -0
  804. vllm/model_executor/layers/sampler.py +1204 -0
  805. vllm/model_executor/layers/spec_decode_base_sampler.py +259 -0
  806. vllm/model_executor/layers/typical_acceptance_sampler.py +166 -0
  807. vllm/model_executor/layers/utils.py +116 -0
  808. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  809. vllm/model_executor/model_loader/__init__.py +77 -0
  810. vllm/model_executor/model_loader/base_loader.py +43 -0
  811. vllm/model_executor/model_loader/bitsandbytes_loader.py +613 -0
  812. vllm/model_executor/model_loader/default_loader.py +282 -0
  813. vllm/model_executor/model_loader/dummy_loader.py +27 -0
  814. vllm/model_executor/model_loader/gguf_loader.py +120 -0
  815. vllm/model_executor/model_loader/neuron.py +476 -0
  816. vllm/model_executor/model_loader/neuronx_distributed.py +685 -0
  817. vllm/model_executor/model_loader/runai_streamer_loader.py +109 -0
  818. vllm/model_executor/model_loader/sharded_state_loader.py +201 -0
  819. vllm/model_executor/model_loader/tensorizer.py +602 -0
  820. vllm/model_executor/model_loader/tensorizer_loader.py +127 -0
  821. vllm/model_executor/model_loader/tpu.py +113 -0
  822. vllm/model_executor/model_loader/utils.py +315 -0
  823. vllm/model_executor/model_loader/weight_utils.py +782 -0
  824. vllm/model_executor/models/__init__.py +30 -0
  825. vllm/model_executor/models/adapters.py +375 -0
  826. vllm/model_executor/models/aimv2.py +246 -0
  827. vllm/model_executor/models/arctic.py +559 -0
  828. vllm/model_executor/models/aria.py +670 -0
  829. vllm/model_executor/models/aya_vision.py +486 -0
  830. vllm/model_executor/models/baichuan.py +474 -0
  831. vllm/model_executor/models/bamba.py +558 -0
  832. vllm/model_executor/models/bart.py +938 -0
  833. vllm/model_executor/models/bert.py +513 -0
  834. vllm/model_executor/models/bert_with_rope.py +617 -0
  835. vllm/model_executor/models/blip.py +339 -0
  836. vllm/model_executor/models/blip2.py +728 -0
  837. vllm/model_executor/models/bloom.py +373 -0
  838. vllm/model_executor/models/chameleon.py +1146 -0
  839. vllm/model_executor/models/chatglm.py +478 -0
  840. vllm/model_executor/models/clip.py +407 -0
  841. vllm/model_executor/models/commandr.py +471 -0
  842. vllm/model_executor/models/config.py +200 -0
  843. vllm/model_executor/models/constant_size_cache.py +137 -0
  844. vllm/model_executor/models/dbrx.py +472 -0
  845. vllm/model_executor/models/deepseek.py +486 -0
  846. vllm/model_executor/models/deepseek_mtp.py +281 -0
  847. vllm/model_executor/models/deepseek_v2.py +935 -0
  848. vllm/model_executor/models/deepseek_vl2.py +660 -0
  849. vllm/model_executor/models/dots1.py +536 -0
  850. vllm/model_executor/models/eagle.py +261 -0
  851. vllm/model_executor/models/ernie45.py +43 -0
  852. vllm/model_executor/models/ernie45_moe.py +583 -0
  853. vllm/model_executor/models/exaone.py +551 -0
  854. vllm/model_executor/models/fairseq2_llama.py +154 -0
  855. vllm/model_executor/models/falcon.py +510 -0
  856. vllm/model_executor/models/falcon_h1.py +708 -0
  857. vllm/model_executor/models/florence2.py +1113 -0
  858. vllm/model_executor/models/fuyu.py +406 -0
  859. vllm/model_executor/models/gemma.py +427 -0
  860. vllm/model_executor/models/gemma2.py +427 -0
  861. vllm/model_executor/models/gemma3.py +535 -0
  862. vllm/model_executor/models/gemma3_mm.py +729 -0
  863. vllm/model_executor/models/gemma3n.py +811 -0
  864. vllm/model_executor/models/glm.py +23 -0
  865. vllm/model_executor/models/glm4.py +305 -0
  866. vllm/model_executor/models/glm4_1v.py +1590 -0
  867. vllm/model_executor/models/glm4v.py +657 -0
  868. vllm/model_executor/models/gpt2.py +382 -0
  869. vllm/model_executor/models/gpt_bigcode.py +335 -0
  870. vllm/model_executor/models/gpt_j.py +339 -0
  871. vllm/model_executor/models/gpt_neox.py +332 -0
  872. vllm/model_executor/models/granite.py +493 -0
  873. vllm/model_executor/models/granite_speech.py +790 -0
  874. vllm/model_executor/models/granitemoe.py +437 -0
  875. vllm/model_executor/models/granitemoehybrid.py +653 -0
  876. vllm/model_executor/models/granitemoeshared.py +341 -0
  877. vllm/model_executor/models/gritlm.py +224 -0
  878. vllm/model_executor/models/grok1.py +546 -0
  879. vllm/model_executor/models/h2ovl.py +549 -0
  880. vllm/model_executor/models/hunyuan_v1_moe.py +897 -0
  881. vllm/model_executor/models/idefics2_vision_model.py +389 -0
  882. vllm/model_executor/models/idefics3.py +786 -0
  883. vllm/model_executor/models/interfaces.py +681 -0
  884. vllm/model_executor/models/interfaces_base.py +164 -0
  885. vllm/model_executor/models/intern_vit.py +480 -0
  886. vllm/model_executor/models/internlm2.py +455 -0
  887. vllm/model_executor/models/internlm2_ve.py +147 -0
  888. vllm/model_executor/models/internvl.py +1432 -0
  889. vllm/model_executor/models/jais.py +373 -0
  890. vllm/model_executor/models/jamba.py +592 -0
  891. vllm/model_executor/models/keye.py +1736 -0
  892. vllm/model_executor/models/kimi_vl.py +585 -0
  893. vllm/model_executor/models/llama.py +644 -0
  894. vllm/model_executor/models/llama4.py +531 -0
  895. vllm/model_executor/models/llama_eagle.py +165 -0
  896. vllm/model_executor/models/llama_eagle3.py +263 -0
  897. vllm/model_executor/models/llava.py +887 -0
  898. vllm/model_executor/models/llava_next.py +604 -0
  899. vllm/model_executor/models/llava_next_video.py +492 -0
  900. vllm/model_executor/models/llava_onevision.py +985 -0
  901. vllm/model_executor/models/mamba.py +273 -0
  902. vllm/model_executor/models/mamba2.py +320 -0
  903. vllm/model_executor/models/mamba_cache.py +76 -0
  904. vllm/model_executor/models/medusa.py +219 -0
  905. vllm/model_executor/models/mimo.py +192 -0
  906. vllm/model_executor/models/mimo_mtp.py +285 -0
  907. vllm/model_executor/models/minicpm.py +592 -0
  908. vllm/model_executor/models/minicpm3.py +230 -0
  909. vllm/model_executor/models/minicpm_eagle.py +391 -0
  910. vllm/model_executor/models/minicpmo.py +772 -0
  911. vllm/model_executor/models/minicpmv.py +1307 -0
  912. vllm/model_executor/models/minimax_cache.py +36 -0
  913. vllm/model_executor/models/minimax_text_01.py +1301 -0
  914. vllm/model_executor/models/minimax_vl_01.py +374 -0
  915. vllm/model_executor/models/mistral3.py +624 -0
  916. vllm/model_executor/models/mixtral.py +488 -0
  917. vllm/model_executor/models/mixtral_quant.py +453 -0
  918. vllm/model_executor/models/mllama.py +1682 -0
  919. vllm/model_executor/models/mllama4.py +947 -0
  920. vllm/model_executor/models/mlp_speculator.py +206 -0
  921. vllm/model_executor/models/modernbert.py +339 -0
  922. vllm/model_executor/models/module_mapping.py +72 -0
  923. vllm/model_executor/models/molmo.py +1576 -0
  924. vllm/model_executor/models/moonvit.py +630 -0
  925. vllm/model_executor/models/mpt.py +331 -0
  926. vllm/model_executor/models/nemotron.py +508 -0
  927. vllm/model_executor/models/nemotron_h.py +588 -0
  928. vllm/model_executor/models/nemotron_nas.py +484 -0
  929. vllm/model_executor/models/nvlm_d.py +216 -0
  930. vllm/model_executor/models/olmo.py +389 -0
  931. vllm/model_executor/models/olmo2.py +414 -0
  932. vllm/model_executor/models/olmoe.py +468 -0
  933. vllm/model_executor/models/opt.py +412 -0
  934. vllm/model_executor/models/orion.py +349 -0
  935. vllm/model_executor/models/ovis.py +577 -0
  936. vllm/model_executor/models/paligemma.py +419 -0
  937. vllm/model_executor/models/persimmon.py +344 -0
  938. vllm/model_executor/models/phi.py +356 -0
  939. vllm/model_executor/models/phi3.py +19 -0
  940. vllm/model_executor/models/phi3_small.py +465 -0
  941. vllm/model_executor/models/phi3v.py +733 -0
  942. vllm/model_executor/models/phi4mm.py +1258 -0
  943. vllm/model_executor/models/phi4mm_audio.py +1233 -0
  944. vllm/model_executor/models/phi4mm_utils.py +1884 -0
  945. vllm/model_executor/models/phimoe.py +674 -0
  946. vllm/model_executor/models/pixtral.py +1329 -0
  947. vllm/model_executor/models/plamo2.py +738 -0
  948. vllm/model_executor/models/prithvi_geospatial_mae.py +240 -0
  949. vllm/model_executor/models/qwen.py +362 -0
  950. vllm/model_executor/models/qwen2.py +501 -0
  951. vllm/model_executor/models/qwen2_5_omni_thinker.py +923 -0
  952. vllm/model_executor/models/qwen2_5_vl.py +1175 -0
  953. vllm/model_executor/models/qwen2_audio.py +420 -0
  954. vllm/model_executor/models/qwen2_moe.py +540 -0
  955. vllm/model_executor/models/qwen2_rm.py +122 -0
  956. vllm/model_executor/models/qwen2_vl.py +1513 -0
  957. vllm/model_executor/models/qwen3.py +325 -0
  958. vllm/model_executor/models/qwen3_moe.py +541 -0
  959. vllm/model_executor/models/qwen_vl.py +796 -0
  960. vllm/model_executor/models/registry.py +634 -0
  961. vllm/model_executor/models/roberta.py +271 -0
  962. vllm/model_executor/models/siglip.py +524 -0
  963. vllm/model_executor/models/skyworkr1v.py +961 -0
  964. vllm/model_executor/models/smolvlm.py +52 -0
  965. vllm/model_executor/models/solar.py +506 -0
  966. vllm/model_executor/models/stablelm.py +343 -0
  967. vllm/model_executor/models/starcoder2.py +356 -0
  968. vllm/model_executor/models/tarsier.py +652 -0
  969. vllm/model_executor/models/telechat2.py +140 -0
  970. vllm/model_executor/models/teleflm.py +79 -0
  971. vllm/model_executor/models/transformers.py +509 -0
  972. vllm/model_executor/models/ultravox.py +670 -0
  973. vllm/model_executor/models/utils.py +744 -0
  974. vllm/model_executor/models/vision.py +147 -0
  975. vllm/model_executor/models/whisper.py +886 -0
  976. vllm/model_executor/models/zamba2.py +1036 -0
  977. vllm/model_executor/parameter.py +459 -0
  978. vllm/model_executor/pooling_metadata.py +72 -0
  979. vllm/model_executor/sampling_metadata.py +597 -0
  980. vllm/model_executor/utils.py +80 -0
  981. vllm/multimodal/__init__.py +33 -0
  982. vllm/multimodal/audio.py +116 -0
  983. vllm/multimodal/base.py +219 -0
  984. vllm/multimodal/hasher.py +91 -0
  985. vllm/multimodal/image.py +103 -0
  986. vllm/multimodal/inputs.py +878 -0
  987. vllm/multimodal/parse.py +499 -0
  988. vllm/multimodal/processing.py +1948 -0
  989. vllm/multimodal/profiling.py +283 -0
  990. vllm/multimodal/registry.py +331 -0
  991. vllm/multimodal/utils.py +492 -0
  992. vllm/multimodal/video.py +227 -0
  993. vllm/outputs.py +516 -0
  994. vllm/platforms/__init__.py +291 -0
  995. vllm/platforms/cpu.py +281 -0
  996. vllm/platforms/cuda.py +568 -0
  997. vllm/platforms/hpu.py +106 -0
  998. vllm/platforms/interface.py +551 -0
  999. vllm/platforms/neuron.py +150 -0
  1000. vllm/platforms/rocm.py +453 -0
  1001. vllm/platforms/tpu.py +206 -0
  1002. vllm/platforms/xpu.py +192 -0
  1003. vllm/plugins/__init__.py +94 -0
  1004. vllm/plugins/lora_resolvers/README.md +15 -0
  1005. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1006. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  1007. vllm/pooling_params.py +64 -0
  1008. vllm/profiler/__init__.py +0 -0
  1009. vllm/profiler/layerwise_profile.py +375 -0
  1010. vllm/profiler/utils.py +148 -0
  1011. vllm/prompt_adapter/__init__.py +0 -0
  1012. vllm/prompt_adapter/layers.py +83 -0
  1013. vllm/prompt_adapter/models.py +358 -0
  1014. vllm/prompt_adapter/request.py +37 -0
  1015. vllm/prompt_adapter/utils.py +98 -0
  1016. vllm/prompt_adapter/worker_manager.py +179 -0
  1017. vllm/py.typed +2 -0
  1018. vllm/reasoning/__init__.py +15 -0
  1019. vllm/reasoning/abs_reasoning_parsers.py +192 -0
  1020. vllm/reasoning/deepseek_r1_reasoning_parser.py +173 -0
  1021. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1022. vllm/reasoning/qwen3_reasoning_parser.py +151 -0
  1023. vllm/sampling_params.py +602 -0
  1024. vllm/scalar_type.py +347 -0
  1025. vllm/scripts.py +15 -0
  1026. vllm/sequence.py +1568 -0
  1027. vllm/spec_decode/__init__.py +0 -0
  1028. vllm/spec_decode/batch_expansion.py +506 -0
  1029. vllm/spec_decode/draft_model_runner.py +349 -0
  1030. vllm/spec_decode/interfaces.py +99 -0
  1031. vllm/spec_decode/medusa_worker.py +138 -0
  1032. vllm/spec_decode/metrics.py +213 -0
  1033. vllm/spec_decode/mlp_speculator_worker.py +94 -0
  1034. vllm/spec_decode/mqa_scorer.py +160 -0
  1035. vllm/spec_decode/multi_step_worker.py +423 -0
  1036. vllm/spec_decode/ngram_worker.py +196 -0
  1037. vllm/spec_decode/proposer_worker_base.py +59 -0
  1038. vllm/spec_decode/smaller_tp_proposer_worker.py +196 -0
  1039. vllm/spec_decode/spec_decode_worker.py +1326 -0
  1040. vllm/spec_decode/target_model_runner.py +45 -0
  1041. vllm/spec_decode/top1_proposer.py +275 -0
  1042. vllm/spec_decode/util.py +277 -0
  1043. vllm/test_utils.py +130 -0
  1044. vllm/third_party/__init__.py +0 -0
  1045. vllm/third_party/pynvml.py +6140 -0
  1046. vllm/tracing.py +131 -0
  1047. vllm/transformers_utils/__init__.py +24 -0
  1048. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1049. vllm/transformers_utils/chat_templates/registry.py +60 -0
  1050. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1051. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1052. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1053. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1054. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1055. vllm/transformers_utils/config.py +922 -0
  1056. vllm/transformers_utils/configs/__init__.py +57 -0
  1057. vllm/transformers_utils/configs/arctic.py +207 -0
  1058. vllm/transformers_utils/configs/chatglm.py +72 -0
  1059. vllm/transformers_utils/configs/cohere2.py +195 -0
  1060. vllm/transformers_utils/configs/dbrx.py +280 -0
  1061. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1062. vllm/transformers_utils/configs/eagle.py +85 -0
  1063. vllm/transformers_utils/configs/exaone.py +190 -0
  1064. vllm/transformers_utils/configs/falcon.py +90 -0
  1065. vllm/transformers_utils/configs/jais.py +238 -0
  1066. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1067. vllm/transformers_utils/configs/medusa.py +63 -0
  1068. vllm/transformers_utils/configs/minimax_text_01.py +70 -0
  1069. vllm/transformers_utils/configs/minimax_vl_01.py +71 -0
  1070. vllm/transformers_utils/configs/mllama.py +31 -0
  1071. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1072. vllm/transformers_utils/configs/moonvit.py +33 -0
  1073. vllm/transformers_utils/configs/mpt.py +180 -0
  1074. vllm/transformers_utils/configs/nemotron.py +205 -0
  1075. vllm/transformers_utils/configs/nemotron_h.py +259 -0
  1076. vllm/transformers_utils/configs/nvlm_d.py +31 -0
  1077. vllm/transformers_utils/configs/ovis.py +184 -0
  1078. vllm/transformers_utils/configs/skyworkr1v.py +54 -0
  1079. vllm/transformers_utils/configs/solar.py +247 -0
  1080. vllm/transformers_utils/configs/telechat2.py +64 -0
  1081. vllm/transformers_utils/configs/ultravox.py +108 -0
  1082. vllm/transformers_utils/detokenizer.py +168 -0
  1083. vllm/transformers_utils/detokenizer_utils.py +189 -0
  1084. vllm/transformers_utils/processor.py +221 -0
  1085. vllm/transformers_utils/processors/__init__.py +8 -0
  1086. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  1087. vllm/transformers_utils/processors/ovis.py +420 -0
  1088. vllm/transformers_utils/s3_utils.py +162 -0
  1089. vllm/transformers_utils/tokenizer.py +302 -0
  1090. vllm/transformers_utils/tokenizer_base.py +149 -0
  1091. vllm/transformers_utils/tokenizer_group.py +120 -0
  1092. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1093. vllm/transformers_utils/tokenizers/mistral.py +493 -0
  1094. vllm/transformers_utils/utils.py +99 -0
  1095. vllm/triton_utils/__init__.py +14 -0
  1096. vllm/triton_utils/importing.py +94 -0
  1097. vllm/usage/__init__.py +0 -0
  1098. vllm/usage/usage_lib.py +259 -0
  1099. vllm/utils/__init__.py +3008 -0
  1100. vllm/v1/__init__.py +0 -0
  1101. vllm/v1/attention/__init__.py +0 -0
  1102. vllm/v1/attention/backends/__init__.py +0 -0
  1103. vllm/v1/attention/backends/cpu_attn.py +184 -0
  1104. vllm/v1/attention/backends/flash_attn.py +757 -0
  1105. vllm/v1/attention/backends/flashinfer.py +680 -0
  1106. vllm/v1/attention/backends/flex_attention.py +491 -0
  1107. vllm/v1/attention/backends/mamba_attn.py +192 -0
  1108. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1109. vllm/v1/attention/backends/mla/common.py +978 -0
  1110. vllm/v1/attention/backends/mla/cutlass_mla.py +98 -0
  1111. vllm/v1/attention/backends/mla/flashmla.py +180 -0
  1112. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +241 -0
  1113. vllm/v1/attention/backends/mla/triton_mla.py +177 -0
  1114. vllm/v1/attention/backends/pallas.py +320 -0
  1115. vllm/v1/attention/backends/rocm_aiter_fa.py +609 -0
  1116. vllm/v1/attention/backends/triton_attn.py +449 -0
  1117. vllm/v1/attention/backends/utils.py +310 -0
  1118. vllm/v1/core/__init__.py +0 -0
  1119. vllm/v1/core/block_pool.py +349 -0
  1120. vllm/v1/core/encoder_cache_manager.py +254 -0
  1121. vllm/v1/core/kv_cache_coordinator.py +369 -0
  1122. vllm/v1/core/kv_cache_manager.py +398 -0
  1123. vllm/v1/core/kv_cache_utils.py +999 -0
  1124. vllm/v1/core/sched/__init__.py +0 -0
  1125. vllm/v1/core/sched/interface.py +150 -0
  1126. vllm/v1/core/sched/output.py +157 -0
  1127. vllm/v1/core/sched/request_queue.py +224 -0
  1128. vllm/v1/core/sched/scheduler.py +1115 -0
  1129. vllm/v1/core/sched/utils.py +36 -0
  1130. vllm/v1/core/single_type_kv_cache_manager.py +444 -0
  1131. vllm/v1/engine/__init__.py +179 -0
  1132. vllm/v1/engine/async_llm.py +626 -0
  1133. vllm/v1/engine/coordinator.py +278 -0
  1134. vllm/v1/engine/core.py +1046 -0
  1135. vllm/v1/engine/core_client.py +1049 -0
  1136. vllm/v1/engine/detokenizer.py +292 -0
  1137. vllm/v1/engine/exceptions.py +17 -0
  1138. vllm/v1/engine/llm_engine.py +322 -0
  1139. vllm/v1/engine/logprobs.py +200 -0
  1140. vllm/v1/engine/mm_input_cache.py +91 -0
  1141. vllm/v1/engine/output_processor.py +477 -0
  1142. vllm/v1/engine/parallel_sampling.py +133 -0
  1143. vllm/v1/engine/processor.py +422 -0
  1144. vllm/v1/engine/utils.py +546 -0
  1145. vllm/v1/executor/__init__.py +0 -0
  1146. vllm/v1/executor/abstract.py +113 -0
  1147. vllm/v1/executor/multiproc_executor.py +532 -0
  1148. vllm/v1/executor/ray_distributed_executor.py +62 -0
  1149. vllm/v1/kv_cache_interface.py +223 -0
  1150. vllm/v1/metrics/__init__.py +0 -0
  1151. vllm/v1/metrics/loggers.py +557 -0
  1152. vllm/v1/metrics/prometheus.py +82 -0
  1153. vllm/v1/metrics/ray_wrappers.py +131 -0
  1154. vllm/v1/metrics/reader.py +246 -0
  1155. vllm/v1/metrics/stats.py +240 -0
  1156. vllm/v1/outputs.py +124 -0
  1157. vllm/v1/pool/__init__.py +0 -0
  1158. vllm/v1/pool/metadata.py +17 -0
  1159. vllm/v1/request.py +229 -0
  1160. vllm/v1/sample/__init__.py +0 -0
  1161. vllm/v1/sample/logits_processor.py +517 -0
  1162. vllm/v1/sample/metadata.py +43 -0
  1163. vllm/v1/sample/ops/__init__.py +0 -0
  1164. vllm/v1/sample/ops/bad_words.py +39 -0
  1165. vllm/v1/sample/ops/penalties.py +43 -0
  1166. vllm/v1/sample/ops/topk_topp_sampler.py +296 -0
  1167. vllm/v1/sample/rejection_sampler.py +631 -0
  1168. vllm/v1/sample/sampler.py +226 -0
  1169. vllm/v1/sample/tpu/__init__.py +0 -0
  1170. vllm/v1/sample/tpu/metadata.py +124 -0
  1171. vllm/v1/sample/tpu/sampler.py +145 -0
  1172. vllm/v1/serial_utils.py +315 -0
  1173. vllm/v1/spec_decode/__init__.py +0 -0
  1174. vllm/v1/spec_decode/eagle.py +441 -0
  1175. vllm/v1/spec_decode/medusa.py +64 -0
  1176. vllm/v1/spec_decode/metadata.py +62 -0
  1177. vllm/v1/spec_decode/metrics.py +178 -0
  1178. vllm/v1/spec_decode/ngram_proposer.py +132 -0
  1179. vllm/v1/spec_decode/utils.py +41 -0
  1180. vllm/v1/structured_output/__init__.py +227 -0
  1181. vllm/v1/structured_output/backend_guidance.py +245 -0
  1182. vllm/v1/structured_output/backend_types.py +134 -0
  1183. vllm/v1/structured_output/backend_xgrammar.py +318 -0
  1184. vllm/v1/structured_output/request.py +86 -0
  1185. vllm/v1/structured_output/utils.py +175 -0
  1186. vllm/v1/utils.py +377 -0
  1187. vllm/v1/worker/__init__.py +0 -0
  1188. vllm/v1/worker/block_table.py +142 -0
  1189. vllm/v1/worker/cpu_model_runner.py +91 -0
  1190. vllm/v1/worker/cpu_worker.py +153 -0
  1191. vllm/v1/worker/gpu_input_batch.py +757 -0
  1192. vllm/v1/worker/gpu_model_runner.py +2739 -0
  1193. vllm/v1/worker/gpu_worker.py +408 -0
  1194. vllm/v1/worker/lora_model_runner_mixin.py +177 -0
  1195. vllm/v1/worker/tpu_input_batch.py +585 -0
  1196. vllm/v1/worker/tpu_model_runner.py +1849 -0
  1197. vllm/v1/worker/tpu_worker.py +315 -0
  1198. vllm/v1/worker/utils.py +112 -0
  1199. vllm/v1/worker/worker_base.py +65 -0
  1200. vllm/v1/worker/xpu_model_runner.py +33 -0
  1201. vllm/v1/worker/xpu_worker.py +165 -0
  1202. vllm/version.py +41 -0
  1203. vllm/vllm_flash_attn/.gitkeep +0 -0
  1204. vllm/worker/__init__.py +0 -0
  1205. vllm/worker/cache_engine.py +145 -0
  1206. vllm/worker/cpu_enc_dec_model_runner.py +326 -0
  1207. vllm/worker/cpu_model_runner.py +671 -0
  1208. vllm/worker/cpu_pooling_model_runner.py +125 -0
  1209. vllm/worker/cpu_worker.py +452 -0
  1210. vllm/worker/enc_dec_model_runner.py +555 -0
  1211. vllm/worker/hpu_model_runner.py +2320 -0
  1212. vllm/worker/hpu_worker.py +484 -0
  1213. vllm/worker/model_runner.py +2178 -0
  1214. vllm/worker/model_runner_base.py +282 -0
  1215. vllm/worker/multi_step_hpu_worker.py +123 -0
  1216. vllm/worker/multi_step_model_runner.py +911 -0
  1217. vllm/worker/multi_step_neuron_model_runner.py +84 -0
  1218. vllm/worker/multi_step_neuronx_distributed_model_runner.py +63 -0
  1219. vllm/worker/multi_step_tpu_worker.py +108 -0
  1220. vllm/worker/multi_step_worker.py +197 -0
  1221. vllm/worker/neuron_model_runner.py +460 -0
  1222. vllm/worker/neuron_worker.py +193 -0
  1223. vllm/worker/neuronx_distributed_model_runner.py +294 -0
  1224. vllm/worker/pooling_model_runner.py +211 -0
  1225. vllm/worker/tpu_model_runner.py +909 -0
  1226. vllm/worker/tpu_worker.py +337 -0
  1227. vllm/worker/utils.py +53 -0
  1228. vllm/worker/worker.py +577 -0
  1229. vllm/worker/worker_base.py +646 -0
  1230. vllm/worker/xpu_model_runner.py +606 -0
  1231. vllm/worker/xpu_worker.py +186 -0
  1232. vllm_cpu-0.9.2.post2.dist-info/METADATA +339 -0
  1233. vllm_cpu-0.9.2.post2.dist-info/RECORD +1236 -0
  1234. vllm_cpu-0.9.2.post2.dist-info/WHEEL +5 -0
  1235. vllm_cpu-0.9.2.post2.dist-info/entry_points.txt +5 -0
  1236. vllm_cpu-0.9.2.post2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2096 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+ # Adapted from
+ # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
+ import json
+ import time
+ from http import HTTPStatus
+ from typing import Annotated, Any, ClassVar, Literal, Optional, Union
+
+ import regex as re
+ import torch
+ from fastapi import HTTPException, UploadFile
+ from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
+                       ValidationInfo, field_validator, model_validator)
+ from typing_extensions import TypeAlias
+
+ from vllm import envs
+ from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
+                                          random_tool_call_id)
+ from vllm.logger import init_logger
+ from vllm.pooling_params import PoolingParams
+ from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
+                                   RequestOutputKind, SamplingParams)
+ from vllm.sequence import Logprob
+ from vllm.utils import random_uuid, resolve_obj_by_qualname
+
+ logger = init_logger(__name__)
+
+ _LONG_INFO = torch.iinfo(torch.long)
+
+
+ class OpenAIBaseModel(BaseModel):
+     # OpenAI API does allow extra fields
+     model_config = ConfigDict(extra="allow")
+
+     # Cache class field names
+     field_names: ClassVar[Optional[set[str]]] = None
+
+     @model_validator(mode="wrap")
+     @classmethod
+     def __log_extra_fields__(cls, data, handler):
+         result = handler(data)
+         if not isinstance(data, dict):
+             return result
+         field_names = cls.field_names
+         if field_names is None:
+             # Get all class field names and their potential aliases
+             field_names = set()
+             for field_name, field in cls.model_fields.items():
+                 field_names.add(field_name)
+                 if alias := getattr(field, "alias", None):
+                     field_names.add(alias)
+             cls.field_names = field_names
+
+         # Compare against both field names and aliases
+         if any(k not in field_names for k in data):
+             logger.warning(
+                 "The following fields were present in the request "
+                 "but ignored: %s",
+                 data.keys() - field_names,
+             )
+         return result
+
+
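(Illustration, not part of the diff: because `OpenAIBaseModel` sets `extra="allow"`, a request carrying unknown keys still validates; the wrap-mode validator only logs which keys matched no field or alias. A minimal sketch of that behavior, assuming pydantic v2 and a hypothetical `Probe` subclass.)

    class Probe(OpenAIBaseModel):  # hypothetical subclass, for demonstration only
        name: str

    p = Probe(name="x", unknown_knob=42)  # validates; no error is raised
    # __log_extra_fields__ logs a warning naming {'unknown_knob'}
    print(p.model_extra)  # {'unknown_knob': 42} -- extras are retained, not dropped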
+ class ErrorResponse(OpenAIBaseModel):
+     object: str = "error"
+     message: str
+     type: str
+     param: Optional[str] = None
+     code: int
+
+
+ class ModelPermission(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
+     object: str = "model_permission"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     allow_create_engine: bool = False
+     allow_sampling: bool = True
+     allow_logprobs: bool = True
+     allow_search_indices: bool = False
+     allow_view: bool = True
+     allow_fine_tuning: bool = False
+     organization: str = "*"
+     group: Optional[str] = None
+     is_blocking: bool = False
+
+
+ class ModelCard(OpenAIBaseModel):
+     id: str
+     object: str = "model"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     owned_by: str = "vllm"
+     root: Optional[str] = None
+     parent: Optional[str] = None
+     max_model_len: Optional[int] = None
+     permission: list[ModelPermission] = Field(default_factory=list)
+
+
+ class ModelList(OpenAIBaseModel):
+     object: str = "list"
+     data: list[ModelCard] = Field(default_factory=list)
+
+
+ class PromptTokenUsageInfo(OpenAIBaseModel):
+     cached_tokens: Optional[int] = None
+
+
+ class UsageInfo(OpenAIBaseModel):
+     prompt_tokens: int = 0
+     total_tokens: int = 0
+     completion_tokens: Optional[int] = 0
+     prompt_tokens_details: Optional[PromptTokenUsageInfo] = None
+
+
+ class RequestResponseMetadata(BaseModel):
+     request_id: str
+     final_usage_info: Optional[UsageInfo] = None
+
+
+ class JsonSchemaResponseFormat(OpenAIBaseModel):
+     name: str
+     description: Optional[str] = None
+     # schema is the field in openai but that causes conflicts with pydantic so
+     # instead use json_schema with an alias
+     json_schema: Optional[dict[str, Any]] = Field(default=None, alias='schema')
+     strict: Optional[bool] = None
+
+
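(Illustration, not part of the diff: the alias lets the wire format keep OpenAI's `schema` key while the model stores it as `json_schema`. Standard pydantic v2 alias behavior, sketched below.)

    fmt = JsonSchemaResponseFormat(
        name="answer",
        schema={"type": "object"},  # populated via the alias
        strict=True,
    )
    assert fmt.json_schema == {"type": "object"}
    assert "schema" in fmt.model_dump(by_alias=True)  # serializes back to the wire name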
+ class StructuralTag(OpenAIBaseModel):
+     begin: str
+     # schema is the field, but that causes conflicts with pydantic so
+     # instead use structural_tag_schema with an alias
+     structural_tag_schema: Optional[dict[str, Any]] = Field(default=None,
+                                                             alias="schema")
+     end: str
+
+
+ class StructuralTagResponseFormat(OpenAIBaseModel):
+     type: Literal["structural_tag"]
+     structures: list[StructuralTag]
+     triggers: list[str]
+
+
+ class ResponseFormat(OpenAIBaseModel):
+     # type must be "json_schema", "json_object", or "text"
+     type: Literal["text", "json_object", "json_schema"]
+     json_schema: Optional[JsonSchemaResponseFormat] = None
+
+
+ AnyResponseFormat = Union[ResponseFormat, StructuralTagResponseFormat]
+
+
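(Illustration, not part of the diff: a hypothetical `structural_tag` response format as it would appear in a request body. The `type` literal discriminates the union, so validating against `AnyResponseFormat` selects `StructuralTagResponseFormat` here.)

    payload = {
        "type": "structural_tag",
        "structures": [{
            "begin": "<func=get_weather>",   # hypothetical tag markers
            "schema": {"type": "object"},    # alias for structural_tag_schema
            "end": "</func>",
        }],
        "triggers": ["<func="],
    }
    parsed = TypeAdapter(AnyResponseFormat).validate_python(payload)
    assert isinstance(parsed, StructuralTagResponseFormat)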
+ class StreamOptions(OpenAIBaseModel):
+     include_usage: Optional[bool] = True
+     continuous_usage_stats: Optional[bool] = False
+
+
+ class FunctionDefinition(OpenAIBaseModel):
+     name: str
+     description: Optional[str] = None
+     parameters: Optional[dict[str, Any]] = None
+
+
+ class ChatCompletionToolsParam(OpenAIBaseModel):
+     type: Literal["function"] = "function"
+     function: FunctionDefinition
+
+
+ class ChatCompletionNamedFunction(OpenAIBaseModel):
+     name: str
+
+
+ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
+     function: ChatCompletionNamedFunction
+     type: Literal["function"] = "function"
+
+
+ # extra="forbid" is a workaround to have kwargs as a field,
+ # see https://github.com/pydantic/pydantic/issues/3125
+ class LogitsProcessorConstructor(BaseModel):
+     qualname: str
+     args: Optional[list[Any]] = None
+     kwargs: Optional[dict[str, Any]] = None
+
+     model_config = ConfigDict(extra="forbid")
+
+
+ LogitsProcessors = list[Union[str, LogitsProcessorConstructor]]
+
+
+ def get_logits_processors(processors: Optional[LogitsProcessors],
+                           pattern: Optional[str]) -> Optional[list[Any]]:
+     if processors and pattern:
+         logits_processors = []
+         for processor in processors:
+             qualname = processor if isinstance(processor,
+                                                str) else processor.qualname
+             if not re.match(pattern, qualname):
+                 raise ValueError(
+                     f"Logits processor '{qualname}' is not allowed by this "
+                     "server. See --logits-processor-pattern engine argument "
+                     "for more information.")
+             try:
+                 logits_processor = resolve_obj_by_qualname(qualname)
+             except Exception as e:
+                 raise ValueError(
+                     f"Logits processor '{qualname}' could not be resolved: {e}"
+                 ) from e
+             if isinstance(processor, LogitsProcessorConstructor):
+                 logits_processor = logits_processor(*processor.args or [],
+                                                     **processor.kwargs or {})
+             logits_processors.append(logits_processor)
+         return logits_processors
+     elif processors:
+         raise ValueError(
+             "The `logits_processors` argument is not supported by this "
+             "server. See --logits-processor-pattern engine argument "
+             "for more information.")
+     return None
+
+
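(Illustration, not part of the diff: how the gating above plays out for a server started with a `--logits-processor-pattern`; the module names are hypothetical. Both the bare-string and constructor forms are checked with `re.match` before being resolved.)

    pattern = r"my_plugins\."  # hypothetical --logits-processor-pattern value
    processors = [
        "my_plugins.NoRepeatProcessor",   # bare qualified name
        LogitsProcessorConstructor(       # constructor form with kwargs
            qualname="my_plugins.BiasProcessor",
            kwargs={"token_id": 7, "bias": 2.0}),
    ]
    # get_logits_processors(processors, pattern) would re.match() each
    # qualname, resolve it via resolve_obj_by_qualname, and instantiate the
    # constructor entry with its kwargs; "other_pkg.X" would raise ValueError.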
223
+ class ChatCompletionRequest(OpenAIBaseModel):
224
+ # Ordered by official OpenAI API documentation
225
+ # https://platform.openai.com/docs/api-reference/chat/create
226
+ messages: list[ChatCompletionMessageParam]
227
+ model: Optional[str] = None
228
+ frequency_penalty: Optional[float] = 0.0
229
+ logit_bias: Optional[dict[str, float]] = None
230
+ logprobs: Optional[bool] = False
231
+ top_logprobs: Optional[int] = 0
232
+ max_tokens: Optional[int] = Field(
233
+ default=None,
234
+ deprecated=
235
+ 'max_tokens is deprecated in favor of the max_completion_tokens field')
236
+ max_completion_tokens: Optional[int] = None
237
+ n: Optional[int] = 1
238
+ presence_penalty: Optional[float] = 0.0
239
+ response_format: Optional[AnyResponseFormat] = None
240
+ seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
241
+ stop: Optional[Union[str, list[str]]] = []
242
+ stream: Optional[bool] = False
243
+ stream_options: Optional[StreamOptions] = None
244
+ temperature: Optional[float] = None
245
+ top_p: Optional[float] = None
246
+ tools: Optional[list[ChatCompletionToolsParam]] = None
247
+ tool_choice: Optional[Union[
248
+ Literal["none"],
249
+ Literal["auto"],
250
+ Literal["required"],
251
+ ChatCompletionNamedToolChoiceParam,
252
+ ]] = "none"
253
+
254
+ # NOTE this will be ignored by vLLM -- the model determines the behavior
255
+ parallel_tool_calls: Optional[bool] = False
256
+ user: Optional[str] = None
257
+
258
+ # --8<-- [start:chat-completion-sampling-params]
259
+ best_of: Optional[int] = None
260
+ use_beam_search: bool = False
261
+ top_k: Optional[int] = None
262
+ min_p: Optional[float] = None
263
+ repetition_penalty: Optional[float] = None
264
+ length_penalty: float = 1.0
265
+ stop_token_ids: Optional[list[int]] = []
266
+ include_stop_str_in_output: bool = False
267
+ ignore_eos: bool = False
268
+ min_tokens: int = 0
269
+ skip_special_tokens: bool = True
270
+ spaces_between_special_tokens: bool = True
271
+ truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
272
+ prompt_logprobs: Optional[int] = None
273
+ allowed_token_ids: Optional[list[int]] = None
274
+ bad_words: list[str] = Field(default_factory=list)
275
+ # --8<-- [end:chat-completion-sampling-params]
276
+
277
+ # --8<-- [start:chat-completion-extra-params]
278
+ echo: bool = Field(
279
+ default=False,
280
+ description=(
281
+ "If true, the new message will be prepended with the last message "
282
+ "if they belong to the same role."),
283
+ )
284
+ add_generation_prompt: bool = Field(
285
+ default=True,
286
+ description=
287
+ ("If true, the generation prompt will be added to the chat template. "
288
+ "This is a parameter used by chat template in tokenizer config of the "
289
+ "model."),
290
+ )
291
+     continue_final_message: bool = Field(
+         default=False,
+         description=
+         ("If this is set, the chat will be formatted so that the final "
+          "message in the chat is open-ended, without any EOS tokens. The "
+          "model will continue this message rather than starting a new one. "
+          "This allows you to \"prefill\" part of the model's response for it. "
+          "Cannot be used at the same time as `add_generation_prompt`."),
+     )
+     add_special_tokens: bool = Field(
+         default=False,
+         description=(
+             "If true, special tokens (e.g. BOS) will be added to the prompt "
+             "on top of what is added by the chat template. "
+             "For most models, the chat template takes care of adding the "
+             "special tokens so this should be set to false (as is the "
+             "default)."),
+     )
+     documents: Optional[list[dict[str, str]]] = Field(
+         default=None,
+         description=
+         ("A list of dicts representing documents that will be accessible to "
+          "the model if it is performing RAG (retrieval-augmented generation)."
+          " If the template does not support RAG, this argument will have no "
+          "effect. We recommend that each document should be a dict containing "
+          "\"title\" and \"text\" keys."),
+     )
+     chat_template: Optional[str] = Field(
+         default=None,
+         description=(
+             "A Jinja template to use for this conversion. "
+             "As of transformers v4.44, default chat template is no longer "
+             "allowed, so you must provide a chat template if the tokenizer "
+             "does not define one."),
+     )
+     chat_template_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=(
+             "Additional keyword args to pass to the template renderer. "
+             "Will be accessible by the chat template."),
+     )
+     mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     guided_json: Optional[Union[str, dict, BaseModel]] = Field(
+         default=None,
+         description=("If specified, the output will follow the JSON schema."),
+     )
+     guided_regex: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the output will follow the regex pattern."),
+     )
+     guided_choice: Optional[list[str]] = Field(
+         default=None,
+         description=(
+             "If specified, the output will be exactly one of the choices."),
+     )
+     guided_grammar: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the output will follow the context-free grammar."),
+     )
+     structural_tag: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the output will follow the structural tag schema."),
+     )
+     guided_decoding_backend: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, will override the default guided decoding backend "
+             "of the server for this specific request. If set, must be one of "
+             "'outlines' / 'lm-format-enforcer'."),
+     )
+     guided_whitespace_pattern: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, will override the default whitespace pattern "
+             "for guided JSON decoding."),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+     request_id: str = Field(
+         default_factory=lambda: f"{random_uuid()}",
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the response."),
+     )
+     logits_processors: Optional[LogitsProcessors] = Field(
+         default=None,
+         description=(
+             "A list of either qualified names of logits processors, or "
+             "constructor objects, to apply when sampling. A constructor is "
+             "a JSON object with a required 'qualname' field specifying the "
+             "qualified name of the processor class/factory, and optional "
+             "'args' and 'kwargs' fields containing positional and keyword "
+             "arguments. For example: {'qualname': "
+             "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
+             "{'param': 'value'}}."))
+     return_tokens_as_token_ids: Optional[bool] = Field(
+         default=None,
+         description=(
+             "If specified with 'logprobs', tokens are represented "
+             "as strings of the form 'token_id:{token_id}' so that tokens "
+             "that are not JSON-encodable can be identified."))
+     cache_salt: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the prefix cache will be salted with the provided "
+             "string to prevent an attacker from guessing prompts in multi-user "
+             "environments. The salt should be random, protected from "
+             "access by 3rd parties, and long enough to be "
+             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+             "to 256 bits). Not supported by vLLM engine V0."))
+     kv_transfer_params: Optional[dict[str, Any]] = Field(
+         default=None,
+         description="KVTransfer parameters used for disaggregated serving.")
+
+     vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
+         default=None,
+         description=("Additional request parameters with string or "
+                      "numeric values, used by custom extensions."),
+     )
+
+     # --8<-- [end:chat-completion-extra-params]
+
+     # Default sampling parameters for chat completion requests
+     _DEFAULT_SAMPLING_PARAMS: dict = {
+         "repetition_penalty": 1.0,
+         "temperature": 1.0,
+         "top_p": 1.0,
+         "top_k": 0,
+         "min_p": 0.0,
+     }
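+     # Editor's note (sketch, not part of the original source): each sampling
+     # field resolves in three steps -- the request value, then the server's
+     # `default_sampling_params`, then the class defaults above. E.g. with no
+     # `temperature` in the request and a server default of
+     # {"temperature": 0.7}, the effective temperature is 0.7, not 1.0.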
+
+     def to_beam_search_params(
+             self, max_tokens: int,
+             default_sampling_params: dict) -> BeamSearchParams:
+
+         n = self.n if self.n is not None else 1
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+
+         return BeamSearchParams(
+             beam_width=n,
+             max_tokens=max_tokens,
+             ignore_eos=self.ignore_eos,
+             temperature=temperature,
+             length_penalty=self.length_penalty,
+             include_stop_str_in_output=self.include_stop_str_in_output,
+         )
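+     # Hypothetical usage sketch (editor's addition, assuming the fields
+     # defined above):
+     #
+     #     req = ChatCompletionRequest(messages=[...], n=2)
+     #     params = req.to_beam_search_params(max_tokens=64,
+     #                                        default_sampling_params={})
+     #     # params.beam_width == 2; temperature falls back to 1.0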
+
+     def to_sampling_params(
+         self,
+         max_tokens: int,
+         logits_processor_pattern: Optional[str],
+         default_sampling_params: dict,
+     ) -> SamplingParams:
+
+         # Default parameters
+         if (repetition_penalty := self.repetition_penalty) is None:
+             repetition_penalty = default_sampling_params.get(
+                 "repetition_penalty",
+                 self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+             )
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+         if (top_p := self.top_p) is None:
+             top_p = default_sampling_params.get(
+                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+         if (top_k := self.top_k) is None:
+             top_k = default_sampling_params.get(
+                 "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+         if (min_p := self.min_p) is None:
+             min_p = default_sampling_params.get(
+                 "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+
+         prompt_logprobs = self.prompt_logprobs
+         if prompt_logprobs is None and self.echo:
+             prompt_logprobs = self.top_logprobs
+
+         guided_json_object = None
+         if self.response_format is not None:
+             if self.response_format.type == "json_object":
+                 guided_json_object = True
+             elif self.response_format.type == "json_schema":
+                 json_schema = self.response_format.json_schema
+                 assert json_schema is not None
+                 self.guided_json = json_schema.json_schema
+             elif self.response_format.type == "structural_tag":
+                 structural_tag = self.response_format
+                 assert structural_tag is not None and isinstance(
+                     structural_tag, StructuralTagResponseFormat)
+                 s_tag_obj = structural_tag.model_dump(by_alias=True)
+                 self.structural_tag = json.dumps(s_tag_obj)
+
+         guided_decoding = GuidedDecodingParams.from_optional(
+             json=self._get_guided_json_from_tool() or self.guided_json,
+             regex=self.guided_regex,
+             choice=self.guided_choice,
+             grammar=self.guided_grammar,
+             json_object=guided_json_object,
+             backend=self.guided_decoding_backend,
+             whitespace_pattern=self.guided_whitespace_pattern,
+             structural_tag=self.structural_tag,
+         )
+
+         extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
+         if self.kv_transfer_params:
+             # Pass in kv_transfer_params via extra_args
+             extra_args["kv_transfer_params"] = self.kv_transfer_params
+         return SamplingParams.from_optional(
+             n=self.n,
+             best_of=self.best_of,
+             presence_penalty=self.presence_penalty,
+             frequency_penalty=self.frequency_penalty,
+             repetition_penalty=repetition_penalty,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             min_p=min_p,
+             seed=self.seed,
+             stop=self.stop,
+             stop_token_ids=self.stop_token_ids,
+             logprobs=self.top_logprobs if self.logprobs else None,
+             prompt_logprobs=prompt_logprobs,
+             ignore_eos=self.ignore_eos,
+             max_tokens=max_tokens,
+             min_tokens=self.min_tokens,
+             skip_special_tokens=self.skip_special_tokens,
+             spaces_between_special_tokens=self.spaces_between_special_tokens,
+             logits_processors=get_logits_processors(self.logits_processors,
+                                                     logits_processor_pattern),
+             include_stop_str_in_output=self.include_stop_str_in_output,
+             truncate_prompt_tokens=self.truncate_prompt_tokens,
+             output_kind=RequestOutputKind.DELTA if self.stream \
+                 else RequestOutputKind.FINAL_ONLY,
+             guided_decoding=guided_decoding,
+             logit_bias=self.logit_bias,
+             bad_words=self.bad_words,
+             allowed_token_ids=self.allowed_token_ids,
+             extra_args=extra_args or None,
+         )
+
+     def _get_guided_json_from_tool(
+             self) -> Optional[Union[str, dict, BaseModel]]:
+         # user has chosen to not use any tool
+         if self.tool_choice == "none" or self.tools is None:
+             return None
+
+         # user has chosen to use a named tool
+         if type(self.tool_choice) is ChatCompletionNamedToolChoiceParam:
+             tool_name = self.tool_choice.function.name
+             tools = {tool.function.name: tool.function for tool in self.tools}
+             if tool_name not in tools:
+                 raise ValueError(
+                     f"Tool '{tool_name}' has not been passed in `tools`.")
+             tool = tools[tool_name]
+             return tool.parameters
+
+         if self.tool_choice == "required":
+             # Pydantic schema generation cannot be used since the JSON schema
+             # has to be constructed for a specific instantiation of a tool list
+             # so that parameters of a function are correctly generated
+             # based on the chosen function name
+             def get_tool_schema(tool: ChatCompletionToolsParam) -> dict:
+                 return {
+                     "properties": {
+                         "name": {
+                             "type": "string",
+                             "enum": [tool.function.name]
+                         },
+                         # parameters are always generated as '{}' in the final
+                         # output if they are missing from the request
+                         # (i.e. are None or '{}') so the schema is
+                         # updated to produce an empty object in that case
+                         "parameters": tool.function.parameters
+                         if tool.function.parameters else {
+                             "type": "object",
+                             "properties": {}
+                         }
+                     },
+                     "required": ["name", "parameters"]
+                 }
+
+             json_schema = {
+                 "type": "array",
+                 "minItems": 1,
+                 "items": {
+                     "type": "object",
+                     "anyOf": [get_tool_schema(tool) for tool in self.tools]
+                 }
+             }
+             return json_schema
+
+         return None
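+     # Illustrative example (editor's sketch; the tool name "get_weather" is
+     # hypothetical): for tool_choice="required" and a single tool with no
+     # declared parameters, the schema built above constrains the model output
+     # to a JSON array such as
+     #     [{"name": "get_weather", "parameters": {}}]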
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_stream_options(cls, data):
+         if data.get("stream_options") and not data.get("stream"):
+             raise ValueError(
+                 "Stream options can only be defined when `stream=True`.")
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_logprobs(cls, data):
+         if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
+             if data.get("stream") and prompt_logprobs > 0:
+                 raise ValueError(
+                     "`prompt_logprobs` are not available when `stream=True`.")
+
+             if prompt_logprobs < 0:
+                 raise ValueError(
+                     "`prompt_logprobs` must be a non-negative value.")
+
+         if (top_logprobs := data.get("top_logprobs")) is not None:
+             if top_logprobs < 0:
+                 raise ValueError(
+                     "`top_logprobs` must be a non-negative value.")
+
+             if top_logprobs > 0 and not data.get("logprobs"):
+                 raise ValueError(
+                     "When using `top_logprobs`, `logprobs` must be set to true."
+                 )
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_guided_decoding_count(cls, data):
+         if isinstance(data, ValueError):
+             raise data
+
+         guide_count = sum([
+             "guided_json" in data and data["guided_json"] is not None,
+             "guided_regex" in data and data["guided_regex"] is not None,
+             "guided_choice" in data and data["guided_choice"] is not None
+         ])
+         # you can only use one kind of guided decoding
+         if guide_count > 1:
+             raise ValueError(
+                 "You can only use one kind of guided decoding "
+                 "('guided_json', 'guided_regex' or 'guided_choice').")
+         # you can only either use guided decoding or tools, not both
+         if guide_count > 1 and data.get("tool_choice", "none") not in (
+                 "none",
+                 "auto",
+                 "required",
+         ):
+             raise ValueError(
+                 "You can only either use guided decoding or tools, not both.")
+         return data
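+     # Hypothetical example (editor's note): a request body containing both
+     # {"guided_regex": "\\d+", "guided_choice": ["yes", "no"]} is rejected by
+     # the first check above before the request ever reaches the engine.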
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_tool_usage(cls, data):
+
+         # if "tool_choice" is not specified but tools are provided,
+         # default to "auto" tool_choice
+         if "tool_choice" not in data and data.get("tools"):
+             data["tool_choice"] = "auto"
+
+         # if "tool_choice" is "none" -- no validation is needed for tools
+         if "tool_choice" in data and data["tool_choice"] == "none":
+             return data
+
+         # if "tool_choice" is specified -- validation
+         if "tool_choice" in data:
+
+             # ensure that if "tool choice" is specified, tools are present
+             if "tools" not in data or data["tools"] is None:
+                 raise ValueError(
+                     "When using `tool_choice`, `tools` must be set.")
+
+             # make sure that tool choice is either a named tool
+             # OR that it's set to "auto" or "required"
+             if data["tool_choice"] not in [
+                     "auto", "required"
+             ] and not isinstance(data["tool_choice"], dict):
+                 raise NotImplementedError(
+                     f'Invalid value for `tool_choice`: {data["tool_choice"]}! '\
+                     'Only named tools, "none", "auto" or "required" '\
+                     'are supported.'
+                 )
+
+             # ensure that if "tool_choice" is specified as an object,
+             # it matches a valid tool
+             correct_usage_message = 'Correct usage: `{"type": "function",' \
+                 ' "function": {"name": "my_function"}}`'
+             if isinstance(data["tool_choice"], dict):
+                 valid_tool = False
+                 function = data["tool_choice"].get("function")
+                 if not isinstance(function, dict):
+                     raise ValueError(
+                         f"Invalid value for `function`: `{function}` in "
+                         f"`tool_choice`! {correct_usage_message}")
+                 if "name" not in function:
+                     raise ValueError(f"Expected field `name` in `function` in "
+                                      f"`tool_choice`! {correct_usage_message}")
+                 function_name = function["name"]
+                 if not isinstance(function_name,
+                                   str) or len(function_name) == 0:
+                     raise ValueError(
+                         f"Invalid `name` in `function`: `{function_name}`"
+                         f" in `tool_choice`! {correct_usage_message}")
+                 for tool in data["tools"]:
+                     if tool["function"]["name"] == function_name:
+                         valid_tool = True
+                         break
+                 if not valid_tool:
+                     raise ValueError(
+                         "The tool specified in `tool_choice` does not match any"
+                         " of the specified `tools`")
+         return data
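+     # Illustrative payload (editor's sketch; "my_function" is a placeholder):
+     # a named tool choice that passes the validation above looks like
+     #     {"tool_choice": {"type": "function",
+     #                      "function": {"name": "my_function"}},
+     #      "tools": [{"type": "function",
+     #                 "function": {"name": "my_function"}}]}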
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_generation_prompt(cls, data):
+         if data.get("continue_final_message") and data.get(
+                 "add_generation_prompt"):
+             raise ValueError("Cannot set both `continue_final_message` and "
+                              "`add_generation_prompt` to True.")
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_cache_salt_support(cls, data):
+         if data.get("cache_salt") is not None:
+             if not envs.VLLM_USE_V1:
+                 raise ValueError(
+                     "Parameter 'cache_salt' is not supported with "
+                     "this instance of vLLM, which uses engine V0.")
+             if not isinstance(data["cache_salt"],
+                               str) or not data["cache_salt"]:
+                 raise ValueError("Parameter 'cache_salt' must be a "
+                                  "non-empty string if provided.")
+         return data
+
+
+ class CompletionRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/completions/create
+     model: Optional[str] = None
+     prompt: Optional[Union[list[int], list[list[int]], str, list[str]]] = None
+     prompt_embeds: Optional[Union[bytes, list[bytes]]] = None
+     best_of: Optional[int] = None
+     echo: Optional[bool] = False
+     frequency_penalty: Optional[float] = 0.0
+     logit_bias: Optional[dict[str, float]] = None
+     logprobs: Optional[int] = None
+     max_tokens: Optional[int] = 16
+     n: int = 1
+     presence_penalty: Optional[float] = 0.0
+     seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+     stop: Optional[Union[str, list[str]]] = []
+     stream: Optional[bool] = False
+     stream_options: Optional[StreamOptions] = None
+     suffix: Optional[str] = None
+     temperature: Optional[float] = None
+     top_p: Optional[float] = None
+     user: Optional[str] = None
+
+     # --8<-- [start:completion-sampling-params]
+     use_beam_search: bool = False
+     top_k: Optional[int] = None
+     min_p: Optional[float] = None
+     repetition_penalty: Optional[float] = None
+     length_penalty: float = 1.0
+     stop_token_ids: Optional[list[int]] = []
+     include_stop_str_in_output: bool = False
+     ignore_eos: bool = False
+     min_tokens: int = 0
+     skip_special_tokens: bool = True
+     spaces_between_special_tokens: bool = True
+     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
+     allowed_token_ids: Optional[list[int]] = None
+     prompt_logprobs: Optional[int] = None
+     # --8<-- [end:completion-sampling-params]
+
+     # --8<-- [start:completion-extra-params]
+     add_special_tokens: bool = Field(
+         default=True,
+         description=(
+             "If true (the default), special tokens (e.g. BOS) will be added to "
+             "the prompt."),
+     )
+     response_format: Optional[AnyResponseFormat] = Field(
+         default=None,
+         description=(
+             "Similar to chat completion, this parameter specifies the format "
+             "of output. Only {'type': 'json_object'}, {'type': 'json_schema'}"
+             ", {'type': 'structural_tag'}, or {'type': 'text' } is supported."
+         ),
+     )
+     guided_json: Optional[Union[str, dict, BaseModel]] = Field(
+         default=None,
+         description="If specified, the output will follow the JSON schema.",
+     )
+     guided_regex: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the output will follow the regex pattern."),
+     )
+     guided_choice: Optional[list[str]] = Field(
+         default=None,
+         description=(
+             "If specified, the output will be exactly one of the choices."),
+     )
+     guided_grammar: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the output will follow the context-free grammar."),
+     )
+     guided_decoding_backend: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, will override the default guided decoding backend "
+             "of the server for this specific request. If set, must be one of "
+             "'outlines' / 'lm-format-enforcer'."),
+     )
+     guided_whitespace_pattern: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, will override the default whitespace pattern "
+             "for guided JSON decoding."),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+     logits_processors: Optional[LogitsProcessors] = Field(
+         default=None,
+         description=(
+             "A list of either qualified names of logits processors, or "
+             "constructor objects, to apply when sampling. A constructor is "
+             "a JSON object with a required 'qualname' field specifying the "
+             "qualified name of the processor class/factory, and optional "
+             "'args' and 'kwargs' fields containing positional and keyword "
+             "arguments. For example: {'qualname': "
+             "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
+             "{'param': 'value'}}."))
+
+     return_tokens_as_token_ids: Optional[bool] = Field(
+         default=None,
+         description=(
+             "If specified with 'logprobs', tokens are represented "
+             "as strings of the form 'token_id:{token_id}' so that tokens "
+             "that are not JSON-encodable can be identified."))
+
+     kv_transfer_params: Optional[dict[str, Any]] = Field(
+         default=None,
+         description="KVTransfer parameters used for disaggregated serving.")
+
+     vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
+         default=None,
+         description=("Additional request parameters with string or "
+                      "numeric values, used by custom extensions."),
+     )
+
+     # --8<-- [end:completion-extra-params]
+
+     # Default sampling parameters for completion requests
+     _DEFAULT_SAMPLING_PARAMS: dict = {
+         "repetition_penalty": 1.0,
+         "temperature": 1.0,
+         "top_p": 1.0,
+         "top_k": 0,
+         "min_p": 0.0,
+     }
+
+     def to_beam_search_params(
+         self,
+         max_tokens: int,
+         default_sampling_params: Optional[dict] = None,
+     ) -> BeamSearchParams:
+
+         if default_sampling_params is None:
+             default_sampling_params = {}
+         n = self.n if self.n is not None else 1
+
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get("temperature", 1.0)
+
+         return BeamSearchParams(
+             beam_width=n,
+             max_tokens=max_tokens,
+             ignore_eos=self.ignore_eos,
+             temperature=temperature,
+             length_penalty=self.length_penalty,
+             include_stop_str_in_output=self.include_stop_str_in_output,
+         )
+
+     def to_sampling_params(
+         self,
+         max_tokens: int,
+         logits_processor_pattern: Optional[str],
+         default_sampling_params: Optional[dict] = None,
+     ) -> SamplingParams:
+
+         if default_sampling_params is None:
+             default_sampling_params = {}
+
+         # Default parameters
+         if (repetition_penalty := self.repetition_penalty) is None:
+             repetition_penalty = default_sampling_params.get(
+                 "repetition_penalty",
+                 self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+             )
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+         if (top_p := self.top_p) is None:
+             top_p = default_sampling_params.get(
+                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+         if (top_k := self.top_k) is None:
+             top_k = default_sampling_params.get(
+                 "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+         if (min_p := self.min_p) is None:
+             min_p = default_sampling_params.get(
+                 "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+
+         prompt_logprobs = self.prompt_logprobs
+         if prompt_logprobs is None and self.echo:
+             prompt_logprobs = self.logprobs
+
+         echo_without_generation = self.echo and self.max_tokens == 0
+
+         guided_json_object = None
+         if (self.response_format is not None
+                 and self.response_format.type == "json_object"):
+             guided_json_object = True
+
+         guided_decoding = GuidedDecodingParams.from_optional(
+             json=self.guided_json,
+             regex=self.guided_regex,
+             choice=self.guided_choice,
+             grammar=self.guided_grammar,
+             json_object=guided_json_object,
+             backend=self.guided_decoding_backend,
+             whitespace_pattern=self.guided_whitespace_pattern,
+         )
+
+         extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
+         if self.kv_transfer_params:
+             # Pass in kv_transfer_params via extra_args
+             extra_args["kv_transfer_params"] = self.kv_transfer_params
+         return SamplingParams.from_optional(
+             n=self.n,
+             best_of=self.best_of,
+             presence_penalty=self.presence_penalty,
+             frequency_penalty=self.frequency_penalty,
+             repetition_penalty=repetition_penalty,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             min_p=min_p,
+             seed=self.seed,
+             stop=self.stop,
+             stop_token_ids=self.stop_token_ids,
+             logprobs=self.logprobs,
+             ignore_eos=self.ignore_eos,
+             max_tokens=max_tokens if not echo_without_generation else 1,
+             min_tokens=self.min_tokens,
+             prompt_logprobs=prompt_logprobs,
+             skip_special_tokens=self.skip_special_tokens,
+             spaces_between_special_tokens=self.spaces_between_special_tokens,
+             include_stop_str_in_output=self.include_stop_str_in_output,
+             logits_processors=get_logits_processors(self.logits_processors,
+                                                     logits_processor_pattern),
+             truncate_prompt_tokens=self.truncate_prompt_tokens,
+             output_kind=RequestOutputKind.DELTA if self.stream \
+                 else RequestOutputKind.FINAL_ONLY,
+             guided_decoding=guided_decoding,
+             logit_bias=self.logit_bias,
+             allowed_token_ids=self.allowed_token_ids,
+             extra_args=extra_args or None,
+         )
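+     # Editor's note (derived from the code above): echo=True together with
+     # max_tokens=0 means "score the prompt only"; max_tokens is clamped to 1
+     # so the engine still runs one step and can emit prompt logprobs.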
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_guided_decoding_count(cls, data):
+         guide_count = sum([
+             "guided_json" in data and data["guided_json"] is not None,
+             "guided_regex" in data and data["guided_regex"] is not None,
+             "guided_choice" in data and data["guided_choice"] is not None
+         ])
+         if guide_count > 1:
+             raise ValueError(
+                 "You can only use one kind of guided decoding "
+                 "('guided_json', 'guided_regex' or 'guided_choice').")
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_logprobs(cls, data):
+         if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
+             if data.get("stream") and prompt_logprobs > 0:
+                 raise ValueError(
+                     "`prompt_logprobs` are not available when `stream=True`.")
+
+             if prompt_logprobs < 0:
+                 raise ValueError(
+                     "`prompt_logprobs` must be a non-negative value.")
+
+         if (logprobs := data.get("logprobs")) is not None and logprobs < 0:
+             raise ValueError("`logprobs` must be a non-negative value.")
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_stream_options(cls, data):
+         if data.get("stream_options") and not data.get("stream"):
+             raise ValueError(
+                 "Stream options can only be defined when `stream=True`.")
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_prompt_and_prompt_embeds(cls, data):
+         if data.get("prompt") is None and data.get("prompt_embeds") is None:
+             raise ValueError(
+                 "At least one of `prompt` or `prompt_embeds` must be set.")
+         return data
+
+
+ class EmbeddingCompletionRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/embeddings
+     model: Optional[str] = None
+     input: Union[list[int], list[list[int]], str, list[str]]
+     encoding_format: Literal["float", "base64"] = "float"
+     dimensions: Optional[int] = None
+     user: Optional[str] = None
+     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+
+     # --8<-- [start:embedding-pooling-params]
+     additional_data: Optional[Any] = None
+     # --8<-- [end:embedding-pooling-params]
+
+     # --8<-- [start:embedding-extra-params]
+     add_special_tokens: bool = Field(
+         default=True,
+         description=(
+             "If true (the default), special tokens (e.g. BOS) will be added to "
+             "the prompt."),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+
+     # --8<-- [end:embedding-extra-params]
+
+     def to_pooling_params(self):
+         return PoolingParams(dimensions=self.dimensions,
+                              additional_data=self.additional_data)
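+     # Hypothetical request sketch (editor's assumption): a JSON body such as
+     #     {"input": ["hello world"], "encoding_format": "float",
+     #      "dimensions": 256}
+     # maps onto this model; to_pooling_params() forwards `dimensions` so the
+     # pooler can truncate the embedding if the model supports it.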
+
+
+ class EmbeddingChatRequest(OpenAIBaseModel):
+     model: Optional[str] = None
+     messages: list[ChatCompletionMessageParam]
+
+     encoding_format: Literal["float", "base64"] = "float"
+     dimensions: Optional[int] = None
+     user: Optional[str] = None
+     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+
+     # --8<-- [start:chat-embedding-pooling-params]
+     additional_data: Optional[Any] = None
+     # --8<-- [end:chat-embedding-pooling-params]
+
+     # --8<-- [start:chat-embedding-extra-params]
+     add_special_tokens: bool = Field(
+         default=False,
+         description=(
+             "If true, special tokens (e.g. BOS) will be added to the prompt "
+             "on top of what is added by the chat template. "
+             "For most models, the chat template takes care of adding the "
+             "special tokens so this should be set to false (as is the "
+             "default)."),
+     )
+     chat_template: Optional[str] = Field(
+         default=None,
+         description=(
+             "A Jinja template to use for this conversion. "
+             "As of transformers v4.44, default chat template is no longer "
+             "allowed, so you must provide a chat template if the tokenizer "
+             "does not define one."),
+     )
+     chat_template_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=(
+             "Additional keyword args to pass to the template renderer. "
+             "Will be accessible by the chat template."),
+     )
+     mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+     # --8<-- [end:chat-embedding-extra-params]
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_generation_prompt(cls, data):
+         if data.get("continue_final_message") and data.get(
+                 "add_generation_prompt"):
+             raise ValueError("Cannot set both `continue_final_message` and "
+                              "`add_generation_prompt` to True.")
+         return data
+
+     def to_pooling_params(self):
+         return PoolingParams(dimensions=self.dimensions,
+                              additional_data=self.additional_data)
+
+
+ EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest]
+
+ PoolingCompletionRequest = EmbeddingCompletionRequest
+ PoolingChatRequest = EmbeddingChatRequest
+ PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest]
+
+
+ class ScoreRequest(OpenAIBaseModel):
+     model: Optional[str] = None
+     text_1: Union[list[str], str]
+     text_2: Union[list[str], str]
+     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+
+     # --8<-- [start:score-pooling-params]
+     additional_data: Optional[Any] = None
+     # --8<-- [end:score-pooling-params]
+
+     # --8<-- [start:score-extra-params]
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+
+     # --8<-- [end:score-extra-params]
+
+     def to_pooling_params(self, *, use_cross_encoder: bool = False):
+         return PoolingParams(use_cross_encoder=use_cross_encoder,
+                              additional_data=self.additional_data)
+
+
+ class RerankRequest(OpenAIBaseModel):
+     model: Optional[str] = None
+     query: str
+     documents: list[str]
+     top_n: int = Field(default_factory=lambda: 0)
+     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+
+     # --8<-- [start:rerank-pooling-params]
+     additional_data: Optional[Any] = None
+     # --8<-- [end:rerank-pooling-params]
+
+     # --8<-- [start:rerank-extra-params]
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+
+     # --8<-- [end:rerank-extra-params]
+
+     def to_pooling_params(self, *, use_cross_encoder: bool = False):
+         return PoolingParams(use_cross_encoder=use_cross_encoder,
+                              additional_data=self.additional_data)
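+     # Hypothetical request sketch (editor's assumption): a rerank body like
+     #     {"query": "what is vLLM?", "documents": ["doc a", "doc b"],
+     #      "top_n": 1}
+     # scores each (query, document) pair and returns the top-ranked results.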
+
+
+ class RerankDocument(BaseModel):
+     text: str
+
+
+ class RerankResult(BaseModel):
+     index: int
+     document: RerankDocument
+     relevance_score: float
+
+
+ class RerankUsage(BaseModel):
+     total_tokens: int
+
+
+ class RerankResponse(OpenAIBaseModel):
+     id: str
+     model: str
+     usage: RerankUsage
+     results: list[RerankResult]
+
+
+ class CompletionLogProbs(OpenAIBaseModel):
+     text_offset: list[int] = Field(default_factory=list)
+     token_logprobs: list[Optional[float]] = Field(default_factory=list)
+     tokens: list[str] = Field(default_factory=list)
+     top_logprobs: list[Optional[dict[str,
+                                      float]]] = Field(default_factory=list)
+
+
+ class CompletionResponseChoice(OpenAIBaseModel):
+     index: int
+     text: str
+     logprobs: Optional[CompletionLogProbs] = None
+     finish_reason: Optional[str] = None
+     stop_reason: Optional[Union[int, str]] = Field(
+         default=None,
+         description=(
+             "The stop string or token id that caused the completion "
+             "to stop; None if the completion finished for some other reason, "
+             "including encountering the EOS token"),
+     )
+     prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
+
+
+ class CompletionResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+     object: str = "text_completion"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     choices: list[CompletionResponseChoice]
+     usage: UsageInfo
+     kv_transfer_params: Optional[dict[str, Any]] = Field(
+         default=None, description="KVTransfer parameters.")
+
+
+ class CompletionResponseStreamChoice(OpenAIBaseModel):
+     index: int
+     text: str
+     logprobs: Optional[CompletionLogProbs] = None
+     finish_reason: Optional[str] = None
+     stop_reason: Optional[Union[int, str]] = Field(
+         default=None,
+         description=(
+             "The stop string or token id that caused the completion "
+             "to stop; None if the completion finished for some other reason, "
+             "including encountering the EOS token"),
+     )
+
+
+ class CompletionStreamResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+     object: str = "text_completion"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     choices: list[CompletionResponseStreamChoice]
+     usage: Optional[UsageInfo] = Field(default=None)
+
+
+ class EmbeddingResponseData(OpenAIBaseModel):
+     index: int
+     object: str = "embedding"
+     embedding: Union[list[float], str]
+
+
+ class EmbeddingResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
+     object: str = "list"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     data: list[EmbeddingResponseData]
+     usage: UsageInfo
+
+
+ class PoolingResponseData(OpenAIBaseModel):
+     index: int
+     object: str = "pooling"
+     data: Union[list[list[float]], list[float], str]
+
+
+ class PoolingResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"pool-{random_uuid()}")
+     object: str = "list"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     data: list[PoolingResponseData]
+     usage: UsageInfo
+
+
+ class ScoreResponseData(OpenAIBaseModel):
+     index: int
+     object: str = "score"
+     score: float
+
+
+ class ScoreResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
+     object: str = "list"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     data: list[ScoreResponseData]
+     usage: UsageInfo
+
+
+ class ClassificationRequest(OpenAIBaseModel):
+     model: Optional[str] = None
+     input: Union[list[str], str]
+     truncate_prompt_tokens: Optional[int] = None
+     user: Optional[str] = None
+
+     # --8<-- [start:classification-pooling-params]
+     additional_data: Optional[Any] = None
+     # --8<-- [end:classification-pooling-params]
+
+     # --8<-- [start:classification-extra-params]
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+
+     # --8<-- [end:classification-extra-params]
+
+     def to_pooling_params(self):
+         return PoolingParams(additional_data=self.additional_data)
+
+
+ class ClassificationData(OpenAIBaseModel):
+     index: int
+     label: Optional[str]
+     probs: list[float]
+     num_classes: int
+
+
+ class ClassificationResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"classify-{random_uuid()}")
+     object: str = "list"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     data: list[ClassificationData]
+     usage: UsageInfo
+
+
+ class FunctionCall(OpenAIBaseModel):
+     name: str
+     arguments: str
+
+
+ class ToolCall(OpenAIBaseModel):
+     id: str = Field(default_factory=random_tool_call_id)
+     type: Literal["function"] = "function"
+     function: FunctionCall
+
+
+ class DeltaFunctionCall(BaseModel):
+     name: Optional[str] = None
+     arguments: Optional[str] = None
+
+
+ # a tool call delta where everything is optional
+ class DeltaToolCall(OpenAIBaseModel):
+     id: Optional[str] = None
+     type: Optional[Literal["function"]] = None
+     index: int
+     function: Optional[DeltaFunctionCall] = None
+
+
+ class ExtractedToolCallInformation(BaseModel):
+     # indicate if tools were called
+     tools_called: bool
+
+     # extracted tool calls
+     tool_calls: list[ToolCall]
+
+     # content - per the OpenAI spec, content AND tool calls are only rarely
+     # returned together, but some models will do this intentionally
+     content: Optional[str] = None
+
+
+ class ChatMessage(OpenAIBaseModel):
+     role: str
+     reasoning_content: Optional[str] = None
+     content: Optional[str] = None
+     tool_calls: list[ToolCall] = Field(default_factory=list)
+
+
+ class ChatCompletionLogProb(OpenAIBaseModel):
+     token: str
+     logprob: float = -9999.0
+     bytes: Optional[list[int]] = None
+
+
+ class ChatCompletionLogProbsContent(ChatCompletionLogProb):
+     # Workaround: redefine the field-names cache so that it's not
+     # shared with the superclass.
+     field_names: ClassVar[Optional[set[str]]] = None
+     top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)
+
+
+ class ChatCompletionLogProbs(OpenAIBaseModel):
+     content: Optional[list[ChatCompletionLogProbsContent]] = None
+
+
+ class ChatCompletionResponseChoice(OpenAIBaseModel):
+     index: int
+     message: ChatMessage
+     logprobs: Optional[ChatCompletionLogProbs] = None
+     # per OpenAI spec this is the default
+     finish_reason: Optional[str] = "stop"
+     # not part of the OpenAI spec but included in vLLM for legacy reasons
+     stop_reason: Optional[Union[int, str]] = None
+
+
+ class ChatCompletionResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+     object: Literal["chat.completion"] = "chat.completion"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     choices: list[ChatCompletionResponseChoice]
+     usage: UsageInfo
+     prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
+     kv_transfer_params: Optional[dict[str, Any]] = Field(
+         default=None, description="KVTransfer parameters.")
+
+
+ class DeltaMessage(OpenAIBaseModel):
+     role: Optional[str] = None
+     content: Optional[str] = None
+     reasoning_content: Optional[str] = None
+     tool_calls: list[DeltaToolCall] = Field(default_factory=list)
+
+
+ class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
+     index: int
+     delta: DeltaMessage
+     logprobs: Optional[ChatCompletionLogProbs] = None
+     finish_reason: Optional[str] = None
+     stop_reason: Optional[Union[int, str]] = None
+
+
+ class ChatCompletionStreamResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+     object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     choices: list[ChatCompletionResponseStreamChoice]
+     usage: Optional[UsageInfo] = Field(default=None)
+
+
+ class TranscriptionResponseStreamChoice(OpenAIBaseModel):
+     delta: DeltaMessage
+     finish_reason: Optional[str] = None
+     stop_reason: Optional[Union[int, str]] = None
+
+
+ class TranscriptionStreamResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"trsc-{random_uuid()}")
+     object: Literal["transcription.chunk"] = "transcription.chunk"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     choices: list[TranscriptionResponseStreamChoice]
+     usage: Optional[UsageInfo] = Field(default=None)
+
+
+ BatchRequestInputBody = Union[ChatCompletionRequest, EmbeddingRequest,
+                               ScoreRequest, RerankRequest]
+
+
+ class BatchRequestInput(OpenAIBaseModel):
+     """
+     The per-line object of the batch input file.
+
+     NOTE: Currently the `/v1/chat/completions`, `/v1/embeddings`, `/score`
+     and `/rerank` endpoints are supported.
+     """
+
+     # A developer-provided per-request id that will be used to match outputs to
+     # inputs. Must be unique for each request in a batch.
+     custom_id: str
+
+     # The HTTP method to be used for the request. Currently only POST is
+     # supported.
+     method: str
+
+     # The OpenAI API relative URL to be used for the request. Currently
+     # /v1/chat/completions, /v1/embeddings, /score and /rerank are supported.
+     url: str
+
+     # The parameters of the request.
+     body: BatchRequestInputBody
+
+     @field_validator('body', mode='plain')
+     @classmethod
+     def check_type_for_url(cls, value: Any, info: ValidationInfo):
+         # Use url to disambiguate models
+         url: str = info.data["url"]
+         if url == "/v1/chat/completions":
+             return ChatCompletionRequest.model_validate(value)
+         if url == "/v1/embeddings":
+             return TypeAdapter(EmbeddingRequest).validate_python(value)
+         if url.endswith("/score"):
+             return ScoreRequest.model_validate(value)
+         if url.endswith("/rerank"):
+             return RerankRequest.model_validate(value)
+         return TypeAdapter(BatchRequestInputBody).validate_python(value)
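+     # Illustrative JSONL line (editor's sketch; ids and content are
+     # placeholders) dispatched by the validator above to
+     # ChatCompletionRequest:
+     #     {"custom_id": "req-1", "method": "POST",
+     #      "url": "/v1/chat/completions",
+     #      "body": {"messages": [{"role": "user", "content": "Hi"}]}}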
+
+
+ class BatchResponseData(OpenAIBaseModel):
+     # HTTP status code of the response.
+     status_code: int = 200
+
+     # A unique identifier for the API request.
+     request_id: str
+
+     # The body of the response.
+     body: Optional[Union[ChatCompletionResponse, EmbeddingResponse,
+                          ScoreResponse, RerankResponse]] = None
+
+
+ class BatchRequestOutput(OpenAIBaseModel):
+     """
+     The per-line object of the batch output and error files
+     """
+
+     id: str
+
+     # A developer-provided per-request id that will be used to match outputs to
+     # inputs.
+     custom_id: str
+
+     response: Optional[BatchResponseData]
+
+     # For requests that failed with a non-HTTP error, this will contain more
+     # information on the cause of the failure.
+     error: Optional[Any]
+
+
+ class TokenizeCompletionRequest(OpenAIBaseModel):
+     model: Optional[str] = None
+     prompt: str
+
+     add_special_tokens: bool = Field(
+         default=True,
+         description=(
+             "If true (the default), special tokens (e.g. BOS) will be added to "
+             "the prompt."),
+     )
+     return_token_strs: Optional[bool] = Field(
+         default=False,
+         description=("If true, also return the token strings "
+                      "corresponding to the token ids."),
+     )
+
+
+ class TokenizeChatRequest(OpenAIBaseModel):
+     model: Optional[str] = None
+     messages: list[ChatCompletionMessageParam]
+
+     add_generation_prompt: bool = Field(
+         default=True,
+         description=
+         ("If true, the generation prompt will be added to the chat template. "
+          "This is a parameter used by the chat template in the tokenizer "
+          "config of the model."),
+     )
+     return_token_strs: Optional[bool] = Field(
+         default=False,
+         description=("If true, also return the token strings "
+                      "corresponding to the token ids."),
+     )
+     continue_final_message: bool = Field(
+         default=False,
+         description=
+         ("If this is set, the chat will be formatted so that the final "
+          "message in the chat is open-ended, without any EOS tokens. The "
+          "model will continue this message rather than starting a new one. "
+          "This allows you to \"prefill\" part of the model's response for it. "
+          "Cannot be used at the same time as `add_generation_prompt`."),
+     )
+     add_special_tokens: bool = Field(
+         default=False,
+         description=(
+             "If true, special tokens (e.g. BOS) will be added to the prompt "
+             "on top of what is added by the chat template. "
+             "For most models, the chat template takes care of adding the "
+             "special tokens so this should be set to false (as is the "
+             "default)."),
+     )
+     chat_template: Optional[str] = Field(
+         default=None,
+         description=(
+             "A Jinja template to use for this conversion. "
+             "As of transformers v4.44, default chat template is no longer "
+             "allowed, so you must provide a chat template if the tokenizer "
+             "does not define one."),
+     )
+     chat_template_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=(
+             "Additional keyword args to pass to the template renderer. "
+             "Will be accessible by the chat template."),
+     )
+     mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     tools: Optional[list[ChatCompletionToolsParam]] = Field(
+         default=None,
+         description=("A list of tools the model may call."),
+     )
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_generation_prompt(cls, data):
+         if data.get("continue_final_message") and data.get(
+                 "add_generation_prompt"):
+             raise ValueError("Cannot set both `continue_final_message` and "
+                              "`add_generation_prompt` to True.")
+         return data
+
+
+ TokenizeRequest = Union[TokenizeCompletionRequest, TokenizeChatRequest]
+
+
+ class TokenizeResponse(OpenAIBaseModel):
+     count: int
+     max_model_len: int
+     tokens: list[int]
+     token_strs: Optional[list[str]] = None
+
+
+ class DetokenizeRequest(OpenAIBaseModel):
+     model: Optional[str] = None
+     tokens: list[int]
+
+
+ class DetokenizeResponse(OpenAIBaseModel):
+     prompt: str
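+     # Editor's note (round-trip sketch using the models defined above): the
+     # token ids returned in TokenizeResponse.tokens can be posted back as
+     # DetokenizeRequest.tokens to recover the original prompt string.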
+
+
+ class LoadLoRAAdapterRequest(BaseModel):
+     lora_name: str
+     lora_path: str
+
+
+ class UnloadLoRAAdapterRequest(BaseModel):
+     lora_name: str
+     lora_int_id: Optional[int] = Field(default=None)
+
+
+ ## Protocols for Audio
+ AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json",
+                                          "vtt"]
+
+
+ class TranscriptionRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/audio/createTranscription
+
+     file: UploadFile
+     """
+     The audio file object (not file name) to transcribe, in one of these
+     formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+     """
+
+     model: Optional[str] = None
+     """ID of the model to use.
+     """
+
+     language: Optional[str] = None
+     """The language of the input audio.
+
+     Supplying the input language in
+     [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
+     will improve accuracy and latency.
+     """
+
+     prompt: str = Field(default="")
+     """An optional text to guide the model's style or continue a previous audio
+     segment.
+
+     The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+     should match the audio language.
+     """
+
+     response_format: AudioResponseFormat = Field(default="json")
+     """
+     The format of the output, in one of these options: `json`, `text`, `srt`,
+     `verbose_json`, or `vtt`.
+     """
+
+     ## TODO (varun) : Support if set to 0, certain thresholds are met !!
+
+     timestamp_granularities: list[Literal["word", "segment"]] = Field(
+         alias="timestamp_granularities[]", default=[])
+     """The timestamp granularities to populate for this transcription.
+
+     `response_format` must be set to `verbose_json` to use timestamp
+     granularities. Either or both of these options are supported: `word`, or
+     `segment`. Note: There is no additional latency for segment timestamps,
+     but generating word timestamps incurs additional latency.
+     """
+
+     stream: Optional[bool] = False
+     """When set, it will enable output to be streamed in a similar fashion
+     to the Chat Completion endpoint.
+     """
+     # --8<-- [start:transcription-extra-params]
+     # Flattened stream option to simplify form data.
+     stream_include_usage: Optional[bool] = False
+     stream_continuous_usage_stats: Optional[bool] = False
+
+     vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
+         default=None,
+         description=("Additional request parameters with string or "
+                      "numeric values, used by custom extensions."),
+     )
+     # --8<-- [end:transcription-extra-params]
+
+     # --8<-- [start:transcription-sampling-params]
+     temperature: float = Field(default=0.0)
+     """The sampling temperature, between 0 and 1.
+
+     Higher values like 0.8 will make the output more random, while lower values
+     like 0.2 will make it more focused / deterministic. If set to 0, the model
+     will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
+     to automatically increase the temperature until certain thresholds are hit.
+     """
+
+     top_p: Optional[float] = None
+     """Enables nucleus (top-p) sampling, where tokens are selected from the
+     smallest possible set whose cumulative probability exceeds `p`.
+     """
+
+     top_k: Optional[int] = None
+     """Limits sampling to the `k` most probable tokens at each step."""
+
+     min_p: Optional[float] = None
+     """Filters out tokens with a probability lower than `min_p`, ensuring a
+     minimum likelihood threshold during sampling.
+     """
+
+     seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+     """The seed to use for sampling."""
+
+     frequency_penalty: Optional[float] = 0.0
+     """The frequency penalty to use for sampling."""
+
+     repetition_penalty: Optional[float] = None
+     """The repetition penalty to use for sampling."""
+
+     presence_penalty: Optional[float] = 0.0
+     """The presence penalty to use for sampling."""
+     # --8<-- [end:transcription-sampling-params]
+
+     # Default sampling parameters for transcription requests.
+     _DEFAULT_SAMPLING_PARAMS: dict = {
+         "repetition_penalty": 1.0,
+         "temperature": 1.0,
+         "top_p": 1.0,
+         "top_k": 0,
+         "min_p": 0.0,
+     }
+
+     def to_sampling_params(
+             self,
+             default_max_tokens: int,
+             default_sampling_params: Optional[dict] = None) -> SamplingParams:
+
+         max_tokens = default_max_tokens
+
+         if default_sampling_params is None:
+             default_sampling_params = {}
+
+         # Default parameters
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+         if (top_p := self.top_p) is None:
+             top_p = default_sampling_params.get(
+                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+         if (top_k := self.top_k) is None:
+             top_k = default_sampling_params.get(
+                 "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+         if (min_p := self.min_p) is None:
+             min_p = default_sampling_params.get(
+                 "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+
+         if (repetition_penalty := self.repetition_penalty) is None:
+             repetition_penalty = default_sampling_params.get(
+                 "repetition_penalty",
+                 self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"])
+
+         return SamplingParams.from_optional(temperature=temperature,
+                                             max_tokens=max_tokens,
+                                             seed=self.seed,
+                                             top_p=top_p,
+                                             top_k=top_k,
+                                             min_p=min_p,
+                                             frequency_penalty=self.frequency_penalty,
+                                             repetition_penalty=repetition_penalty,
+                                             presence_penalty=self.presence_penalty,
+                                             output_kind=RequestOutputKind.DELTA
+                                             if self.stream \
+                                             else RequestOutputKind.FINAL_ONLY,
+                                             extra_args=self.vllm_xargs)
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_transcription_request(cls, data):
+         if isinstance(data.get("file"), str):
+             raise HTTPException(
+                 status_code=HTTPStatus.UNPROCESSABLE_ENTITY,
+                 detail="Expected 'file' to be a file-like object, not 'str'.",
+             )
+
+         stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
+         stream = data.get("stream", False)
+         if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
+             raise ValueError(
+                 "Stream options can only be defined when `stream=True`.")
+
+         return data
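+     # Hypothetical usage sketch (editor's assumption; the model name is a
+     # placeholder): the endpoint accepts multipart form data, e.g.
+     #     curl .../v1/audio/transcriptions -F file=@audio.wav \
+     #          -F model=my-whisper-model -F response_format=verbose_json \
+     #          -F "timestamp_granularities[]=word"
+     # Passing `file` as a plain string is rejected by the validator above.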
1834
+
1835
+
1836
+ # Transcription response objects
1837
+ class TranscriptionResponse(OpenAIBaseModel):
1838
+ text: str
1839
+ """The transcribed text."""
1840
+
1841
+
1842
+ class TranscriptionWord(OpenAIBaseModel):
1843
+ end: float
1844
+ """End time of the word in seconds."""
1845
+
1846
+ start: float
1847
+ """Start time of the word in seconds."""
1848
+
1849
+ word: str
1850
+ """The text content of the word."""
1851
+
1852
+
1853
+ class TranscriptionSegment(OpenAIBaseModel):
1854
+ id: int
1855
+ """Unique identifier of the segment."""
1856
+
1857
+ avg_logprob: float
1858
+ """Average logprob of the segment.
1859
+
1860
+ If the value is lower than -1, consider the logprobs failed.
1861
+ """
1862
+
1863
+ compression_ratio: float
1864
+ """Compression ratio of the segment.
1865
+
1866
+ If the value is greater than 2.4, consider the compression failed.
1867
+ """
1868
+
1869
+ end: float
1870
+ """End time of the segment in seconds."""
1871
+
1872
+ no_speech_prob: float
1873
+ """Probability of no speech in the segment.
1874
+
1875
+ If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
1876
+ this segment silent.
1877
+ """
1878
+
1879
+ seek: int
1880
+ """Seek offset of the segment."""
1881
+
1882
+ start: float
1883
+ """Start time of the segment in seconds."""
1884
+
1885
+ temperature: float
1886
+ """Temperature parameter used for generating the segment."""
1887
+
1888
+ text: str
1889
+ """Text content of the segment."""
1890
+
1891
+ tokens: list[int]
1892
+ """Array of token IDs for the text content."""
1893
+
1894
+
1895
+ class TranscriptionResponseVerbose(OpenAIBaseModel):
1896
+ duration: str
1897
+ """The duration of the input audio."""
1898
+
1899
+ language: str
1900
+ """The language of the input audio."""
1901
+
1902
+ text: str
1903
+ """The transcribed text."""
1904
+
1905
+ segments: Optional[list[TranscriptionSegment]] = None
1906
+ """Segments of the transcribed text and their corresponding details."""
1907
+
1908
+ words: Optional[list[TranscriptionWord]] = None
1909
+ """Extracted words and their corresponding timestamps."""
1910
+
1911
+
1912
+ class TranslationResponseStreamChoice(OpenAIBaseModel):
1913
+ delta: DeltaMessage
1914
+ finish_reason: Optional[str] = None
1915
+ stop_reason: Optional[Union[int, str]] = None
1916
+
1917
+
1918
+ class TranslationStreamResponse(OpenAIBaseModel):
1919
+ id: str = Field(default_factory=lambda: f"trsl-{random_uuid()}")
1920
+ object: Literal["translation.chunk"] = "translation.chunk"
1921
+ created: int = Field(default_factory=lambda: int(time.time()))
1922
+ model: str
1923
+ choices: list[TranslationResponseStreamChoice]
1924
+ usage: Optional[UsageInfo] = Field(default=None)
1925
+
1926
+
+ class TranslationRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/audio/createTranslation
+
+     file: UploadFile
+     """
+     The audio file object (not file name) to translate, in one of these
+     formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+     """
+
+     model: Optional[str] = None
+     """ID of the model to use."""
+
+     prompt: str = Field(default="")
+     """An optional text to guide the model's style or continue a previous audio
+     segment.
+
+     The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+     should match the audio language.
+     """
+
+     response_format: AudioResponseFormat = Field(default="json")
+     """
+     The format of the output, in one of these options: `json`, `text`, `srt`,
+     `verbose_json`, or `vtt`.
+     """
+
+     # TODO support additional sampling parameters
+     # --8<-- [start:translation-sampling-params]
+     temperature: float = Field(default=0.0)
+     """The sampling temperature, between 0 and 1.
+
+     Higher values like 0.8 will make the output more random, while lower values
+     like 0.2 will make it more focused / deterministic. If set to 0, the model
+     will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
+     to automatically increase the temperature until certain thresholds are hit.
+     """
+     # --8<-- [end:translation-sampling-params]
+
+     # --8<-- [start:translation-extra-params]
+     language: Optional[str] = None
+     """The language of the input audio to translate from.
+
+     Supplying the input language in
+     [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
+     will improve accuracy.
+     """
+
+     stream: Optional[bool] = False
+     """Custom field not present in the original OpenAI definition. When set,
+     it will enable output to be streamed in a similar fashion to the Chat
+     Completions endpoint.
+     """
+     # Flattened stream options to simplify form data.
+     stream_include_usage: Optional[bool] = False
+     stream_continuous_usage_stats: Optional[bool] = False
+     # --8<-- [end:translation-extra-params]
+
+     # Default sampling parameters for translation requests.
+     _DEFAULT_SAMPLING_PARAMS: dict = {
+         "temperature": 0,
+     }
+
+     def to_sampling_params(
+             self,
+             default_max_tokens: int,
+             default_sampling_params: Optional[dict] = None) -> SamplingParams:
+         max_tokens = default_max_tokens
+
+         if default_sampling_params is None:
+             default_sampling_params = {}
+         # Default parameters
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+
+         return SamplingParams.from_optional(
+             temperature=temperature,
+             max_tokens=max_tokens,
+             output_kind=RequestOutputKind.DELTA
+             if self.stream else RequestOutputKind.FINAL_ONLY)
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_stream_options(cls, data):
+         stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
+         stream = data.get("stream", False)
+         if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
+             raise ValueError(
+                 "Stream options can only be defined when `stream=True`.")
+
+         return data
+
+
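A short usage sketch of the conversion above. `default_max_tokens` is supplied by the caller (in practice presumably derived from the model's context length, which is an assumption here), and `audio_file` stands in for a real UploadFile:

req = TranslationRequest(file=audio_file, stream=True)  # audio_file: UploadFile
params = req.to_sampling_params(default_max_tokens=448)
# Streaming requests ask for incremental DELTA outputs; non-streaming
# requests get FINAL_ONLY, per the return statement above.
assert params.output_kind == RequestOutputKind.DELTA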
+ # Translation response objects
+ class TranslationResponse(OpenAIBaseModel):
+     text: str
+     """The translated text."""
+
+
+ class TranslationWord(OpenAIBaseModel):
+     end: float
+     """End time of the word in seconds."""
+
+     start: float
+     """Start time of the word in seconds."""
+
+     word: str
+     """The text content of the word."""
+
+
+ class TranslationSegment(OpenAIBaseModel):
+     id: int
+     """Unique identifier of the segment."""
+
+     avg_logprob: float
+     """Average logprob of the segment.
+
+     If the value is lower than -1, consider the logprobs failed.
+     """
+
+     compression_ratio: float
+     """Compression ratio of the segment.
+
+     If the value is greater than 2.4, consider the compression failed.
+     """
+
+     end: float
+     """End time of the segment in seconds."""
+
+     no_speech_prob: float
+     """Probability of no speech in the segment.
+
+     If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
+     this segment silent.
+     """
+
+     seek: int
+     """Seek offset of the segment."""
+
+     start: float
+     """Start time of the segment in seconds."""
+
+     temperature: float
+     """Temperature parameter used for generating the segment."""
+
+     text: str
+     """Text content of the segment."""
+
+     tokens: list[int]
+     """Array of token IDs for the text content."""
+
+
+ class TranslationResponseVerbose(OpenAIBaseModel):
+     duration: str
+     """The duration of the input audio."""
+
+     language: str
+     """The language of the input audio."""
+
+     text: str
+     """The translated text."""
+
+     segments: Optional[list[TranslationSegment]] = None
+     """Segments of the translated text and their corresponding details."""
+
+     words: Optional[list[TranslationWord]] = None
+     """Extracted words and their corresponding timestamps."""