vllm_cpu-0.11.0.post2-cp312-cp312-manylinux_2_17_x86_64.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
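
A listing like the one below can be cross-checked locally. As a minimal sketch, assuming the wheel has already been downloaded (for example with `pip download vllm-cpu==0.11.0.post2`, if the package is published under that name), Python's standard library is enough to enumerate a wheel's files, since a .whl is an ordinary ZIP archive:

    import zipfile

    # Filename taken from the title above; adjust the path if the wheel
    # was downloaded elsewhere.
    WHEEL = "vllm_cpu-0.11.0.post2-cp312-cp312-manylinux_2_17_x86_64.whl"

    with zipfile.ZipFile(WHEEL) as whl:
        for info in whl.infolist():
            # Each archive entry corresponds to one line in the
            # "Files changed" list below.
            print(f"{info.filename} ({info.file_size} bytes)")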
Files changed (1398)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +220 -0
  3. vllm/_bc_linter.py +59 -0
  4. vllm/_custom_ops.py +2044 -0
  5. vllm/_ipex_ops.py +393 -0
  6. vllm/_version.py +34 -0
  7. vllm/assets/__init__.py +0 -0
  8. vllm/assets/audio.py +45 -0
  9. vllm/assets/base.py +41 -0
  10. vllm/assets/image.py +50 -0
  11. vllm/assets/video.py +145 -0
  12. vllm/attention/__init__.py +15 -0
  13. vllm/attention/backends/__init__.py +0 -0
  14. vllm/attention/backends/abstract.py +204 -0
  15. vllm/attention/backends/utils.py +33 -0
  16. vllm/attention/layer.py +645 -0
  17. vllm/attention/layers/__init__.py +0 -0
  18. vllm/attention/layers/chunked_local_attention.py +93 -0
  19. vllm/attention/layers/cross_attention.py +162 -0
  20. vllm/attention/layers/encoder_only_attention.py +86 -0
  21. vllm/attention/ops/__init__.py +0 -0
  22. vllm/attention/ops/chunked_prefill_paged_decode.py +405 -0
  23. vllm/attention/ops/common.py +345 -0
  24. vllm/attention/ops/flashmla.py +192 -0
  25. vllm/attention/ops/merge_attn_states.py +43 -0
  26. vllm/attention/ops/paged_attn.py +262 -0
  27. vllm/attention/ops/pallas_kv_cache_update.py +124 -0
  28. vllm/attention/ops/prefix_prefill.py +928 -0
  29. vllm/attention/ops/rocm_aiter_mla.py +104 -0
  30. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  31. vllm/attention/ops/triton_decode_attention.py +691 -0
  32. vllm/attention/ops/triton_flash_attention.py +984 -0
  33. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  34. vllm/attention/ops/triton_reshape_and_cache_flash.py +175 -0
  35. vllm/attention/ops/triton_unified_attention.py +894 -0
  36. vllm/attention/selector.py +245 -0
  37. vllm/attention/utils/__init__.py +0 -0
  38. vllm/attention/utils/fa_utils.py +85 -0
  39. vllm/attention/utils/kv_sharing_utils.py +33 -0
  40. vllm/beam_search.py +87 -0
  41. vllm/benchmarks/__init__.py +0 -0
  42. vllm/benchmarks/datasets.py +2723 -0
  43. vllm/benchmarks/latency.py +170 -0
  44. vllm/benchmarks/lib/__init__.py +3 -0
  45. vllm/benchmarks/lib/endpoint_request_func.py +533 -0
  46. vllm/benchmarks/lib/ready_checker.py +73 -0
  47. vllm/benchmarks/lib/utils.py +80 -0
  48. vllm/benchmarks/serve.py +1358 -0
  49. vllm/benchmarks/throughput.py +696 -0
  50. vllm/collect_env.py +823 -0
  51. vllm/compilation/__init__.py +0 -0
  52. vllm/compilation/activation_quant_fusion.py +189 -0
  53. vllm/compilation/backends.py +650 -0
  54. vllm/compilation/base_static_graph.py +56 -0
  55. vllm/compilation/collective_fusion.py +1188 -0
  56. vllm/compilation/compiler_interface.py +573 -0
  57. vllm/compilation/counter.py +47 -0
  58. vllm/compilation/cuda_graph.py +199 -0
  59. vllm/compilation/cuda_piecewise_backend.py +117 -0
  60. vllm/compilation/decorators.py +400 -0
  61. vllm/compilation/fix_functionalization.py +205 -0
  62. vllm/compilation/fusion.py +383 -0
  63. vllm/compilation/fusion_attn.py +295 -0
  64. vllm/compilation/fx_utils.py +84 -0
  65. vllm/compilation/inductor_pass.py +136 -0
  66. vllm/compilation/monitor.py +57 -0
  67. vllm/compilation/noop_elimination.py +158 -0
  68. vllm/compilation/pass_manager.py +125 -0
  69. vllm/compilation/post_cleanup.py +20 -0
  70. vllm/compilation/sequence_parallelism.py +478 -0
  71. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  72. vllm/compilation/vllm_inductor_pass.py +156 -0
  73. vllm/compilation/wrapper.py +136 -0
  74. vllm/config/__init__.py +814 -0
  75. vllm/config/cache.py +220 -0
  76. vllm/config/compilation.py +673 -0
  77. vllm/config/device.py +74 -0
  78. vllm/config/kv_events.py +50 -0
  79. vllm/config/kv_transfer.py +111 -0
  80. vllm/config/load.py +113 -0
  81. vllm/config/lora.py +132 -0
  82. vllm/config/model.py +1912 -0
  83. vllm/config/multimodal.py +129 -0
  84. vllm/config/observability.py +99 -0
  85. vllm/config/parallel.py +524 -0
  86. vllm/config/pooler.py +97 -0
  87. vllm/config/scheduler.py +287 -0
  88. vllm/config/speculative.py +568 -0
  89. vllm/config/speech_to_text.py +39 -0
  90. vllm/config/structured_outputs.py +64 -0
  91. vllm/config/utils.py +145 -0
  92. vllm/connections.py +186 -0
  93. vllm/device_allocator/__init__.py +0 -0
  94. vllm/device_allocator/cumem.py +311 -0
  95. vllm/distributed/__init__.py +6 -0
  96. vllm/distributed/communication_op.py +41 -0
  97. vllm/distributed/device_communicators/__init__.py +0 -0
  98. vllm/distributed/device_communicators/all2all.py +440 -0
  99. vllm/distributed/device_communicators/all_reduce_utils.py +317 -0
  100. vllm/distributed/device_communicators/base_device_communicator.py +295 -0
  101. vllm/distributed/device_communicators/cpu_communicator.py +201 -0
  102. vllm/distributed/device_communicators/cuda_communicator.py +323 -0
  103. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  104. vllm/distributed/device_communicators/custom_all_reduce.py +311 -0
  105. vllm/distributed/device_communicators/mnnvl_compat.py +28 -0
  106. vllm/distributed/device_communicators/pynccl.py +340 -0
  107. vllm/distributed/device_communicators/pynccl_allocator.py +186 -0
  108. vllm/distributed/device_communicators/pynccl_wrapper.py +416 -0
  109. vllm/distributed/device_communicators/quick_all_reduce.py +278 -0
  110. vllm/distributed/device_communicators/ray_communicator.py +258 -0
  111. vllm/distributed/device_communicators/shm_broadcast.py +589 -0
  112. vllm/distributed/device_communicators/shm_object_storage.py +635 -0
  113. vllm/distributed/device_communicators/symm_mem.py +136 -0
  114. vllm/distributed/device_communicators/tpu_communicator.py +102 -0
  115. vllm/distributed/device_communicators/xpu_communicator.py +94 -0
  116. vllm/distributed/eplb/__init__.py +8 -0
  117. vllm/distributed/eplb/eplb_state.py +620 -0
  118. vllm/distributed/eplb/rebalance_algo.py +239 -0
  119. vllm/distributed/eplb/rebalance_execute.py +424 -0
  120. vllm/distributed/kv_events.py +362 -0
  121. vllm/distributed/kv_transfer/README.md +29 -0
  122. vllm/distributed/kv_transfer/__init__.py +13 -0
  123. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  124. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  125. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  126. vllm/distributed/kv_transfer/kv_connector/factory.py +113 -0
  127. vllm/distributed/kv_transfer/kv_connector/utils.py +261 -0
  128. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  129. vllm/distributed/kv_transfer/kv_connector/v1/base.py +388 -0
  130. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +168 -0
  131. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +100 -0
  132. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +328 -0
  133. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1473 -0
  134. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +485 -0
  135. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +488 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +550 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +267 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +418 -0
  140. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  141. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  142. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  143. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  144. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  145. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  146. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +290 -0
  147. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  148. vllm/distributed/kv_transfer/kv_transfer_state.py +73 -0
  149. vllm/distributed/parallel_state.py +1532 -0
  150. vllm/distributed/tpu_distributed_utils.py +178 -0
  151. vllm/distributed/utils.py +536 -0
  152. vllm/engine/__init__.py +0 -0
  153. vllm/engine/arg_utils.py +1778 -0
  154. vllm/engine/async_llm_engine.py +6 -0
  155. vllm/engine/llm_engine.py +6 -0
  156. vllm/engine/metrics.py +577 -0
  157. vllm/engine/metrics_types.py +84 -0
  158. vllm/engine/protocol.py +333 -0
  159. vllm/entrypoints/__init__.py +0 -0
  160. vllm/entrypoints/api_server.py +178 -0
  161. vllm/entrypoints/chat_utils.py +1705 -0
  162. vllm/entrypoints/cli/__init__.py +12 -0
  163. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  164. vllm/entrypoints/cli/benchmark/base.py +25 -0
  165. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  166. vllm/entrypoints/cli/benchmark/main.py +55 -0
  167. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  168. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  169. vllm/entrypoints/cli/collect_env.py +36 -0
  170. vllm/entrypoints/cli/main.py +60 -0
  171. vllm/entrypoints/cli/openai.py +233 -0
  172. vllm/entrypoints/cli/run_batch.py +67 -0
  173. vllm/entrypoints/cli/serve.py +232 -0
  174. vllm/entrypoints/cli/types.py +29 -0
  175. vllm/entrypoints/constants.py +10 -0
  176. vllm/entrypoints/context.py +481 -0
  177. vllm/entrypoints/harmony_utils.py +436 -0
  178. vllm/entrypoints/launcher.py +164 -0
  179. vllm/entrypoints/llm.py +1629 -0
  180. vllm/entrypoints/logger.py +79 -0
  181. vllm/entrypoints/openai/__init__.py +0 -0
  182. vllm/entrypoints/openai/api_server.py +1953 -0
  183. vllm/entrypoints/openai/cli_args.py +288 -0
  184. vllm/entrypoints/openai/logits_processors.py +90 -0
  185. vllm/entrypoints/openai/protocol.py +2757 -0
  186. vllm/entrypoints/openai/run_batch.py +491 -0
  187. vllm/entrypoints/openai/serving_chat.py +1597 -0
  188. vllm/entrypoints/openai/serving_classification.py +173 -0
  189. vllm/entrypoints/openai/serving_completion.py +692 -0
  190. vllm/entrypoints/openai/serving_embedding.py +631 -0
  191. vllm/entrypoints/openai/serving_engine.py +992 -0
  192. vllm/entrypoints/openai/serving_models.py +288 -0
  193. vllm/entrypoints/openai/serving_pooling.py +276 -0
  194. vllm/entrypoints/openai/serving_responses.py +1709 -0
  195. vllm/entrypoints/openai/serving_score.py +479 -0
  196. vllm/entrypoints/openai/serving_tokenization.py +196 -0
  197. vllm/entrypoints/openai/serving_transcription.py +136 -0
  198. vllm/entrypoints/openai/speech_to_text.py +388 -0
  199. vllm/entrypoints/openai/tool_parsers/__init__.py +55 -0
  200. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  201. vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py +367 -0
  202. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  203. vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +185 -0
  204. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  205. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  206. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +455 -0
  207. vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py +372 -0
  208. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  209. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  210. vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +377 -0
  211. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  212. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +269 -0
  213. vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +39 -0
  214. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +816 -0
  215. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  216. vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +93 -0
  217. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  218. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  219. vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +707 -0
  220. vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py +1137 -0
  221. vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +679 -0
  222. vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py +296 -0
  223. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  224. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +524 -0
  225. vllm/entrypoints/renderer.py +395 -0
  226. vllm/entrypoints/score_utils.py +232 -0
  227. vllm/entrypoints/ssl.py +75 -0
  228. vllm/entrypoints/tool.py +139 -0
  229. vllm/entrypoints/tool_server.py +206 -0
  230. vllm/entrypoints/utils.py +233 -0
  231. vllm/env_override.py +23 -0
  232. vllm/envs.py +1590 -0
  233. vllm/executor/__init__.py +0 -0
  234. vllm/executor/executor_base.py +381 -0
  235. vllm/executor/msgspec_utils.py +35 -0
  236. vllm/executor/ray_distributed_executor.py +699 -0
  237. vllm/executor/ray_utils.py +410 -0
  238. vllm/executor/uniproc_executor.py +176 -0
  239. vllm/forward_context.py +402 -0
  240. vllm/inputs/__init__.py +30 -0
  241. vllm/inputs/data.py +356 -0
  242. vllm/inputs/parse.py +151 -0
  243. vllm/inputs/preprocess.py +664 -0
  244. vllm/logger.py +229 -0
  245. vllm/logging_utils/__init__.py +10 -0
  246. vllm/logging_utils/dump_input.py +81 -0
  247. vllm/logging_utils/formatter.py +79 -0
  248. vllm/logging_utils/log_time.py +32 -0
  249. vllm/logits_process.py +119 -0
  250. vllm/logprobs.py +28 -0
  251. vllm/lora/__init__.py +0 -0
  252. vllm/lora/layers/__init__.py +34 -0
  253. vllm/lora/layers/base.py +69 -0
  254. vllm/lora/layers/base_linear.py +185 -0
  255. vllm/lora/layers/column_parallel_linear.py +609 -0
  256. vllm/lora/layers/logits_processor.py +247 -0
  257. vllm/lora/layers/qkv_x_parallel_linear.py +8 -0
  258. vllm/lora/layers/replicated_linear.py +60 -0
  259. vllm/lora/layers/row_parallel_linear.py +196 -0
  260. vllm/lora/layers/utils.py +65 -0
  261. vllm/lora/layers/vocal_parallel_embedding.py +174 -0
  262. vllm/lora/lora_weights.py +199 -0
  263. vllm/lora/models.py +816 -0
  264. vllm/lora/ops/__init__.py +0 -0
  265. vllm/lora/ops/ipex_ops/__init__.py +7 -0
  266. vllm/lora/ops/ipex_ops/lora_ops.py +44 -0
  267. vllm/lora/ops/torch_ops/__init__.py +16 -0
  268. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  269. vllm/lora/ops/triton_ops/__init__.py +12 -0
  270. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  271. vllm/lora/ops/triton_ops/lora_expand_op.py +289 -0
  272. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  273. vllm/lora/ops/triton_ops/lora_shrink_op.py +243 -0
  274. vllm/lora/ops/triton_ops/utils.py +126 -0
  275. vllm/lora/ops/xla_ops/__init__.py +7 -0
  276. vllm/lora/ops/xla_ops/lora_ops.py +144 -0
  277. vllm/lora/peft_helper.py +127 -0
  278. vllm/lora/punica_wrapper/__init__.py +10 -0
  279. vllm/lora/punica_wrapper/punica_base.py +458 -0
  280. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  281. vllm/lora/punica_wrapper/punica_gpu.py +272 -0
  282. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  283. vllm/lora/punica_wrapper/punica_tpu.py +391 -0
  284. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  285. vllm/lora/punica_wrapper/utils.py +136 -0
  286. vllm/lora/request.py +97 -0
  287. vllm/lora/resolver.py +85 -0
  288. vllm/lora/utils.py +246 -0
  289. vllm/lora/worker_manager.py +267 -0
  290. vllm/model_executor/__init__.py +12 -0
  291. vllm/model_executor/custom_op.py +194 -0
  292. vllm/model_executor/layers/__init__.py +0 -0
  293. vllm/model_executor/layers/activation.py +575 -0
  294. vllm/model_executor/layers/attention_layer_base.py +23 -0
  295. vllm/model_executor/layers/fla/__init__.py +8 -0
  296. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  297. vllm/model_executor/layers/fla/ops/chunk.py +225 -0
  298. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +290 -0
  299. vllm/model_executor/layers/fla/ops/chunk_o.py +177 -0
  300. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +140 -0
  301. vllm/model_executor/layers/fla/ops/cumsum.py +226 -0
  302. vllm/model_executor/layers/fla/ops/fused_recurrent.py +366 -0
  303. vllm/model_executor/layers/fla/ops/index.py +39 -0
  304. vllm/model_executor/layers/fla/ops/l2norm.py +143 -0
  305. vllm/model_executor/layers/fla/ops/layernorm_guard.py +337 -0
  306. vllm/model_executor/layers/fla/ops/op.py +39 -0
  307. vllm/model_executor/layers/fla/ops/solve_tril.py +365 -0
  308. vllm/model_executor/layers/fla/ops/utils.py +180 -0
  309. vllm/model_executor/layers/fla/ops/wy_fast.py +114 -0
  310. vllm/model_executor/layers/fused_moe/__init__.py +89 -0
  311. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +322 -0
  312. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +141 -0
  313. vllm/model_executor/layers/fused_moe/config.py +804 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  545. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +300 -0
  546. vllm/model_executor/layers/fused_moe/cutlass_moe.py +957 -0
  547. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +362 -0
  548. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +413 -0
  549. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +361 -0
  550. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +274 -0
  551. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +268 -0
  552. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +300 -0
  553. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +184 -0
  554. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +993 -0
  555. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +239 -0
  556. vllm/model_executor/layers/fused_moe/fused_moe.py +1890 -0
  557. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +307 -0
  558. vllm/model_executor/layers/fused_moe/layer.py +2195 -0
  559. vllm/model_executor/layers/fused_moe/modular_kernel.py +1038 -0
  560. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +87 -0
  561. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  562. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +205 -0
  563. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  564. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +341 -0
  565. vllm/model_executor/layers/fused_moe/prepare_finalize.py +70 -0
  566. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +424 -0
  567. vllm/model_executor/layers/fused_moe/routing_simulator.py +291 -0
  568. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +146 -0
  569. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +143 -0
  570. vllm/model_executor/layers/fused_moe/trtllm_moe.py +191 -0
  571. vllm/model_executor/layers/fused_moe/utils.py +274 -0
  572. vllm/model_executor/layers/layernorm.py +395 -0
  573. vllm/model_executor/layers/lightning_attn.py +661 -0
  574. vllm/model_executor/layers/linear.py +1603 -0
  575. vllm/model_executor/layers/logits_processor.py +106 -0
  576. vllm/model_executor/layers/mamba/__init__.py +0 -0
  577. vllm/model_executor/layers/mamba/abstract.py +42 -0
  578. vllm/model_executor/layers/mamba/linear_attn.py +403 -0
  579. vllm/model_executor/layers/mamba/mamba_mixer.py +466 -0
  580. vllm/model_executor/layers/mamba/mamba_mixer2.py +764 -0
  581. vllm/model_executor/layers/mamba/mamba_utils.py +186 -0
  582. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  583. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1092 -0
  584. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +168 -0
  585. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  586. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +242 -0
  587. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +527 -0
  588. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +724 -0
  589. vllm/model_executor/layers/mamba/ops/ssd_combined.py +238 -0
  590. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +200 -0
  591. vllm/model_executor/layers/mamba/short_conv.py +253 -0
  592. vllm/model_executor/layers/mla.py +173 -0
  593. vllm/model_executor/layers/pooler.py +719 -0
  594. vllm/model_executor/layers/quantization/__init__.py +157 -0
  595. vllm/model_executor/layers/quantization/auto_round.py +388 -0
  596. vllm/model_executor/layers/quantization/awq.py +228 -0
  597. vllm/model_executor/layers/quantization/awq_marlin.py +554 -0
  598. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  599. vllm/model_executor/layers/quantization/base_config.py +170 -0
  600. vllm/model_executor/layers/quantization/bitblas.py +464 -0
  601. vllm/model_executor/layers/quantization/bitsandbytes.py +627 -0
  602. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  603. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +797 -0
  604. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2074 -0
  605. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +27 -0
  606. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +366 -0
  607. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  608. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  609. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +105 -0
  610. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +185 -0
  611. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +169 -0
  612. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +135 -0
  613. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  614. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +157 -0
  615. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  616. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  617. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  618. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +238 -0
  619. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +153 -0
  620. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  621. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +46 -0
  622. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  623. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  624. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  625. vllm/model_executor/layers/quantization/deepspeedfp.py +196 -0
  626. vllm/model_executor/layers/quantization/experts_int8.py +223 -0
  627. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  628. vllm/model_executor/layers/quantization/fp8.py +1098 -0
  629. vllm/model_executor/layers/quantization/gguf.py +599 -0
  630. vllm/model_executor/layers/quantization/gptq.py +340 -0
  631. vllm/model_executor/layers/quantization/gptq_bitblas.py +448 -0
  632. vllm/model_executor/layers/quantization/gptq_marlin.py +751 -0
  633. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  634. vllm/model_executor/layers/quantization/hqq_marlin.py +333 -0
  635. vllm/model_executor/layers/quantization/inc.py +61 -0
  636. vllm/model_executor/layers/quantization/input_quant_fp8.py +156 -0
  637. vllm/model_executor/layers/quantization/ipex_quant.py +415 -0
  638. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  639. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +91 -0
  640. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +93 -0
  641. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  642. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +302 -0
  643. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +92 -0
  644. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +117 -0
  645. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +92 -0
  646. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  647. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +144 -0
  648. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +139 -0
  649. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  650. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +89 -0
  651. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +161 -0
  652. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +206 -0
  653. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  654. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  655. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
  656. vllm/model_executor/layers/quantization/kv_cache.py +143 -0
  657. vllm/model_executor/layers/quantization/modelopt.py +1596 -0
  658. vllm/model_executor/layers/quantization/moe_wna16.py +484 -0
  659. vllm/model_executor/layers/quantization/mxfp4.py +988 -0
  660. vllm/model_executor/layers/quantization/petit.py +306 -0
  661. vllm/model_executor/layers/quantization/ptpc_fp8.py +129 -0
  662. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  663. vllm/model_executor/layers/quantization/quark/quark.py +432 -0
  664. vllm/model_executor/layers/quantization/quark/quark_moe.py +561 -0
  665. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  666. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  667. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +239 -0
  668. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +163 -0
  669. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  670. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  671. vllm/model_executor/layers/quantization/rtn.py +466 -0
  672. vllm/model_executor/layers/quantization/schema.py +86 -0
  673. vllm/model_executor/layers/quantization/torchao.py +214 -0
  674. vllm/model_executor/layers/quantization/tpu_int8.py +125 -0
  675. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  676. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  677. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +210 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  763. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  764. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  765. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  766. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  767. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  768. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  769. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  770. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  771. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  772. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  773. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  774. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  775. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  776. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  777. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  778. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  779. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  780. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  781. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  786. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  888. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  889. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +79 -0
  890. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +248 -0
  891. vllm/model_executor/layers/quantization/utils/fp8_utils.py +949 -0
  892. vllm/model_executor/layers/quantization/utils/gptq_utils.py +146 -0
  893. vllm/model_executor/layers/quantization/utils/int8_utils.py +492 -0
  894. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  895. vllm/model_executor/layers/quantization/utils/machete_utils.py +50 -0
  896. vllm/model_executor/layers/quantization/utils/marlin_utils.py +479 -0
  897. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +396 -0
  898. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +345 -0
  899. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  900. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  901. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +141 -0
  902. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +20 -0
  903. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +137 -0
  904. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +59 -0
  905. vllm/model_executor/layers/quantization/utils/petit_utils.py +122 -0
  906. vllm/model_executor/layers/quantization/utils/quant_utils.py +641 -0
  907. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +458 -0
  908. vllm/model_executor/layers/resampler.py +270 -0
  909. vllm/model_executor/layers/rotary_embedding/__init__.py +204 -0
  910. vllm/model_executor/layers/rotary_embedding/base.py +177 -0
  911. vllm/model_executor/layers/rotary_embedding/common.py +150 -0
  912. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +138 -0
  913. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +197 -0
  914. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +41 -0
  915. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +67 -0
  916. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +80 -0
  917. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  918. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  919. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +81 -0
  920. vllm/model_executor/layers/rotary_embedding/mrope.py +1321 -0
  921. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +42 -0
  922. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +129 -0
  923. vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py +86 -0
  924. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +68 -0
  925. vllm/model_executor/layers/shared_fused_moe/__init__.py +6 -0
  926. vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py +56 -0
  927. vllm/model_executor/layers/utils.py +195 -0
  928. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  929. vllm/model_executor/model_loader/__init__.py +138 -0
  930. vllm/model_executor/model_loader/base_loader.py +52 -0
  931. vllm/model_executor/model_loader/bitsandbytes_loader.py +788 -0
  932. vllm/model_executor/model_loader/default_loader.py +277 -0
  933. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  934. vllm/model_executor/model_loader/gguf_loader.py +155 -0
  935. vllm/model_executor/model_loader/runai_streamer_loader.py +104 -0
  936. vllm/model_executor/model_loader/sharded_state_loader.py +199 -0
  937. vllm/model_executor/model_loader/tensorizer.py +738 -0
  938. vllm/model_executor/model_loader/tensorizer_loader.py +143 -0
  939. vllm/model_executor/model_loader/tpu.py +114 -0
  940. vllm/model_executor/model_loader/utils.py +292 -0
  941. vllm/model_executor/model_loader/weight_utils.py +990 -0
  942. vllm/model_executor/models/__init__.py +33 -0
  943. vllm/model_executor/models/adapters.py +542 -0
  944. vllm/model_executor/models/aimv2.py +246 -0
  945. vllm/model_executor/models/apertus.py +579 -0
  946. vllm/model_executor/models/arcee.py +422 -0
  947. vllm/model_executor/models/arctic.py +558 -0
  948. vllm/model_executor/models/aria.py +650 -0
  949. vllm/model_executor/models/aya_vision.py +468 -0
  950. vllm/model_executor/models/baichuan.py +474 -0
  951. vllm/model_executor/models/bailing_moe.py +642 -0
  952. vllm/model_executor/models/bamba.py +514 -0
  953. vllm/model_executor/models/bert.py +665 -0
  954. vllm/model_executor/models/bert_with_rope.py +687 -0
  955. vllm/model_executor/models/blip.py +339 -0
  956. vllm/model_executor/models/blip2.py +712 -0
  957. vllm/model_executor/models/bloom.py +374 -0
  958. vllm/model_executor/models/chameleon.py +1139 -0
  959. vllm/model_executor/models/chatglm.py +476 -0
  960. vllm/model_executor/models/clip.py +407 -0
  961. vllm/model_executor/models/cohere2_vision.py +481 -0
  962. vllm/model_executor/models/commandr.py +465 -0
  963. vllm/model_executor/models/config.py +445 -0
  964. vllm/model_executor/models/dbrx.py +471 -0
  965. vllm/model_executor/models/deepseek.py +497 -0
  966. vllm/model_executor/models/deepseek_eagle.py +240 -0
  967. vllm/model_executor/models/deepseek_mtp.py +289 -0
  968. vllm/model_executor/models/deepseek_v2.py +1444 -0
  969. vllm/model_executor/models/deepseek_vl2.py +658 -0
  970. vllm/model_executor/models/dots1.py +546 -0
  971. vllm/model_executor/models/dots_ocr.py +873 -0
  972. vllm/model_executor/models/ernie45.py +43 -0
  973. vllm/model_executor/models/ernie45_moe.py +607 -0
  974. vllm/model_executor/models/ernie45_vl.py +1527 -0
  975. vllm/model_executor/models/ernie45_vl_moe.py +727 -0
  976. vllm/model_executor/models/ernie_mtp.py +268 -0
  977. vllm/model_executor/models/exaone.py +550 -0
  978. vllm/model_executor/models/exaone4.py +533 -0
  979. vllm/model_executor/models/fairseq2_llama.py +154 -0
  980. vllm/model_executor/models/falcon.py +509 -0
  981. vllm/model_executor/models/falcon_h1.py +674 -0
  982. vllm/model_executor/models/fuyu.py +399 -0
  983. vllm/model_executor/models/gemma.py +425 -0
  984. vllm/model_executor/models/gemma2.py +422 -0
  985. vllm/model_executor/models/gemma3.py +555 -0
  986. vllm/model_executor/models/gemma3_mm.py +721 -0
  987. vllm/model_executor/models/gemma3n.py +1113 -0
  988. vllm/model_executor/models/gemma3n_mm.py +761 -0
  989. vllm/model_executor/models/glm.py +23 -0
  990. vllm/model_executor/models/glm4.py +304 -0
  991. vllm/model_executor/models/glm4_1v.py +1690 -0
  992. vllm/model_executor/models/glm4_moe.py +727 -0
  993. vllm/model_executor/models/glm4_moe_mtp.py +301 -0
  994. vllm/model_executor/models/glm4v.py +654 -0
  995. vllm/model_executor/models/gpt2.py +380 -0
  996. vllm/model_executor/models/gpt_bigcode.py +344 -0
  997. vllm/model_executor/models/gpt_j.py +339 -0
  998. vllm/model_executor/models/gpt_neox.py +330 -0
  999. vllm/model_executor/models/gpt_oss.py +712 -0
  1000. vllm/model_executor/models/granite.py +489 -0
  1001. vllm/model_executor/models/granite_speech.py +794 -0
  1002. vllm/model_executor/models/granitemoe.py +550 -0
  1003. vllm/model_executor/models/granitemoehybrid.py +614 -0
  1004. vllm/model_executor/models/granitemoeshared.py +332 -0
  1005. vllm/model_executor/models/gritlm.py +262 -0
  1006. vllm/model_executor/models/grok1.py +547 -0
  1007. vllm/model_executor/models/h2ovl.py +536 -0
  1008. vllm/model_executor/models/hunyuan_v1.py +1042 -0
  1009. vllm/model_executor/models/hyperclovax_vision.py +1192 -0
  1010. vllm/model_executor/models/idefics2_vision_model.py +417 -0
  1011. vllm/model_executor/models/idefics3.py +756 -0
  1012. vllm/model_executor/models/interfaces.py +959 -0
  1013. vllm/model_executor/models/interfaces_base.py +192 -0
  1014. vllm/model_executor/models/intern_vit.py +441 -0
  1015. vllm/model_executor/models/internlm2.py +450 -0
  1016. vllm/model_executor/models/internlm2_ve.py +148 -0
  1017. vllm/model_executor/models/interns1.py +838 -0
  1018. vllm/model_executor/models/interns1_vit.py +418 -0
  1019. vllm/model_executor/models/internvl.py +1423 -0
  1020. vllm/model_executor/models/jais.py +373 -0
  1021. vllm/model_executor/models/jamba.py +591 -0
  1022. vllm/model_executor/models/jina_vl.py +144 -0
  1023. vllm/model_executor/models/keye.py +1680 -0
  1024. vllm/model_executor/models/keye_vl1_5.py +602 -0
  1025. vllm/model_executor/models/kimi_vl.py +618 -0
  1026. vllm/model_executor/models/lfm2.py +548 -0
  1027. vllm/model_executor/models/llama.py +669 -0
  1028. vllm/model_executor/models/llama4.py +746 -0
  1029. vllm/model_executor/models/llama4_eagle.py +239 -0
  1030. vllm/model_executor/models/llama_eagle.py +179 -0
  1031. vllm/model_executor/models/llama_eagle3.py +296 -0
  1032. vllm/model_executor/models/llava.py +870 -0
  1033. vllm/model_executor/models/llava_next.py +571 -0
  1034. vllm/model_executor/models/llava_next_video.py +476 -0
  1035. vllm/model_executor/models/llava_onevision.py +942 -0
  1036. vllm/model_executor/models/longcat_flash.py +715 -0
  1037. vllm/model_executor/models/longcat_flash_mtp.py +352 -0
  1038. vllm/model_executor/models/mamba.py +275 -0
  1039. vllm/model_executor/models/mamba2.py +291 -0
  1040. vllm/model_executor/models/medusa.py +169 -0
  1041. vllm/model_executor/models/midashenglm.py +792 -0
  1042. vllm/model_executor/models/mimo.py +188 -0
  1043. vllm/model_executor/models/mimo_mtp.py +280 -0
  1044. vllm/model_executor/models/minicpm.py +631 -0
  1045. vllm/model_executor/models/minicpm3.py +230 -0
  1046. vllm/model_executor/models/minicpm_eagle.py +389 -0
  1047. vllm/model_executor/models/minicpmo.py +770 -0
  1048. vllm/model_executor/models/minicpmv.py +1784 -0
  1049. vllm/model_executor/models/minimax_text_01.py +986 -0
  1050. vllm/model_executor/models/minimax_vl_01.py +426 -0
  1051. vllm/model_executor/models/mistral3.py +628 -0
  1052. vllm/model_executor/models/mixtral.py +606 -0
  1053. vllm/model_executor/models/mllama4.py +1076 -0
  1054. vllm/model_executor/models/mlp_speculator.py +206 -0
  1055. vllm/model_executor/models/modernbert.py +374 -0
  1056. vllm/model_executor/models/module_mapping.py +72 -0
  1057. vllm/model_executor/models/molmo.py +1567 -0
  1058. vllm/model_executor/models/moonvit.py +673 -0
  1059. vllm/model_executor/models/motif.py +345 -0
  1060. vllm/model_executor/models/mpt.py +329 -0
  1061. vllm/model_executor/models/nano_nemotron_vl.py +1394 -0
  1062. vllm/model_executor/models/nemotron.py +507 -0
  1063. vllm/model_executor/models/nemotron_h.py +565 -0
  1064. vllm/model_executor/models/nemotron_nas.py +481 -0
  1065. vllm/model_executor/models/nemotron_vl.py +652 -0
  1066. vllm/model_executor/models/nvlm_d.py +203 -0
  1067. vllm/model_executor/models/olmo.py +404 -0
  1068. vllm/model_executor/models/olmo2.py +439 -0
  1069. vllm/model_executor/models/olmoe.py +483 -0
  1070. vllm/model_executor/models/opt.py +412 -0
  1071. vllm/model_executor/models/orion.py +348 -0
  1072. vllm/model_executor/models/ovis.py +559 -0
  1073. vllm/model_executor/models/ovis2_5.py +642 -0
  1074. vllm/model_executor/models/paligemma.py +411 -0
  1075. vllm/model_executor/models/persimmon.py +343 -0
  1076. vllm/model_executor/models/phi.py +356 -0
  1077. vllm/model_executor/models/phi3.py +19 -0
  1078. vllm/model_executor/models/phi3v.py +698 -0
  1079. vllm/model_executor/models/phi4_multimodal.py +1475 -0
  1080. vllm/model_executor/models/phi4mm.py +1279 -0
  1081. vllm/model_executor/models/phi4mm_audio.py +1254 -0
  1082. vllm/model_executor/models/phi4mm_utils.py +1875 -0
  1083. vllm/model_executor/models/phimoe.py +679 -0
  1084. vllm/model_executor/models/pixtral.py +1345 -0
  1085. vllm/model_executor/models/plamo2.py +978 -0
  1086. vllm/model_executor/models/qwen.py +361 -0
  1087. vllm/model_executor/models/qwen2.py +523 -0
  1088. vllm/model_executor/models/qwen2_5_omni_thinker.py +984 -0
  1089. vllm/model_executor/models/qwen2_5_vl.py +1481 -0
  1090. vllm/model_executor/models/qwen2_audio.py +489 -0
  1091. vllm/model_executor/models/qwen2_moe.py +558 -0
  1092. vllm/model_executor/models/qwen2_rm.py +122 -0
  1093. vllm/model_executor/models/qwen2_vl.py +1670 -0
  1094. vllm/model_executor/models/qwen3.py +341 -0
  1095. vllm/model_executor/models/qwen3_moe.py +692 -0
  1096. vllm/model_executor/models/qwen3_next.py +1266 -0
  1097. vllm/model_executor/models/qwen3_next_mtp.py +281 -0
  1098. vllm/model_executor/models/qwen3_vl.py +1613 -0
  1099. vllm/model_executor/models/qwen3_vl_moe.py +358 -0
  1100. vllm/model_executor/models/qwen_vl.py +795 -0
  1101. vllm/model_executor/models/radio.py +576 -0
  1102. vllm/model_executor/models/registry.py +990 -0
  1103. vllm/model_executor/models/roberta.py +252 -0
  1104. vllm/model_executor/models/rvl.py +103 -0
  1105. vllm/model_executor/models/seed_oss.py +485 -0
  1106. vllm/model_executor/models/siglip.py +540 -0
  1107. vllm/model_executor/models/siglip2navit.py +689 -0
  1108. vllm/model_executor/models/skyworkr1v.py +911 -0
  1109. vllm/model_executor/models/smolvlm.py +44 -0
  1110. vllm/model_executor/models/solar.py +504 -0
  1111. vllm/model_executor/models/stablelm.py +341 -0
  1112. vllm/model_executor/models/starcoder2.py +354 -0
  1113. vllm/model_executor/models/step3_text.py +510 -0
  1114. vllm/model_executor/models/step3_vl.py +1072 -0
  1115. vllm/model_executor/models/swin.py +475 -0
  1116. vllm/model_executor/models/tarsier.py +639 -0
  1117. vllm/model_executor/models/telechat2.py +151 -0
  1118. vllm/model_executor/models/teleflm.py +79 -0
  1119. vllm/model_executor/models/terratorch.py +294 -0
  1120. vllm/model_executor/models/transformers.py +948 -0
  1121. vllm/model_executor/models/ultravox.py +654 -0
  1122. vllm/model_executor/models/utils.py +808 -0
  1123. vllm/model_executor/models/vision.py +404 -0
  1124. vllm/model_executor/models/voxtral.py +786 -0
  1125. vllm/model_executor/models/whisper.py +963 -0
  1126. vllm/model_executor/models/zamba2.py +960 -0
  1127. vllm/model_executor/parameter.py +620 -0
  1128. vllm/model_executor/utils.py +86 -0
  1129. vllm/model_executor/warmup/__init__.py +0 -0
  1130. vllm/model_executor/warmup/deep_gemm_warmup.py +230 -0
  1131. vllm/model_executor/warmup/kernel_warmup.py +83 -0
  1132. vllm/multimodal/__init__.py +33 -0
  1133. vllm/multimodal/audio.py +116 -0
  1134. vllm/multimodal/base.py +27 -0
  1135. vllm/multimodal/cache.py +697 -0
  1136. vllm/multimodal/evs.py +273 -0
  1137. vllm/multimodal/hasher.py +102 -0
  1138. vllm/multimodal/image.py +130 -0
  1139. vllm/multimodal/inputs.py +987 -0
  1140. vllm/multimodal/parse.py +511 -0
  1141. vllm/multimodal/processing.py +2148 -0
  1142. vllm/multimodal/profiling.py +284 -0
  1143. vllm/multimodal/registry.py +345 -0
  1144. vllm/multimodal/utils.py +503 -0
  1145. vllm/multimodal/video.py +319 -0
  1146. vllm/outputs.py +324 -0
  1147. vllm/platforms/__init__.py +263 -0
  1148. vllm/platforms/cpu.py +340 -0
  1149. vllm/platforms/cuda.py +668 -0
  1150. vllm/platforms/interface.py +620 -0
  1151. vllm/platforms/rocm.py +497 -0
  1152. vllm/platforms/tpu.py +233 -0
  1153. vllm/platforms/xpu.py +243 -0
  1154. vllm/plugins/__init__.py +72 -0
  1155. vllm/plugins/io_processors/__init__.py +68 -0
  1156. vllm/plugins/io_processors/interface.py +67 -0
  1157. vllm/plugins/lora_resolvers/README.md +16 -0
  1158. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1159. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  1160. vllm/pooling_params.py +191 -0
  1161. vllm/profiler/__init__.py +0 -0
  1162. vllm/profiler/layerwise_profile.py +375 -0
  1163. vllm/profiler/utils.py +148 -0
  1164. vllm/py.typed +2 -0
  1165. vllm/ray/__init__.py +0 -0
  1166. vllm/ray/lazy_utils.py +22 -0
  1167. vllm/ray/ray_env.py +72 -0
  1168. vllm/reasoning/__init__.py +29 -0
  1169. vllm/reasoning/abs_reasoning_parsers.py +202 -0
  1170. vllm/reasoning/basic_parsers.py +156 -0
  1171. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1172. vllm/reasoning/glm4_moe_reasoning_parser.py +151 -0
  1173. vllm/reasoning/gptoss_reasoning_parser.py +87 -0
  1174. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1175. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +245 -0
  1176. vllm/reasoning/mistral_reasoning_parser.py +56 -0
  1177. vllm/reasoning/qwen3_reasoning_parser.py +72 -0
  1178. vllm/reasoning/seedoss_reasoning_parser.py +28 -0
  1179. vllm/reasoning/step3_reasoning_parser.py +109 -0
  1180. vllm/sampling_params.py +593 -0
  1181. vllm/scalar_type.py +349 -0
  1182. vllm/scripts.py +15 -0
  1183. vllm/sequence.py +103 -0
  1184. vllm/tasks.py +11 -0
  1185. vllm/test_utils.py +129 -0
  1186. vllm/third_party/__init__.py +0 -0
  1187. vllm/third_party/pynvml.py +6140 -0
  1188. vllm/tracing.py +136 -0
  1189. vllm/transformers_utils/__init__.py +24 -0
  1190. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1191. vllm/transformers_utils/chat_templates/registry.py +70 -0
  1192. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1193. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1194. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1195. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1196. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1197. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1198. vllm/transformers_utils/config.py +1102 -0
  1199. vllm/transformers_utils/config_parser_base.py +20 -0
  1200. vllm/transformers_utils/configs/__init__.py +63 -0
  1201. vllm/transformers_utils/configs/arctic.py +207 -0
  1202. vllm/transformers_utils/configs/chatglm.py +72 -0
  1203. vllm/transformers_utils/configs/deepseek_v3.py +101 -0
  1204. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1205. vllm/transformers_utils/configs/dotsocr.py +69 -0
  1206. vllm/transformers_utils/configs/eagle.py +84 -0
  1207. vllm/transformers_utils/configs/falcon.py +90 -0
  1208. vllm/transformers_utils/configs/jais.py +237 -0
  1209. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1210. vllm/transformers_utils/configs/medusa.py +63 -0
  1211. vllm/transformers_utils/configs/midashenglm.py +101 -0
  1212. vllm/transformers_utils/configs/mistral.py +165 -0
  1213. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1214. vllm/transformers_utils/configs/moonvit.py +33 -0
  1215. vllm/transformers_utils/configs/nemotron.py +205 -0
  1216. vllm/transformers_utils/configs/nemotron_h.py +259 -0
  1217. vllm/transformers_utils/configs/nemotron_vl.py +56 -0
  1218. vllm/transformers_utils/configs/olmo3.py +80 -0
  1219. vllm/transformers_utils/configs/ovis.py +176 -0
  1220. vllm/transformers_utils/configs/qwen3_next.py +275 -0
  1221. vllm/transformers_utils/configs/radio.py +91 -0
  1222. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1223. vllm/transformers_utils/configs/speculators/algos.py +32 -0
  1224. vllm/transformers_utils/configs/speculators/base.py +111 -0
  1225. vllm/transformers_utils/configs/step3_vl.py +123 -0
  1226. vllm/transformers_utils/configs/ultravox.py +116 -0
  1227. vllm/transformers_utils/detokenizer_utils.py +199 -0
  1228. vllm/transformers_utils/dynamic_module.py +60 -0
  1229. vllm/transformers_utils/processor.py +299 -0
  1230. vllm/transformers_utils/processors/__init__.py +16 -0
  1231. vllm/transformers_utils/processors/deepseek_vl2.py +362 -0
  1232. vllm/transformers_utils/processors/ovis.py +420 -0
  1233. vllm/transformers_utils/processors/ovis2_5.py +458 -0
  1234. vllm/transformers_utils/runai_utils.py +104 -0
  1235. vllm/transformers_utils/s3_utils.py +93 -0
  1236. vllm/transformers_utils/tokenizer.py +292 -0
  1237. vllm/transformers_utils/tokenizer_base.py +154 -0
  1238. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1239. vllm/transformers_utils/tokenizers/mistral.py +521 -0
  1240. vllm/transformers_utils/utils.py +108 -0
  1241. vllm/triton_utils/__init__.py +16 -0
  1242. vllm/triton_utils/importing.py +96 -0
  1243. vllm/usage/__init__.py +0 -0
  1244. vllm/usage/usage_lib.py +259 -0
  1245. vllm/utils/__init__.py +3566 -0
  1246. vllm/utils/deep_gemm.py +319 -0
  1247. vllm/utils/flashinfer.py +443 -0
  1248. vllm/utils/jsontree.py +178 -0
  1249. vllm/utils/tensor_schema.py +235 -0
  1250. vllm/v1/__init__.py +0 -0
  1251. vllm/v1/attention/__init__.py +0 -0
  1252. vllm/v1/attention/backends/__init__.py +0 -0
  1253. vllm/v1/attention/backends/cpu_attn.py +919 -0
  1254. vllm/v1/attention/backends/flash_attn.py +795 -0
  1255. vllm/v1/attention/backends/flashinfer.py +1181 -0
  1256. vllm/v1/attention/backends/flex_attention.py +861 -0
  1257. vllm/v1/attention/backends/gdn_attn.py +332 -0
  1258. vllm/v1/attention/backends/linear_attn.py +67 -0
  1259. vllm/v1/attention/backends/mamba1_attn.py +81 -0
  1260. vllm/v1/attention/backends/mamba2_attn.py +232 -0
  1261. vllm/v1/attention/backends/mamba_attn.py +52 -0
  1262. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1263. vllm/v1/attention/backends/mla/common.py +1783 -0
  1264. vllm/v1/attention/backends/mla/cutlass_mla.py +248 -0
  1265. vllm/v1/attention/backends/mla/flashattn_mla.py +271 -0
  1266. vllm/v1/attention/backends/mla/flashinfer_mla.py +114 -0
  1267. vllm/v1/attention/backends/mla/flashmla.py +203 -0
  1268. vllm/v1/attention/backends/mla/flashmla_sparse.py +544 -0
  1269. vllm/v1/attention/backends/mla/indexer.py +342 -0
  1270. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +255 -0
  1271. vllm/v1/attention/backends/mla/triton_mla.py +177 -0
  1272. vllm/v1/attention/backends/pallas.py +409 -0
  1273. vllm/v1/attention/backends/rocm_aiter_fa.py +549 -0
  1274. vllm/v1/attention/backends/rocm_attn.py +426 -0
  1275. vllm/v1/attention/backends/short_conv_attn.py +94 -0
  1276. vllm/v1/attention/backends/tree_attn.py +451 -0
  1277. vllm/v1/attention/backends/triton_attn.py +361 -0
  1278. vllm/v1/attention/backends/utils.py +990 -0
  1279. vllm/v1/attention/backends/xformers.py +438 -0
  1280. vllm/v1/core/__init__.py +0 -0
  1281. vllm/v1/core/block_pool.py +416 -0
  1282. vllm/v1/core/encoder_cache_manager.py +333 -0
  1283. vllm/v1/core/kv_cache_coordinator.py +440 -0
  1284. vllm/v1/core/kv_cache_manager.py +399 -0
  1285. vllm/v1/core/kv_cache_utils.py +1291 -0
  1286. vllm/v1/core/sched/__init__.py +0 -0
  1287. vllm/v1/core/sched/async_scheduler.py +47 -0
  1288. vllm/v1/core/sched/interface.py +158 -0
  1289. vllm/v1/core/sched/output.py +166 -0
  1290. vllm/v1/core/sched/request_queue.py +224 -0
  1291. vllm/v1/core/sched/scheduler.py +1296 -0
  1292. vllm/v1/core/sched/utils.py +69 -0
  1293. vllm/v1/core/single_type_kv_cache_manager.py +671 -0
  1294. vllm/v1/cudagraph_dispatcher.py +125 -0
  1295. vllm/v1/engine/__init__.py +203 -0
  1296. vllm/v1/engine/async_llm.py +742 -0
  1297. vllm/v1/engine/coordinator.py +357 -0
  1298. vllm/v1/engine/core.py +1235 -0
  1299. vllm/v1/engine/core_client.py +1334 -0
  1300. vllm/v1/engine/detokenizer.py +349 -0
  1301. vllm/v1/engine/exceptions.py +17 -0
  1302. vllm/v1/engine/llm_engine.py +370 -0
  1303. vllm/v1/engine/logprobs.py +201 -0
  1304. vllm/v1/engine/output_processor.py +576 -0
  1305. vllm/v1/engine/parallel_sampling.py +133 -0
  1306. vllm/v1/engine/processor.py +545 -0
  1307. vllm/v1/engine/utils.py +860 -0
  1308. vllm/v1/executor/__init__.py +0 -0
  1309. vllm/v1/executor/abstract.py +137 -0
  1310. vllm/v1/executor/multiproc_executor.py +726 -0
  1311. vllm/v1/executor/ray_distributed_executor.py +108 -0
  1312. vllm/v1/executor/utils.py +23 -0
  1313. vllm/v1/kv_cache_interface.py +375 -0
  1314. vllm/v1/kv_offload/__init__.py +0 -0
  1315. vllm/v1/kv_offload/abstract.py +165 -0
  1316. vllm/v1/kv_offload/backend.py +96 -0
  1317. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1318. vllm/v1/kv_offload/backends/cpu.py +61 -0
  1319. vllm/v1/kv_offload/cpu.py +75 -0
  1320. vllm/v1/kv_offload/factory.py +56 -0
  1321. vllm/v1/kv_offload/lru_manager.py +132 -0
  1322. vllm/v1/kv_offload/mediums.py +39 -0
  1323. vllm/v1/kv_offload/spec.py +61 -0
  1324. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1325. vllm/v1/kv_offload/worker/cpu_gpu.py +171 -0
  1326. vllm/v1/kv_offload/worker/worker.py +142 -0
  1327. vllm/v1/metrics/__init__.py +0 -0
  1328. vllm/v1/metrics/loggers.py +741 -0
  1329. vllm/v1/metrics/prometheus.py +82 -0
  1330. vllm/v1/metrics/ray_wrappers.py +152 -0
  1331. vllm/v1/metrics/reader.py +246 -0
  1332. vllm/v1/metrics/stats.py +257 -0
  1333. vllm/v1/outputs.py +161 -0
  1334. vllm/v1/pool/__init__.py +0 -0
  1335. vllm/v1/pool/metadata.py +77 -0
  1336. vllm/v1/request.py +241 -0
  1337. vllm/v1/sample/__init__.py +0 -0
  1338. vllm/v1/sample/logits_processor/__init__.py +294 -0
  1339. vllm/v1/sample/logits_processor/builtin.py +275 -0
  1340. vllm/v1/sample/logits_processor/interface.py +97 -0
  1341. vllm/v1/sample/logits_processor/state.py +161 -0
  1342. vllm/v1/sample/metadata.py +43 -0
  1343. vllm/v1/sample/ops/__init__.py +0 -0
  1344. vllm/v1/sample/ops/bad_words.py +39 -0
  1345. vllm/v1/sample/ops/logprobs.py +26 -0
  1346. vllm/v1/sample/ops/penalties.py +43 -0
  1347. vllm/v1/sample/ops/topk_topp_sampler.py +292 -0
  1348. vllm/v1/sample/rejection_sampler.py +623 -0
  1349. vllm/v1/sample/sampler.py +285 -0
  1350. vllm/v1/sample/tpu/__init__.py +0 -0
  1351. vllm/v1/sample/tpu/metadata.py +124 -0
  1352. vllm/v1/sample/tpu/sampler.py +213 -0
  1353. vllm/v1/serial_utils.py +423 -0
  1354. vllm/v1/spec_decode/__init__.py +0 -0
  1355. vllm/v1/spec_decode/eagle.py +1011 -0
  1356. vllm/v1/spec_decode/medusa.py +66 -0
  1357. vllm/v1/spec_decode/metadata.py +62 -0
  1358. vllm/v1/spec_decode/metrics.py +211 -0
  1359. vllm/v1/spec_decode/ngram_proposer.py +276 -0
  1360. vllm/v1/spec_decode/utils.py +14 -0
  1361. vllm/v1/structured_output/__init__.py +295 -0
  1362. vllm/v1/structured_output/backend_guidance.py +245 -0
  1363. vllm/v1/structured_output/backend_lm_format_enforcer.py +167 -0
  1364. vllm/v1/structured_output/backend_outlines.py +320 -0
  1365. vllm/v1/structured_output/backend_types.py +134 -0
  1366. vllm/v1/structured_output/backend_xgrammar.py +327 -0
  1367. vllm/v1/structured_output/request.py +86 -0
  1368. vllm/v1/structured_output/utils.py +454 -0
  1369. vllm/v1/utils.py +396 -0
  1370. vllm/v1/worker/__init__.py +0 -0
  1371. vllm/v1/worker/block_table.py +210 -0
  1372. vllm/v1/worker/cpu_model_runner.py +175 -0
  1373. vllm/v1/worker/cpu_worker.py +156 -0
  1374. vllm/v1/worker/gpu_input_batch.py +863 -0
  1375. vllm/v1/worker/gpu_model_runner.py +4160 -0
  1376. vllm/v1/worker/gpu_ubatch_wrapper.py +399 -0
  1377. vllm/v1/worker/gpu_worker.py +710 -0
  1378. vllm/v1/worker/kv_connector_model_runner_mixin.py +132 -0
  1379. vllm/v1/worker/lora_model_runner_mixin.py +183 -0
  1380. vllm/v1/worker/tpu_input_batch.py +587 -0
  1381. vllm/v1/worker/tpu_model_runner.py +1946 -0
  1382. vllm/v1/worker/tpu_worker.py +346 -0
  1383. vllm/v1/worker/ubatch_splitting.py +192 -0
  1384. vllm/v1/worker/ubatch_utils.py +27 -0
  1385. vllm/v1/worker/ubatching.py +224 -0
  1386. vllm/v1/worker/utils.py +344 -0
  1387. vllm/v1/worker/worker_base.py +65 -0
  1388. vllm/v1/worker/xpu_model_runner.py +57 -0
  1389. vllm/v1/worker/xpu_worker.py +179 -0
  1390. vllm/version.py +41 -0
  1391. vllm/vllm_flash_attn/.gitkeep +0 -0
  1392. vllm/worker/__init__.py +0 -0
  1393. vllm/worker/worker_base.py +279 -0
  1394. vllm_cpu-0.11.0.post2.dist-info/METADATA +348 -0
  1395. vllm_cpu-0.11.0.post2.dist-info/RECORD +1398 -0
  1396. vllm_cpu-0.11.0.post2.dist-info/WHEEL +5 -0
  1397. vllm_cpu-0.11.0.post2.dist-info/entry_points.txt +5 -0
  1398. vllm_cpu-0.11.0.post2.dist-info/top_level.txt +1 -0
vllm/entrypoints/openai/protocol.py
@@ -0,0 +1,2757 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+ # Adapted from
+ # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
+ import json
+ import time
+ from http import HTTPStatus
+ from typing import (Annotated, Any, ClassVar, Generic, Literal, Optional,
+                     TypeVar, Union)
+
+ import regex as re
+ import torch
+ from fastapi import HTTPException, UploadFile
+ # yapf: disable
+ from openai.types.chat.chat_completion_audio import (
+     ChatCompletionAudio as OpenAIChatCompletionAudio)
+ from openai.types.chat.chat_completion_message import (
+     Annotation as OpenAIAnnotation)
+ # yapf: enable
+ from openai.types.responses import (
+     ResponseCodeInterpreterCallCodeDeltaEvent,
+     ResponseCodeInterpreterCallCodeDoneEvent,
+     ResponseCodeInterpreterCallCompletedEvent,
+     ResponseCodeInterpreterCallInProgressEvent,
+     ResponseCodeInterpreterCallInterpretingEvent, ResponseCompletedEvent,
+     ResponseContentPartAddedEvent, ResponseContentPartDoneEvent,
+     ResponseCreatedEvent, ResponseFunctionToolCall, ResponseInProgressEvent,
+     ResponseInputItemParam, ResponseOutputItem, ResponseOutputItemAddedEvent,
+     ResponseOutputItemDoneEvent, ResponsePrompt, ResponseReasoningItem,
+     ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent,
+     ResponseStatus, ResponseWebSearchCallCompletedEvent,
+     ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent)
+ from openai.types.responses.response_reasoning_item import (
+     Content as ResponseReasoningTextContent)
+
+ # Backward compatibility for OpenAI client versions
+ try:  # For older openai versions (< 1.100.0)
+     from openai.types.responses import ResponseTextConfig
+ except ImportError:  # For newer openai versions (>= 1.100.0)
+     from openai.types.responses import (ResponseFormatTextConfig as
+                                         ResponseTextConfig)
+
+ from openai.types.responses.response import IncompleteDetails, ToolChoice
+ from openai.types.responses.tool import Tool
+ from openai.types.shared import Metadata, Reasoning
+ from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
+                       ValidationInfo, field_validator, model_validator)
+ from typing_extensions import TypeAlias
+
+ from vllm import envs
+ from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
+                                          make_tool_call_id)
+ from vllm.entrypoints.score_utils import (ScoreContentPartParam,
+                                           ScoreMultiModalParam)
+ from vllm.logger import init_logger
+ from vllm.logprobs import Logprob
+ from vllm.pooling_params import PoolingParams
+ from vllm.sampling_params import (BeamSearchParams, RequestOutputKind,
+                                   SamplingParams, StructuredOutputsParams)
+ from vllm.utils import random_uuid, resolve_obj_by_qualname
+
+ logger = init_logger(__name__)
+
+ _LONG_INFO = torch.iinfo(torch.long)
+
+
+ class OpenAIBaseModel(BaseModel):
+     # OpenAI API does allow extra fields
+     model_config = ConfigDict(extra="allow")
+
+     # Cache class field names
+     field_names: ClassVar[Optional[set[str]]] = None
+
+     @model_validator(mode="wrap")
+     @classmethod
+     def __log_extra_fields__(cls, data, handler):
+         result = handler(data)
+         if not isinstance(data, dict):
+             return result
+         field_names = cls.field_names
+         if field_names is None:
+             # Get all class field names and their potential aliases
+             field_names = set()
+             for field_name, field in cls.model_fields.items():
+                 field_names.add(field_name)
+                 if alias := getattr(field, "alias", None):
+                     field_names.add(alias)
+             cls.field_names = field_names
+
+         # Compare against both field names and aliases
+         if any(k not in field_names for k in data):
+             logger.warning(
+                 "The following fields were present in the request "
+                 "but ignored: %s",
+                 data.keys() - field_names,
+             )
+         return result
+
+
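
The base model above accepts unknown request fields instead of rejecting them, and logs which ones were ignored. A minimal, self-contained sketch of the same pattern, using plain pydantic and `print` in place of vLLM's logger (`PermissiveModel`, `Ping`, and `unknown_flag` are made-up names for illustration):

```python
from typing import Any

from pydantic import BaseModel, ConfigDict, model_validator


class PermissiveModel(BaseModel):
    # Accept unknown fields instead of rejecting the request.
    model_config = ConfigDict(extra="allow")

    @model_validator(mode="wrap")
    @classmethod
    def _log_extra_fields(cls, data: Any, handler):
        result = handler(data)
        if isinstance(data, dict):
            # Known names include declared fields and their aliases.
            known = set(cls.model_fields) | {
                f.alias for f in cls.model_fields.values() if f.alias
            }
            extra = data.keys() - known
            if extra:
                print(f"ignored fields: {extra}")  # vLLM uses logger.warning
        return result


class Ping(PermissiveModel):
    msg: str


p = Ping(msg="hi", unknown_flag=True)  # accepted; prints the ignored field
```
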
+ class ErrorInfo(OpenAIBaseModel):
+     message: str
+     type: str
+     param: Optional[str] = None
+     code: int
+
+
+ class ErrorResponse(OpenAIBaseModel):
+     error: ErrorInfo
+
+
+ class ModelPermission(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
+     object: str = "model_permission"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     allow_create_engine: bool = False
+     allow_sampling: bool = True
+     allow_logprobs: bool = True
+     allow_search_indices: bool = False
+     allow_view: bool = True
+     allow_fine_tuning: bool = False
+     organization: str = "*"
+     group: Optional[str] = None
+     is_blocking: bool = False
+
+
+ class ModelCard(OpenAIBaseModel):
+     id: str
+     object: str = "model"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     owned_by: str = "vllm"
+     root: Optional[str] = None
+     parent: Optional[str] = None
+     max_model_len: Optional[int] = None
+     permission: list[ModelPermission] = Field(default_factory=list)
+
+
+ class ModelList(OpenAIBaseModel):
+     object: str = "list"
+     data: list[ModelCard] = Field(default_factory=list)
+
+
+ class PromptTokenUsageInfo(OpenAIBaseModel):
+     cached_tokens: Optional[int] = None
+
+
+ class UsageInfo(OpenAIBaseModel):
+     prompt_tokens: int = 0
+     total_tokens: int = 0
+     completion_tokens: Optional[int] = 0
+     prompt_tokens_details: Optional[PromptTokenUsageInfo] = None
+
+
+ class RequestResponseMetadata(BaseModel):
+     request_id: str
+     final_usage_info: Optional[UsageInfo] = None
+
+
+ class JsonSchemaResponseFormat(OpenAIBaseModel):
+     name: str
+     description: Optional[str] = None
+     # "schema" is the field name in the OpenAI API, but it conflicts with
+     # pydantic, so use json_schema with an alias instead
+     json_schema: Optional[dict[str, Any]] = Field(default=None, alias='schema')
+     strict: Optional[bool] = None
+
+
+ class StructuralTag(OpenAIBaseModel):
+     begin: str
+     # "schema" is the field name, but it conflicts with pydantic, so use
+     # structural_tag_schema with an alias instead
+     structural_tag_schema: Optional[dict[str, Any]] = Field(default=None,
+                                                             alias="schema")
+     end: str
+
+
+ class StructuralTagResponseFormat(OpenAIBaseModel):
+     type: Literal["structural_tag"]
+     structures: list[StructuralTag]
+     triggers: list[str]
+
+
+ class ResponseFormat(OpenAIBaseModel):
+     # type must be "json_schema", "json_object", or "text"
+     type: Literal["text", "json_object", "json_schema"]
+     json_schema: Optional[JsonSchemaResponseFormat] = None
+
+
+ AnyResponseFormat = Union[ResponseFormat, StructuralTagResponseFormat]
+
+
+ class StreamOptions(OpenAIBaseModel):
+     include_usage: Optional[bool] = True
+     continuous_usage_stats: Optional[bool] = False
+
+
+ class FunctionDefinition(OpenAIBaseModel):
+     name: str
+     description: Optional[str] = None
+     parameters: Optional[dict[str, Any]] = None
+
+
+ class ChatCompletionToolsParam(OpenAIBaseModel):
+     type: Literal["function"] = "function"
+     function: FunctionDefinition
+
+
+ class ChatCompletionNamedFunction(OpenAIBaseModel):
+     name: str
+
+
+ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
+     function: ChatCompletionNamedFunction
+     type: Literal["function"] = "function"
+
+
+ # extra="forbid" is a workaround to have kwargs as a field,
+ # see https://github.com/pydantic/pydantic/issues/3125
+ class LogitsProcessorConstructor(BaseModel):
+     qualname: str
+     args: Optional[list[Any]] = None
+     kwargs: Optional[dict[str, Any]] = None
+
+     model_config = ConfigDict(extra="forbid")
+
+
+ LogitsProcessors = list[Union[str, LogitsProcessorConstructor]]
+
+
+ def get_logits_processors(processors: Optional[LogitsProcessors],
+                           pattern: Optional[str]) -> Optional[list[Any]]:
+     if processors and pattern:
+         logits_processors = []
+         for processor in processors:
+             qualname = processor if isinstance(processor,
+                                                str) else processor.qualname
+             if not re.match(pattern, qualname):
+                 raise ValueError(
+                     f"Logits processor '{qualname}' is not allowed by this "
+                     "server. See --logits-processor-pattern engine argument "
+                     "for more information.")
+             try:
+                 logits_processor = resolve_obj_by_qualname(qualname)
+             except Exception as e:
+                 raise ValueError(
+                     f"Logits processor '{qualname}' could not be resolved: {e}"
+                 ) from e
+             if isinstance(processor, LogitsProcessorConstructor):
+                 logits_processor = logits_processor(*processor.args or [],
+                                                     **processor.kwargs or {})
+             logits_processors.append(logits_processor)
+         return logits_processors
+     elif processors:
+         raise ValueError(
+             "The `logits_processors` argument is not supported by this "
+             "server. See --logits-processor-pattern engine argument "
+             "for more information.")
+     return None
+
+
+ ResponseInputOutputItem: TypeAlias = Union[ResponseInputItemParam,
+                                            ResponseReasoningItem,
+                                            ResponseFunctionToolCall]
+
+
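
To illustrate the request shape `get_logits_processors` accepts, here is a hedged sketch of a request fragment; the qualified name `my_module.MyLogitsProcessor` and the pattern are made up:

```python
# One processor given by qualified name, one given as a constructor object.
# The server only resolves these when started with a matching
# --logits-processor-pattern.
payload = {
    "logits_processors": [
        "my_module.MyLogitsProcessor",  # used as resolved
        {
            "qualname": "my_module.MyLogitsProcessor",
            "args": [1, 2],
            "kwargs": {"param": "value"},
        },
    ]
}
# With pattern r"my_module\." the first entry is resolved as-is and the
# second is instantiated as MyLogitsProcessor(1, 2, param="value").
```
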
+ class ResponsesRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/responses/create
+     background: Optional[bool] = False
+     include: Optional[list[
+         Literal[
+             "code_interpreter_call.outputs",
+             "computer_call_output.output.image_url",
+             "file_search_call.results",
+             "message.input_image.image_url",
+             "message.output_text.logprobs",
+             "reasoning.encrypted_content",
+         ],
+     ]] = None
+     input: Union[str, list[ResponseInputOutputItem]]
+     instructions: Optional[str] = None
+     max_output_tokens: Optional[int] = None
+     max_tool_calls: Optional[int] = None
+     metadata: Optional[Metadata] = None
+     model: Optional[str] = None
+     parallel_tool_calls: Optional[bool] = True
+     previous_response_id: Optional[str] = None
+     prompt: Optional[ResponsePrompt] = None
+     reasoning: Optional[Reasoning] = None
+     service_tier: Literal["auto", "default", "flex", "scale",
+                           "priority"] = "auto"
+     store: Optional[bool] = True
+     stream: Optional[bool] = False
+     temperature: Optional[float] = None
+     text: Optional[ResponseTextConfig] = None
+     tool_choice: ToolChoice = "auto"
+     tools: list[Tool] = Field(default_factory=list)
+     top_logprobs: Optional[int] = 0
+     top_p: Optional[float] = None
+     truncation: Optional[Literal["auto", "disabled"]] = "disabled"
+     user: Optional[str] = None
+
+     # --8<-- [start:responses-extra-params]
+     request_id: str = Field(
+         default_factory=lambda: f"resp_{random_uuid()}",
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the "
+             "response."),
+     )
+     mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+     cache_salt: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the prefix cache will be salted with the provided "
+             "string to prevent an attacker from guessing prompts in "
+             "multi-user environments. The salt should be random, protected "
+             "from access by 3rd parties, and long enough to be "
+             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+             "to 256 bit). Not supported by vLLM engine V0."))
+
+     enable_response_messages: bool = Field(
+         default=False,
+         description=(
+             "Whether to return messages as part of the response object. "
+             "Currently only supported for non-streaming, non-background "
+             "requests, and only for gpt-oss."))
+     # --8<-- [end:responses-extra-params]
+
+     _DEFAULT_SAMPLING_PARAMS = {
+         "temperature": 1.0,
+         "top_p": 1.0,
+     }
+
+     def to_sampling_params(
+         self,
+         default_max_tokens: int,
+         default_sampling_params: Optional[dict] = None,
+     ) -> SamplingParams:
+         if self.max_output_tokens is None:
+             max_tokens = default_max_tokens
+         else:
+             max_tokens = min(self.max_output_tokens, default_max_tokens)
+
+         default_sampling_params = default_sampling_params or {}
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+         if (top_p := self.top_p) is None:
+             top_p = default_sampling_params.get(
+                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+         stop_token_ids = default_sampling_params.get("stop_token_ids")
+
+         # Structured output
+         structured_outputs = None
+         if self.text is not None and self.text.format is not None:
+             response_format = self.text.format
+             if (response_format.type == "json_schema"
+                     and response_format.schema_ is not None):
+                 structured_outputs = StructuredOutputsParams(
+                     json=response_format.schema_)
+             elif response_format.type == "json_object":
+                 raise NotImplementedError("json_object is not supported")
+
+         # TODO: add more parameters
+         return SamplingParams.from_optional(
+             temperature=temperature,
+             top_p=top_p,
+             max_tokens=max_tokens,
+             logprobs=self.top_logprobs
+             if self.is_include_output_logprobs() else None,
+             stop_token_ids=stop_token_ids,
+             output_kind=(RequestOutputKind.DELTA
+                          if self.stream else RequestOutputKind.FINAL_ONLY),
+             structured_outputs=structured_outputs,
+         )
+
+     def is_include_output_logprobs(self) -> bool:
+         """Check if the request includes output logprobs."""
+         if self.include is None:
+             return False
+         return isinstance(
+             self.include,
+             list) and "message.output_text.logprobs" in self.include
+
+     @model_validator(mode="before")
+     def validate_background(cls, data):
+         if not data.get("background"):
+             return data
+         if not data.get("store", True):
+             raise ValueError(
+                 "background can only be used when `store` is true")
+         return data
+
+     @model_validator(mode="before")
+     def validate_prompt(cls, data):
+         if data.get("prompt") is not None:
+             raise ValueError("prompt template is not supported")
+         return data
+
+     @model_validator(mode="before")
+     def check_cache_salt_support(cls, data):
+         if data.get("cache_salt") is not None:
+             if not envs.VLLM_USE_V1:
+                 raise ValueError(
+                     "Parameter 'cache_salt' is not supported with "
+                     "this instance of vLLM, which uses engine V0.")
+             if not isinstance(data["cache_salt"],
+                               str) or not data["cache_salt"]:
+                 raise ValueError("Parameter 'cache_salt' must be a "
+                                  "non-empty string if provided.")
+         return data
+
+
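
A short sketch of the default cascade in `ResponsesRequest.to_sampling_params`: a value set on the request wins, then the server-level `default_sampling_params`, then the class defaults. The concrete values below are made up:

```python
# Assuming the classes above are importable from this module.
req = ResponsesRequest(input="hello", temperature=None, top_p=0.9)
params = req.to_sampling_params(
    default_max_tokens=128,
    default_sampling_params={"temperature": 0.7},
)
# temperature: request None -> server default 0.7 (else class default 1.0)
# top_p: the request's 0.9 wins over any defaults
# max_tokens: max_output_tokens is None -> default_max_tokens (128)
```
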
+ class ChatCompletionRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/chat/create
+     messages: list[ChatCompletionMessageParam]
+     model: Optional[str] = None
+     frequency_penalty: Optional[float] = 0.0
+     logit_bias: Optional[dict[str, float]] = None
+     logprobs: Optional[bool] = False
+     top_logprobs: Optional[int] = 0
+     max_tokens: Optional[int] = Field(
+         default=None,
+         deprecated=
+         'max_tokens is deprecated in favor of the max_completion_tokens field')
+     max_completion_tokens: Optional[int] = None
+     n: Optional[int] = 1
+     presence_penalty: Optional[float] = 0.0
+     response_format: Optional[AnyResponseFormat] = None
+     seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+     stop: Optional[Union[str, list[str]]] = []
+     stream: Optional[bool] = False
+     stream_options: Optional[StreamOptions] = None
+     temperature: Optional[float] = None
+     top_p: Optional[float] = None
+     tools: Optional[list[ChatCompletionToolsParam]] = None
+     tool_choice: Optional[Union[
+         Literal["none"],
+         Literal["auto"],
+         Literal["required"],
+         ChatCompletionNamedToolChoiceParam,
+     ]] = "none"
+     reasoning_effort: Optional[Literal["low", "medium", "high"]] = None
+     include_reasoning: bool = True
+
+     # NOTE this will be ignored by vLLM -- the model determines the behavior
+     parallel_tool_calls: Optional[bool] = False
+     user: Optional[str] = None
+
+     # --8<-- [start:chat-completion-sampling-params]
+     best_of: Optional[int] = None
+     use_beam_search: bool = False
+     top_k: Optional[int] = None
+     min_p: Optional[float] = None
+     repetition_penalty: Optional[float] = None
+     length_penalty: float = 1.0
+     stop_token_ids: Optional[list[int]] = []
+     include_stop_str_in_output: bool = False
+     ignore_eos: bool = False
+     min_tokens: int = 0
+     skip_special_tokens: bool = True
+     spaces_between_special_tokens: bool = True
+     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+     prompt_logprobs: Optional[int] = None
+     allowed_token_ids: Optional[list[int]] = None
+     bad_words: list[str] = Field(default_factory=list)
+     # --8<-- [end:chat-completion-sampling-params]
+
+     # --8<-- [start:chat-completion-extra-params]
+     echo: bool = Field(
+         default=False,
+         description=(
+             "If true, the new message will be prepended with the last message "
+             "if they belong to the same role."),
+     )
+     add_generation_prompt: bool = Field(
+         default=True,
+         description=
+         ("If true, the generation prompt will be added to the chat template. "
+          "This is a parameter used by the chat template in the tokenizer "
+          "config of the model."),
+     )
+     continue_final_message: bool = Field(
+         default=False,
+         description=
+         ("If this is set, the chat will be formatted so that the final "
+          "message in the chat is open-ended, without any EOS tokens. The "
+          "model will continue this message rather than starting a new one. "
+          "This allows you to \"prefill\" part of the model's response for it. "
+          "Cannot be used at the same time as `add_generation_prompt`."),
+     )
+     add_special_tokens: bool = Field(
+         default=False,
+         description=(
+             "If true, special tokens (e.g. BOS) will be added to the prompt "
+             "on top of what is added by the chat template. "
+             "For most models, the chat template takes care of adding the "
+             "special tokens so this should be set to false (as is the "
+             "default)."),
+     )
+     documents: Optional[list[dict[str, str]]] = Field(
+         default=None,
+         description=
+         ("A list of dicts representing documents that will be accessible to "
+          "the model if it is performing RAG (retrieval-augmented generation)."
+          " If the template does not support RAG, this argument will have no "
+          "effect. We recommend that each document be a dict containing "
+          "\"title\" and \"text\" keys."),
+     )
+     chat_template: Optional[str] = Field(
+         default=None,
+         description=(
+             "A Jinja template to use for this conversion. "
+             "As of transformers v4.44, default chat template is no longer "
+             "allowed, so you must provide a chat template if the tokenizer "
+             "does not define one."),
+     )
+     chat_template_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=(
+             "Additional keyword args to pass to the template renderer. "
+             "Will be accessible by the chat template."),
+     )
+     mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     structured_outputs: Optional[StructuredOutputsParams] = Field(
+         default=None,
+         description="Additional kwargs for structured outputs",
+     )
+     guided_json: Optional[Union[str, dict, BaseModel]] = Field(
+         default=None,
+         description=(
+             "`guided_json` is deprecated. "
+             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
+             "Please pass `json` to `structured_outputs` instead."),
+     )
+     guided_regex: Optional[str] = Field(
+         default=None,
+         description=(
+             "`guided_regex` is deprecated. "
+             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
+             "Please pass `regex` to `structured_outputs` instead."),
+     )
+     guided_choice: Optional[list[str]] = Field(
+         default=None,
+         description=(
+             "`guided_choice` is deprecated. "
+             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
+             "Please pass `choice` to `structured_outputs` instead."),
+     )
+     guided_grammar: Optional[str] = Field(
+         default=None,
+         description=(
+             "`guided_grammar` is deprecated. "
+             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
+             "Please pass `grammar` to `structured_outputs` instead."),
+     )
+     structural_tag: Optional[str] = Field(
+         default=None,
+         description=(
+             "`structural_tag` is deprecated. "
+             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
+             "Please pass `structural_tag` to `structured_outputs` instead."),
+     )
+     guided_decoding_backend: Optional[str] = Field(
+         default=None,
+         description=(
+             "`guided_decoding_backend` is deprecated. "
+             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
+             "Please remove it from your request."),
+     )
+     guided_whitespace_pattern: Optional[str] = Field(
+         default=None,
+         description=(
+             "`guided_whitespace_pattern` is deprecated. "
+             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
+             "Please pass `whitespace_pattern` to `structured_outputs` instead."
+         ),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+     request_id: str = Field(
+         default_factory=lambda: f"{random_uuid()}",
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the "
+             "response."),
+     )
+     logits_processors: Optional[LogitsProcessors] = Field(
+         default=None,
+         description=(
+             "A list of either qualified names of logits processors, or "
+             "constructor objects, to apply when sampling. A constructor is "
+             "a JSON object with a required 'qualname' field specifying the "
+             "qualified name of the processor class/factory, and optional "
+             "'args' and 'kwargs' fields containing positional and keyword "
+             "arguments. For example: {'qualname': "
+             "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
+             "{'param': 'value'}}."))
+     return_tokens_as_token_ids: Optional[bool] = Field(
+         default=None,
+         description=(
+             "If specified with 'logprobs', tokens are represented "
+             "as strings of the form 'token_id:{token_id}' so that tokens "
+             "that are not JSON-encodable can be identified."))
+     return_token_ids: Optional[bool] = Field(
+         default=None,
+         description=(
+             "If specified, the result will include token IDs alongside the "
+             "generated text. In streaming mode, prompt_token_ids is included "
+             "only in the first chunk, and token_ids contains the delta tokens "
+             "for each chunk. This is useful for debugging or when you "
+             "need to map generated text back to input tokens."))
+     cache_salt: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the prefix cache will be salted with the provided "
+             "string to prevent an attacker from guessing prompts in "
+             "multi-user environments. The salt should be random, protected "
+             "from access by 3rd parties, and long enough to be "
+             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+             "to 256 bit). Not supported by vLLM engine V0."))
+     kv_transfer_params: Optional[dict[str, Any]] = Field(
+         default=None,
+         description="KVTransfer parameters used for disaggregated serving.")
+
+     vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
+         default=None,
+         description=("Additional request parameters with string or "
+                      "numeric values, used by custom extensions."),
+     )
+
+     # --8<-- [end:chat-completion-extra-params]
+
+     # Default sampling parameters for chat completion requests
+     _DEFAULT_SAMPLING_PARAMS: dict = {
+         "repetition_penalty": 1.0,
+         "temperature": 1.0,
+         "top_p": 1.0,
+         "top_k": 0,
+         "min_p": 0.0,
+     }
+
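
The `guided_*` fields above are deprecated in favor of `structured_outputs`; per their own descriptions, the migration is a straight rename of each key into the `structured_outputs` object. A hedged sketch of equivalent request fragments:

```python
# Old style (deprecated; still forwarded to structured_outputs internally):
old = {"guided_choice": ["yes", "no"]}
# New style:
new = {"structured_outputs": {"choice": ["yes", "no"]}}
```
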
+     def to_beam_search_params(
+             self, max_tokens: int,
+             default_sampling_params: dict) -> BeamSearchParams:
+
+         n = self.n if self.n is not None else 1
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+
+         return BeamSearchParams(
+             beam_width=n,
+             max_tokens=max_tokens,
+             ignore_eos=self.ignore_eos,
+             temperature=temperature,
+             length_penalty=self.length_penalty,
+             include_stop_str_in_output=self.include_stop_str_in_output,
+         )
+
+     def to_sampling_params(
+         self,
+         max_tokens: int,
+         logits_processor_pattern: Optional[str],
+         default_sampling_params: dict,
+     ) -> SamplingParams:
+
+         # Default parameters
+         if (repetition_penalty := self.repetition_penalty) is None:
+             repetition_penalty = default_sampling_params.get(
+                 "repetition_penalty",
+                 self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+             )
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+         if (top_p := self.top_p) is None:
+             top_p = default_sampling_params.get(
+                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+         if (top_k := self.top_k) is None:
+             top_k = default_sampling_params.get(
+                 "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+         if (min_p := self.min_p) is None:
+             min_p = default_sampling_params.get(
+                 "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+
+         prompt_logprobs = self.prompt_logprobs
+         if prompt_logprobs is None and self.echo:
+             prompt_logprobs = self.top_logprobs
+
+         # Forward deprecated guided_* parameters to structured_outputs
+         if self.structured_outputs is None:
+             kwargs = dict[str, Any](
+                 json=self.guided_json,
+                 regex=self.guided_regex,
+                 choice=self.guided_choice,
+                 grammar=self.guided_grammar,
+                 whitespace_pattern=self.guided_whitespace_pattern,
+                 structural_tag=self.structural_tag,
+             )
+             kwargs = {k: v for k, v in kwargs.items() if v is not None}
+             if len(kwargs) > 0:
+                 self.structured_outputs = StructuredOutputsParams(**kwargs)
+
+         response_format = self.response_format
+         json_schema_from_tool = self._get_json_schema_from_tool()
+         if response_format is not None or json_schema_from_tool is not None:
+             # If structured outputs wasn't already enabled,
+             # we must enable it for these features to work
+             if self.structured_outputs is None:
+                 self.structured_outputs = StructuredOutputsParams()
+
+             # Set structured output params for response format
+             if response_format is not None:
+                 if response_format.type == "json_object":
+                     self.structured_outputs.json_object = True
+                 elif response_format.type == "json_schema":
+                     json_schema = response_format.json_schema
+                     assert json_schema is not None
+                     self.structured_outputs.json = json_schema.json_schema
+                 elif response_format.type == "structural_tag":
+                     structural_tag = response_format
+                     assert structural_tag is not None and isinstance(
+                         structural_tag, StructuralTagResponseFormat)
+                     s_tag_obj = structural_tag.model_dump(by_alias=True)
+                     self.structured_outputs.structural_tag = json.dumps(
+                         s_tag_obj)
+
+             # Set structured output params for tool calling
+             if json_schema_from_tool is not None:
+                 self.structured_outputs.json = json_schema_from_tool
+
+         extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
+         if self.kv_transfer_params:
+             # Pass in kv_transfer_params via extra_args
+             extra_args["kv_transfer_params"] = self.kv_transfer_params
+         return SamplingParams.from_optional(
+             n=self.n,
+             best_of=self.best_of,
+             presence_penalty=self.presence_penalty,
+             frequency_penalty=self.frequency_penalty,
+             repetition_penalty=repetition_penalty,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             min_p=min_p,
+             seed=self.seed,
+             stop=self.stop,
+             stop_token_ids=self.stop_token_ids,
+             logprobs=self.top_logprobs if self.logprobs else None,
+             prompt_logprobs=prompt_logprobs,
+             ignore_eos=self.ignore_eos,
+             max_tokens=max_tokens,
+             min_tokens=self.min_tokens,
+             skip_special_tokens=self.skip_special_tokens,
+             spaces_between_special_tokens=self.spaces_between_special_tokens,
+             logits_processors=get_logits_processors(self.logits_processors,
+                                                     logits_processor_pattern),
+             include_stop_str_in_output=self.include_stop_str_in_output,
+             truncate_prompt_tokens=self.truncate_prompt_tokens,
+             output_kind=RequestOutputKind.DELTA if self.stream \
+             else RequestOutputKind.FINAL_ONLY,
+             structured_outputs=self.structured_outputs,
+             logit_bias=self.logit_bias,
+             bad_words=self.bad_words,
+             allowed_token_ids=self.allowed_token_ids,
+             extra_args=extra_args or None,
+         )
+
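
A hedged usage sketch of the `response_format` path through `to_sampling_params`, assuming the classes above are importable from this module:

```python
req = ChatCompletionRequest(
    messages=[{"role": "user", "content": "Give me JSON"}],
    response_format={"type": "json_object"},
)
params = req.to_sampling_params(
    max_tokens=64,
    logits_processor_pattern=None,
    default_sampling_params={},
)
# The call enables structured outputs as a side effect:
# req.structured_outputs.json_object is now True, and `params` carries it.
```
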
+     def _get_json_schema_from_tool(self) -> Optional[Union[str, dict]]:
+         # user has chosen to not use any tool
+         if self.tool_choice == "none" or self.tools is None:
+             return None
+
+         # user has chosen to use a named tool
+         if type(self.tool_choice) is ChatCompletionNamedToolChoiceParam:
+             tool_name = self.tool_choice.function.name
+             tools = {tool.function.name: tool.function for tool in self.tools}
+             if tool_name not in tools:
+                 raise ValueError(
+                     f"Tool '{tool_name}' has not been passed in `tools`.")
+             tool = tools[tool_name]
+             return tool.parameters
+
+         if self.tool_choice == "required":
+             # Pydantic schema generation cannot be used since the JSON schema
+             # has to be constructed for a specific instantiation of a tool list
+             # so that parameters of a function are correctly generated
+             # based on the chosen function name
+             def get_tool_schema(tool: ChatCompletionToolsParam) -> dict:
+                 return {
+                     "properties": {
+                         "name": {
+                             "type": "string",
+                             "enum": [tool.function.name]
+                         },
+                         # parameters are always generated as '{}' in the final
+                         # output if they are missing from the request
+                         # (i.e. are None or '{}') so the schema is
+                         # updated to produce an empty object in that case
+                         "parameters": tool.function.parameters
+                         if tool.function.parameters else {
+                             "type": "object",
+                             "properties": {}
+                         }
+                     },
+                     "required": ["name", "parameters"]
+                 }
+
+             def get_tool_schema_defs(
+                     tools: list[ChatCompletionToolsParam]) -> dict:
+                 all_defs = dict[str, dict[str, Any]]()
+                 for tool in tools:
+                     if tool.function.parameters is None:
+                         continue
+                     defs = tool.function.parameters.pop("$defs", {})
+                     for def_name, def_schema in defs.items():
+                         if def_name in all_defs and all_defs[
+                                 def_name] != def_schema:
+                             raise ValueError(
+                                 f"Tool definition '{def_name}' has "
+                                 "multiple schemas, which is not "
+                                 "supported.")
+                         else:
+                             all_defs[def_name] = def_schema
+                 return all_defs
+
+             json_schema = {
+                 "type": "array",
+                 "minItems": 1,
+                 "items": {
+                     "type": "object",
+                     "anyOf": [get_tool_schema(tool) for tool in self.tools]
+                 }
+             }
+             json_schema_defs = get_tool_schema_defs(self.tools)
+             if json_schema_defs:
+                 json_schema["$defs"] = json_schema_defs
+             return json_schema
+
+         return None
+
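
For `tool_choice="required"`, the method above constrains the model to emit a non-empty JSON array of tool calls. A sketch of the schema it builds for a single hypothetical tool named `get_weather` that declares no parameters:

```python
schema = {
    "type": "array",
    "minItems": 1,
    "items": {
        "type": "object",
        "anyOf": [{
            "properties": {
                "name": {"type": "string", "enum": ["get_weather"]},
                # Missing parameters are normalized to an empty object.
                "parameters": {"type": "object", "properties": {}},
            },
            "required": ["name", "parameters"],
        }],
    },
}
```
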
+     @model_validator(mode="before")
+     @classmethod
+     def validate_stream_options(cls, data):
+         if data.get("stream_options") and not data.get("stream"):
+             raise ValueError(
+                 "Stream options can only be defined when `stream=True`.")
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_logprobs(cls, data):
+         if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
+             if data.get("stream") and (prompt_logprobs > 0
+                                        or prompt_logprobs == -1):
+                 raise ValueError(
+                     "`prompt_logprobs` are not available when `stream=True`.")
+
+             if prompt_logprobs < 0 and prompt_logprobs != -1:
+                 raise ValueError(
+                     "`prompt_logprobs` must be a non-negative value or -1.")
+             if prompt_logprobs == -1 and not envs.VLLM_USE_V1:
+                 raise ValueError("`prompt_logprobs=-1` is only supported with "
+                                  "vLLM engine V1.")
+         if (top_logprobs := data.get("top_logprobs")) is not None:
+             if top_logprobs < 0 and top_logprobs != -1:
+                 raise ValueError(
+                     "`top_logprobs` must be a non-negative value or -1.")
+
+             if (top_logprobs == -1
+                     or top_logprobs > 0) and not data.get("logprobs"):
+                 raise ValueError(
+                     "When using `top_logprobs`, `logprobs` must be set to "
+                     "true.")
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_structured_outputs_count(cls, data):
+         if isinstance(data, ValueError):
+             raise data
+
+         if data.get("structured_outputs", None) is None:
+             return data
+
+         structured_outputs_kwargs = data['structured_outputs']
+         count = sum(
+             structured_outputs_kwargs.get(k) is not None
+             for k in ("json", "regex", "choice"))
+         # you can only use one kind of constraints for structured outputs
+         if count > 1:
+             raise ValueError(
+                 "You can only use one kind of constraints for structured "
+                 "outputs ('json', 'regex' or 'choice').")
+         # you can only either use structured outputs or tools, not both
+         if count > 1 and data.get("tool_choice", "none") not in (
+                 "none",
+                 "auto",
+                 "required",
+         ):
+             raise ValueError(
+                 "You can only either use constraints for structured outputs "
+                 "or tools, not both.")
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_tool_usage(cls, data):
+
+         # if "tool_choice" is not specified but tools are provided,
+         # default to "auto" tool_choice
+         if "tool_choice" not in data and data.get("tools"):
+             data["tool_choice"] = "auto"
+
+         # if "tool_choice" is "none" -- no validation is needed for tools
+         if "tool_choice" in data and data["tool_choice"] == "none":
+             return data
+
+         # if "tool_choice" is specified -- validation
+         if "tool_choice" in data and data["tool_choice"] is not None:
+
+             # ensure that if "tool choice" is specified, tools are present
+             if "tools" not in data or data["tools"] is None:
+                 raise ValueError(
+                     "When using `tool_choice`, `tools` must be set.")
+
+             # make sure that tool choice is either a named tool
+             # OR that it's set to "auto" or "required"
+             if data["tool_choice"] not in [
+                     "auto", "required"
+             ] and not isinstance(data["tool_choice"], dict):
+                 raise ValueError(
+                     f'Invalid value for `tool_choice`: {data["tool_choice"]}! '\
+                     'Only named tools, "none", "auto" or "required" '\
+                     'are supported.'
+                 )
+
+             # if tool_choice is "required" but the "tools" list is empty,
+             # override the data to behave like "none" to align with
+             # OpenAI’s behavior.
+             if data["tool_choice"] == "required" and isinstance(
+                     data["tools"], list) and len(data["tools"]) == 0:
+                 data["tool_choice"] = "none"
+                 del data["tools"]
+                 return data
+
+             # ensure that if "tool_choice" is specified as an object,
+             # it matches a valid tool
+             correct_usage_message = 'Correct usage: `{"type": "function",' \
+                 ' "function": {"name": "my_function"}}`'
+             if isinstance(data["tool_choice"], dict):
+                 valid_tool = False
+                 function = data["tool_choice"].get("function")
+                 if not isinstance(function, dict):
+                     raise ValueError(
+                         f"Invalid value for `function`: `{function}` in "
+                         f"`tool_choice`! {correct_usage_message}")
+                 if "name" not in function:
+                     raise ValueError(f"Expected field `name` in `function` in "
+                                      f"`tool_choice`! {correct_usage_message}")
+                 function_name = function["name"]
+                 if not isinstance(function_name,
+                                   str) or len(function_name) == 0:
+                     raise ValueError(
+                         f"Invalid `name` in `function`: `{function_name}`"
+                         f" in `tool_choice`! {correct_usage_message}")
+                 for tool in data["tools"]:
+                     if tool["function"]["name"] == function_name:
+                         valid_tool = True
+                         break
+                 if not valid_tool:
+                     raise ValueError(
+                         "The tool specified in `tool_choice` does not match any"
+                         " of the specified `tools`")
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_generation_prompt(cls, data):
+         if data.get("continue_final_message") and data.get(
+                 "add_generation_prompt"):
+             raise ValueError("Cannot set both `continue_final_message` and "
+                              "`add_generation_prompt` to True.")
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_cache_salt_support(cls, data):
+         if data.get("cache_salt") is not None:
+             if not envs.VLLM_USE_V1:
+                 raise ValueError(
+                     "Parameter 'cache_salt' is not supported with "
+                     "this instance of vLLM, which uses engine V0.")
+             if not isinstance(data["cache_salt"],
+                               str) or not data["cache_salt"]:
+                 raise ValueError("Parameter 'cache_salt' must be a "
+                                  "non-empty string if provided.")
+         return data
+
+
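
A small sketch of the `check_tool_usage` defaulting rule above: when `tools` are provided but `tool_choice` is omitted, the validator fills in `"auto"`. The tool definition here is hypothetical:

```python
data = {
    "messages": [{"role": "user", "content": "hi"}],
    "tools": [{"type": "function",
               "function": {"name": "get_weather", "parameters": None}}],
}
req = ChatCompletionRequest(**data)
assert req.tool_choice == "auto"  # defaulted by check_tool_usage
```
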
+ class CompletionRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/completions/create
+     model: Optional[str] = None
+     prompt: Optional[Union[list[int], list[list[int]], str, list[str]]] = None
+     best_of: Optional[int] = None
+     echo: Optional[bool] = False
+     frequency_penalty: Optional[float] = 0.0
+     logit_bias: Optional[dict[str, float]] = None
+     logprobs: Optional[int] = None
+     max_tokens: Optional[int] = 16
+     n: int = 1
+     presence_penalty: Optional[float] = 0.0
+     seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+     stop: Optional[Union[str, list[str]]] = []
+     stream: Optional[bool] = False
+     stream_options: Optional[StreamOptions] = None
+     suffix: Optional[str] = None
+     temperature: Optional[float] = None
+     top_p: Optional[float] = None
+     user: Optional[str] = None
+
+     # --8<-- [start:completion-sampling-params]
+     use_beam_search: bool = False
+     top_k: Optional[int] = None
+     min_p: Optional[float] = None
+     repetition_penalty: Optional[float] = None
+     length_penalty: float = 1.0
+     stop_token_ids: Optional[list[int]] = []
+     include_stop_str_in_output: bool = False
+     ignore_eos: bool = False
+     min_tokens: int = 0
+     skip_special_tokens: bool = True
+     spaces_between_special_tokens: bool = True
+     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+     allowed_token_ids: Optional[list[int]] = None
+     prompt_logprobs: Optional[int] = None
+     # --8<-- [end:completion-sampling-params]
+
+     # --8<-- [start:completion-extra-params]
+     prompt_embeds: Optional[Union[bytes, list[bytes]]] = None
+     add_special_tokens: bool = Field(
+         default=True,
+         description=(
+             "If true (the default), special tokens (e.g. BOS) will be added to "
+             "the prompt."),
+     )
+     response_format: Optional[AnyResponseFormat] = Field(
+         default=None,
+         description=(
+             "Similar to chat completion, this parameter specifies the format "
+             "of output. Only {'type': 'json_object'}, {'type': 'json_schema'}"
+             ", {'type': 'structural_tag'}, or {'type': 'text' } is supported."
+         ),
+     )
+     structured_outputs: Optional[StructuredOutputsParams] = Field(
+         default=None,
+         description="Additional kwargs for structured outputs",
+     )
+     guided_json: Optional[Union[str, dict, BaseModel]] = Field(
+         default=None,
+         description=(
+             "`guided_json` is deprecated. "
+             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
+             "Please pass `json` to `structured_outputs` instead."),
+     )
+     guided_regex: Optional[str] = Field(
+         default=None,
+         description=(
+             "`guided_regex` is deprecated. "
+             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
+             "Please pass `regex` to `structured_outputs` instead."),
+     )
+     guided_choice: Optional[list[str]] = Field(
+         default=None,
+         description=(
+             "`guided_choice` is deprecated. "
+             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
+             "Please pass `choice` to `structured_outputs` instead."),
+     )
+     guided_grammar: Optional[str] = Field(
+         default=None,
+         description=(
+             "`guided_grammar` is deprecated. "
+             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
+             "Please pass `grammar` to `structured_outputs` instead."),
+     )
+     guided_decoding_backend: Optional[str] = Field(
+         default=None,
+         description=(
+             "`guided_decoding_backend` is deprecated. "
+             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
+             "Please remove it from your request."),
+     )
+     guided_whitespace_pattern: Optional[str] = Field(
+         default=None,
+         description=(
+             "`guided_whitespace_pattern` is deprecated. "
+             "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
+             "Please pass `whitespace_pattern` to `structured_outputs` instead."
+         ),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+     request_id: str = Field(
+         default_factory=lambda: f"{random_uuid()}",
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the "
+             "response."),
+     )
+     logits_processors: Optional[LogitsProcessors] = Field(
+         default=None,
+         description=(
+             "A list of either qualified names of logits processors, or "
+             "constructor objects, to apply when sampling. A constructor is "
+             "a JSON object with a required 'qualname' field specifying the "
+             "qualified name of the processor class/factory, and optional "
+             "'args' and 'kwargs' fields containing positional and keyword "
+             "arguments. For example: {'qualname': "
+             "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
+             "{'param': 'value'}}."))
+
+     return_tokens_as_token_ids: Optional[bool] = Field(
+         default=None,
+         description=(
+             "If specified with 'logprobs', tokens are represented "
+             "as strings of the form 'token_id:{token_id}' so that tokens "
+             "that are not JSON-encodable can be identified."))
+     return_token_ids: Optional[bool] = Field(
+         default=None,
+         description=(
+             "If specified, the result will include token IDs alongside the "
+             "generated text. In streaming mode, prompt_token_ids is included "
+             "only in the first chunk, and token_ids contains the delta tokens "
+             "for each chunk. This is useful for debugging or when you "
+             "need to map generated text back to input tokens."))
+
+     cache_salt: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the prefix cache will be salted with the provided "
+             "string to prevent an attacker from guessing prompts in "
+             "multi-user environments. The salt should be random, protected "
+             "from access by 3rd parties, and long enough to be "
+             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+             "to 256 bit). Not supported by vLLM engine V0."))
+
+     kv_transfer_params: Optional[dict[str, Any]] = Field(
+         default=None,
+         description="KVTransfer parameters used for disaggregated serving.")
+
+     vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
+         default=None,
+         description=("Additional request parameters with string or "
+                      "numeric values, used by custom extensions."),
+     )
+
+     # --8<-- [end:completion-extra-params]
+
+     # Default sampling parameters for completion requests
+     _DEFAULT_SAMPLING_PARAMS: dict = {
+         "repetition_penalty": 1.0,
+         "temperature": 1.0,
+         "top_p": 1.0,
+         "top_k": 0,
+         "min_p": 0.0,
+     }
+
+     def to_beam_search_params(
+         self,
+         max_tokens: int,
+         default_sampling_params: Optional[dict] = None,
+     ) -> BeamSearchParams:
+
+         if default_sampling_params is None:
+             default_sampling_params = {}
+         n = self.n if self.n is not None else 1
+
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get("temperature", 1.0)
+
+         return BeamSearchParams(
+             beam_width=n,
+             max_tokens=max_tokens,
+             ignore_eos=self.ignore_eos,
+             temperature=temperature,
+             length_penalty=self.length_penalty,
+             include_stop_str_in_output=self.include_stop_str_in_output,
+         )
+
+     def to_sampling_params(
+         self,
+         max_tokens: int,
+         logits_processor_pattern: Optional[str],
+         default_sampling_params: Optional[dict] = None,
+     ) -> SamplingParams:
+
+         if default_sampling_params is None:
+             default_sampling_params = {}
+
+         # Default parameters
+         if (repetition_penalty := self.repetition_penalty) is None:
+             repetition_penalty = default_sampling_params.get(
+                 "repetition_penalty",
+                 self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+             )
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+         if (top_p := self.top_p) is None:
+             top_p = default_sampling_params.get(
+                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+         if (top_k := self.top_k) is None:
+             top_k = default_sampling_params.get(
+                 "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+         if (min_p := self.min_p) is None:
+             min_p = default_sampling_params.get(
+                 "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+
+         prompt_logprobs = self.prompt_logprobs
+         if prompt_logprobs is None and self.echo:
+             prompt_logprobs = self.logprobs
+
+         echo_without_generation = self.echo and self.max_tokens == 0
+
+         # Forward deprecated guided_* parameters to structured_outputs
+         if self.structured_outputs is None:
+             kwargs = dict[str, Any](
+                 json=self.guided_json,
+                 regex=self.guided_regex,
+                 choice=self.guided_choice,
+                 grammar=self.guided_grammar,
+                 whitespace_pattern=self.guided_whitespace_pattern,
+             )
+             kwargs = {k: v for k, v in kwargs.items() if v is not None}
+             if len(kwargs) > 0:
+                 self.structured_outputs = StructuredOutputsParams(**kwargs)
+
+         if (self.structured_outputs is not None
+                 and self.response_format is not None
+                 and self.response_format.type == "json_object"):
+             self.structured_outputs.json_object = True
+
+         extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
+         if self.kv_transfer_params:
+             # Pass in kv_transfer_params via extra_args
+             extra_args["kv_transfer_params"] = self.kv_transfer_params
+         return SamplingParams.from_optional(
+             n=self.n,
+             best_of=self.best_of,
+             presence_penalty=self.presence_penalty,
+             frequency_penalty=self.frequency_penalty,
+             repetition_penalty=repetition_penalty,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             min_p=min_p,
+             seed=self.seed,
+             stop=self.stop,
+             stop_token_ids=self.stop_token_ids,
+             logprobs=self.logprobs,
+             ignore_eos=self.ignore_eos,
+             max_tokens=max_tokens if not echo_without_generation else 1,
+             min_tokens=self.min_tokens,
+             prompt_logprobs=prompt_logprobs,
+             skip_special_tokens=self.skip_special_tokens,
+             spaces_between_special_tokens=self.spaces_between_special_tokens,
+             include_stop_str_in_output=self.include_stop_str_in_output,
+             logits_processors=get_logits_processors(self.logits_processors,
+                                                     logits_processor_pattern),
+             truncate_prompt_tokens=self.truncate_prompt_tokens,
+             output_kind=RequestOutputKind.DELTA if self.stream \
+             else RequestOutputKind.FINAL_ONLY,
+             structured_outputs=self.structured_outputs,
+             logit_bias=self.logit_bias,
+             allowed_token_ids=self.allowed_token_ids,
+             extra_args=extra_args or None,
+         )
+
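
A hedged sketch of the `echo_without_generation` corner case handled above: echoing with `max_tokens=0` still requests one token internally, and prompt logprobs fall back to `logprobs` when echoing:

```python
req = CompletionRequest(prompt="2+2=", echo=True, max_tokens=0, logprobs=1)
params = req.to_sampling_params(max_tokens=0, logits_processor_pattern=None)
# params.max_tokens == 1 (echo_without_generation forces at least one token)
# params.prompt_logprobs == 1 (falls back to `logprobs` because echo=True)
```
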
+     @model_validator(mode="before")
+     @classmethod
+     def check_structured_outputs_count(cls, data):
+         if data.get("structured_outputs", None) is None:
+             return data
+
+         structured_outputs_kwargs = data['structured_outputs']
+         count = sum(
+             structured_outputs_kwargs.get(k) is not None
+             for k in ("json", "regex", "choice"))
+         if count > 1:
+             raise ValueError(
+                 "You can only use one kind of constraints for structured "
+                 "outputs ('json', 'regex' or 'choice').")
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_logprobs(cls, data):
+         if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
+             if data.get("stream") and (prompt_logprobs > 0
+                                        or prompt_logprobs == -1):
+                 raise ValueError(
+                     "`prompt_logprobs` are not available when `stream=True`.")
+
+             if prompt_logprobs < 0 and prompt_logprobs != -1:
+                 raise ValueError(
+                     "`prompt_logprobs` must be a non-negative value or -1.")
+             if prompt_logprobs == -1 and not envs.VLLM_USE_V1:
+                 raise ValueError("`prompt_logprobs=-1` is only supported with "
+                                  "vLLM engine V1.")
+         if (logprobs := data.get("logprobs")) is not None and logprobs < 0:
+             raise ValueError("`logprobs` must be a non-negative value.")
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_stream_options(cls, data):
+         if data.get("stream_options") and not data.get("stream"):
+             raise ValueError(
+                 "Stream options can only be defined when `stream=True`.")
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_prompt_and_prompt_embeds(cls, data):
+         prompt = data.get("prompt")
+         prompt_embeds = data.get("prompt_embeds")
+
+         prompt_is_empty = (prompt is None
+                            or (isinstance(prompt, str) and prompt == ""))
+         embeds_is_empty = (prompt_embeds is None
+                            or (isinstance(prompt_embeds, list)
+                                and len(prompt_embeds) == 0))
+
+         if prompt_is_empty and embeds_is_empty:
+             raise ValueError(
+                 "Either prompt or prompt_embeds must be provided and non-empty."
+             )
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_cache_salt_support(cls, data):
+         if data.get("cache_salt") is not None:
+             if not envs.VLLM_USE_V1:
+                 raise ValueError(
+                     "Parameter 'cache_salt' is not supported with "
+                     "this instance of vLLM, which uses engine V0.")
+             if not isinstance(data["cache_salt"],
+                               str) or not data["cache_salt"]:
+                 raise ValueError("Parameter 'cache_salt' must be a "
+                                  "non-empty string if provided.")
+         return data
+
+
+ class EmbeddingCompletionRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/embeddings
+     model: Optional[str] = None
+     input: Union[list[int], list[list[int]], str, list[str]]
+     encoding_format: Literal["float", "base64"] = "float"
+     dimensions: Optional[int] = None
+     user: Optional[str] = None
+     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+
+     # --8<-- [start:embedding-extra-params]
+     add_special_tokens: bool = Field(
+         default=True,
+         description=(
+             "If true (the default), special tokens (e.g. BOS) will be added to "
+             "the prompt."),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+     request_id: str = Field(
+         default_factory=lambda: f"{random_uuid()}",
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the "
+             "response."),
+     )
+     normalize: Optional[bool] = None
+
+     # --8<-- [end:embedding-extra-params]
+
+     def to_pooling_params(self):
+         return PoolingParams(
+             truncate_prompt_tokens=self.truncate_prompt_tokens,
+             dimensions=self.dimensions,
+             normalize=self.normalize)
+
+
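
A short usage sketch of the embedding request above; the input and dimension values are made up:

```python
req = EmbeddingCompletionRequest(input="hello world", dimensions=256)
pooling = req.to_pooling_params()
# pooling.dimensions == 256; truncate_prompt_tokens and normalize pass
# through unchanged to PoolingParams.
```
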
+ class EmbeddingChatRequest(OpenAIBaseModel):
+     model: Optional[str] = None
+     messages: list[ChatCompletionMessageParam]
+
+     encoding_format: Literal["float", "base64"] = "float"
+     dimensions: Optional[int] = None
+     user: Optional[str] = None
+     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+
+     # --8<-- [start:chat-embedding-extra-params]
+     add_generation_prompt: bool = Field(
+         default=False,
+         description=
+         ("If true, the generation prompt will be added to the chat template. "
+          "This is a parameter used by the chat template in the tokenizer "
+          "config of the model."),
+     )
+
+     add_special_tokens: bool = Field(
+         default=False,
+         description=(
+             "If true, special tokens (e.g. BOS) will be added to the prompt "
+             "on top of what is added by the chat template. "
+             "For most models, the chat template takes care of adding the "
+             "special tokens so this should be set to false (as is the "
+             "default)."),
+     )
+     chat_template: Optional[str] = Field(
+         default=None,
+         description=(
+             "A Jinja template to use for this conversion. "
+             "As of transformers v4.44, default chat template is no longer "
+             "allowed, so you must provide a chat template if the tokenizer "
+             "does not define one."),
+     )
+     chat_template_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=(
+             "Additional keyword args to pass to the template renderer. "
+             "Will be accessible by the chat template."),
+     )
+     mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+     request_id: str = Field(
+         default_factory=lambda: f"{random_uuid()}",
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the "
+             "response."),
+     )
+     normalize: Optional[bool] = None
+     # --8<-- [end:chat-embedding-extra-params]
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_generation_prompt(cls, data):
+         if data.get("continue_final_message") and data.get(
+                 "add_generation_prompt"):
+             raise ValueError("Cannot set both `continue_final_message` and "
+                              "`add_generation_prompt` to True.")
+         return data
+
+     def to_pooling_params(self):
+         return PoolingParams(
+             truncate_prompt_tokens=self.truncate_prompt_tokens,
+             dimensions=self.dimensions,
+             normalize=self.normalize)
+
+
+ EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest]
1509
+
1510
+ PoolingCompletionRequest = EmbeddingCompletionRequest
1511
+ PoolingChatRequest = EmbeddingChatRequest
1512
+
1513
+ T = TypeVar("T")
1514
+
1515
+
1516
+ class IOProcessorRequest(OpenAIBaseModel, Generic[T]):
1517
+ model: Optional[str] = None
1518
+
1519
+ priority: int = Field(default=0)
1520
+ """
1521
+ The priority of the request (lower means earlier handling;
1522
+ default: 0). Any priority other than 0 will raise an error
1523
+ if the served model does not use priority scheduling.
1524
+ """
1525
+ data: T
1526
+ """
1527
+ When using plugins IOProcessor plugins, the actual input is processed
1528
+ by the plugin itself. Hence, we use a generic type for the request data
1529
+ """
1530
+ softmax: bool = True
1531
+
1532
+ def to_pooling_params(self):
1533
+ return PoolingParams(task="encode", softmax=self.softmax)
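+
+     # Sketch of the generic parameter (hypothetical payload type): a plugin
+     # can pin `data` to its own schema via pydantic's generic-model support:
+     #
+     #     class PluginInput(OpenAIBaseModel):
+     #         url: str
+     #
+     #     req = IOProcessorRequest[PluginInput].model_validate(
+     #         {"data": {"url": "https://example.com/input.bin"}})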
+
+
+ class IOProcessorResponse(OpenAIBaseModel, Generic[T]):
+
+     request_id: Optional[str] = None
+     """
+     The request_id associated with this response
+     """
+     created_at: int = Field(default_factory=lambda: int(time.time()))
+
+     data: T
+     """
+     When using IOProcessor plugins, the actual output is generated
+     by the plugin itself. Hence, we use a generic type for the response data.
+     """
+
+
+ PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest,
+                        IOProcessorRequest]
+
+
+ class ScoreRequest(OpenAIBaseModel):
+     model: Optional[str] = None
+     text_1: Union[list[str], str, ScoreMultiModalParam]
+     text_2: Union[list[str], str, ScoreMultiModalParam]
+     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+
+     # --8<-- [start:score-extra-params]
+
+     mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+
+     activation: Optional[bool] = None
+
+     # --8<-- [end:score-extra-params]
+
+     def to_pooling_params(self):
+         return PoolingParams(
+             truncate_prompt_tokens=self.truncate_prompt_tokens,
+             activation=self.activation)
+
+
+ class RerankRequest(OpenAIBaseModel):
+     model: Optional[str] = None
+     query: Union[str, ScoreMultiModalParam]
+     documents: Union[list[str], ScoreMultiModalParam]
+     top_n: int = Field(default_factory=lambda: 0)
+     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+
+     # --8<-- [start:rerank-extra-params]
+
+     mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+
+     activation: Optional[bool] = None
+
+     # --8<-- [end:rerank-extra-params]
+
+     def to_pooling_params(self):
+         return PoolingParams(
+             truncate_prompt_tokens=self.truncate_prompt_tokens,
+             activation=self.activation)
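+
+     # Illustrative rerank payload (values are made up). Per the common
+     # rerank API convention, `top_n` asks for only the highest-scoring
+     # documents to be returned:
+     #
+     #     {"model": "m", "query": "what is vLLM?",
+     #      "documents": ["a serving engine", "a database"], "top_n": 1}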
+
+
+ class RerankDocument(BaseModel):
+     text: Optional[str] = None
+     multi_modal: Optional[ScoreContentPartParam] = None
+
+
+ class RerankResult(BaseModel):
+     index: int
+     document: RerankDocument
+     relevance_score: float
+
+
+ class RerankUsage(BaseModel):
+     total_tokens: int
+
+
+ class RerankResponse(OpenAIBaseModel):
+     id: str
+     model: str
+     usage: RerankUsage
+     results: list[RerankResult]
+
+
+ class CompletionLogProbs(OpenAIBaseModel):
+     text_offset: list[int] = Field(default_factory=list)
+     token_logprobs: list[Optional[float]] = Field(default_factory=list)
+     tokens: list[str] = Field(default_factory=list)
+     top_logprobs: list[Optional[dict[str,
+                                      float]]] = Field(default_factory=list)
+
+
+ class CompletionResponseChoice(OpenAIBaseModel):
+     index: int
+     text: str
+     logprobs: Optional[CompletionLogProbs] = None
+     finish_reason: Optional[str] = None
+     stop_reason: Optional[Union[int, str]] = Field(
+         default=None,
+         description=(
+             "The stop string or token id that caused the completion "
+             "to stop, None if the completion finished for some other reason "
+             "including encountering the EOS token"),
+     )
+     token_ids: Optional[list[int]] = None  # For response
+     prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
+     prompt_token_ids: Optional[list[int]] = None  # For prompt
+
+
+ class CompletionResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+     object: Literal["text_completion"] = "text_completion"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     choices: list[CompletionResponseChoice]
+     service_tier: Optional[Literal["auto", "default", "flex", "scale",
+                                    "priority"]] = None
+     system_fingerprint: Optional[str] = None
+     usage: UsageInfo
+
+     # vLLM-specific fields that are not in OpenAI spec
+     kv_transfer_params: Optional[dict[str, Any]] = Field(
+         default=None, description="KVTransfer parameters.")
+
+
+ class CompletionResponseStreamChoice(OpenAIBaseModel):
+     index: int
+     text: str
+     logprobs: Optional[CompletionLogProbs] = None
+     finish_reason: Optional[str] = None
+     stop_reason: Optional[Union[int, str]] = Field(
+         default=None,
+         description=(
+             "The stop string or token id that caused the completion "
+             "to stop, None if the completion finished for some other reason "
+             "including encountering the EOS token"),
+     )
+     # not part of the OpenAI spec but for tracing the tokens;
+     # prompt tokens are put into the choice to align with
+     # CompletionResponseChoice
+     prompt_token_ids: Optional[list[int]] = None
+     token_ids: Optional[list[int]] = None
+
+
+ class CompletionStreamResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+     object: str = "text_completion"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     choices: list[CompletionResponseStreamChoice]
+     usage: Optional[UsageInfo] = Field(default=None)
+
+
+ class EmbeddingResponseData(OpenAIBaseModel):
+     index: int
+     object: str = "embedding"
+     embedding: Union[list[float], str]
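+
+     # When encoding_format="base64", `embedding` is a base64 string of
+     # packed float32 values (the OpenAI convention). A client-side decode
+     # sketch using only the standard library (assumes a little-endian host):
+     #
+     #     import array, base64
+     #     vec = array.array("f")
+     #     vec.frombytes(base64.b64decode(data.embedding))
+     #     floats = vec.tolist()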
+
+
+ class EmbeddingResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
+     object: str = "list"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     data: list[EmbeddingResponseData]
+     usage: UsageInfo
+
+
+ class PoolingResponseData(OpenAIBaseModel):
+     index: int
+     object: str = "pooling"
+     data: Union[list[list[float]], list[float], str]
+
+
+ class PoolingResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"pool-{random_uuid()}")
+     object: str = "list"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     data: list[PoolingResponseData]
+     usage: UsageInfo
+
+
+ class ScoreResponseData(OpenAIBaseModel):
+     index: int
+     object: str = "score"
+     score: float
+
+
+ class ScoreResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
+     object: str = "list"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     data: list[ScoreResponseData]
+     usage: UsageInfo
+
+
+ class ClassificationRequest(OpenAIBaseModel):
+     model: Optional[str] = None
+     input: Union[list[str], str]
+     truncate_prompt_tokens: Optional[int] = None
+     user: Optional[str] = None
+
+     # --8<-- [start:classification-extra-params]
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+
+     activation: Optional[bool] = None
+
+     # --8<-- [end:classification-extra-params]
+
+     def to_pooling_params(self):
+         return PoolingParams(
+             truncate_prompt_tokens=self.truncate_prompt_tokens,
+             activation=self.activation)
+
+
+ class ClassificationData(OpenAIBaseModel):
+     index: int
+     label: Optional[str]
+     probs: list[float]
+     num_classes: int
+
+
+ class ClassificationResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"classify-{random_uuid()}")
+     object: str = "list"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     data: list[ClassificationData]
+     usage: UsageInfo
+
+
+ class FunctionCall(OpenAIBaseModel):
+     name: str
+     arguments: str
+
+
+ class ToolCall(OpenAIBaseModel):
+     id: str = Field(default_factory=make_tool_call_id)
+     type: Literal["function"] = "function"
+     function: FunctionCall
+
+
+ class DeltaFunctionCall(BaseModel):
+     name: Optional[str] = None
+     arguments: Optional[str] = None
+
+
+ # a tool call delta where everything is optional
+ class DeltaToolCall(OpenAIBaseModel):
+     id: Optional[str] = None
+     type: Optional[Literal["function"]] = None
+     index: int
+     function: Optional[DeltaFunctionCall] = None
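+
+     # Accumulation sketch (not vLLM's own parser): streamed tool calls
+     # arrive as partial DeltaToolCall chunks keyed by `index`; clients
+     # typically concatenate the `arguments` fragments per index:
+     #
+     #     calls: dict[int, str] = {}
+     #     for delta in deltas:  # DeltaToolCall chunks from the stream
+     #         if delta.function and delta.function.arguments:
+     #             calls[delta.index] = (calls.get(delta.index, "")
+     #                                   + delta.function.arguments)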
+
+
+ class ExtractedToolCallInformation(BaseModel):
+     # indicate if tools were called
+     tools_called: bool
+
+     # extracted tool calls
+     tool_calls: list[ToolCall]
+
+     # content - per the OpenAI spec, content AND tool calls should rarely be
+     # returned together, but some models will do this intentionally
+     content: Optional[str] = None
+
+
+ class ChatMessage(OpenAIBaseModel):
+     role: str
+     content: Optional[str] = None
+     refusal: Optional[str] = None
+     annotations: Optional[OpenAIAnnotation] = None
+     audio: Optional[OpenAIChatCompletionAudio] = None
+     function_call: Optional[FunctionCall] = None
+     tool_calls: list[ToolCall] = Field(default_factory=list)
+
+     # vLLM-specific fields that are not in OpenAI spec
+     reasoning_content: Optional[str] = None
+
+
+ class ChatCompletionLogProb(OpenAIBaseModel):
+     token: str
+     logprob: float = -9999.0
+     bytes: Optional[list[int]] = None
+
+
+ class ChatCompletionLogProbsContent(ChatCompletionLogProb):
+     # Workaround: redefine the field-names cache so that it's not
+     # shared with the superclass.
+     field_names: ClassVar[Optional[set[str]]] = None
+     top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)
+
+
+ class ChatCompletionLogProbs(OpenAIBaseModel):
+     content: Optional[list[ChatCompletionLogProbsContent]] = None
+
+
+ class ChatCompletionResponseChoice(OpenAIBaseModel):
+     index: int
+     message: ChatMessage
+     logprobs: Optional[ChatCompletionLogProbs] = None
+     # per OpenAI spec this is the default
+     finish_reason: Optional[str] = "stop"
+     # not part of the OpenAI spec but included in vLLM for legacy reasons
+     stop_reason: Optional[Union[int, str]] = None
+     # not part of the OpenAI spec but is useful for tracing the tokens
+     # in agent scenarios
+     token_ids: Optional[list[int]] = None
+
+
+ class ChatCompletionResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+     object: Literal["chat.completion"] = "chat.completion"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     choices: list[ChatCompletionResponseChoice]
+     service_tier: Optional[Literal["auto", "default", "flex", "scale",
+                                    "priority"]] = None
+     system_fingerprint: Optional[str] = None
+     usage: UsageInfo
+
+     # vLLM-specific fields that are not in OpenAI spec
+     prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
+     prompt_token_ids: Optional[list[int]] = None
+     kv_transfer_params: Optional[dict[str, Any]] = Field(
+         default=None, description="KVTransfer parameters.")
+
+
+ class DeltaMessage(OpenAIBaseModel):
+     role: Optional[str] = None
+     content: Optional[str] = None
+     reasoning_content: Optional[str] = None
+     tool_calls: list[DeltaToolCall] = Field(default_factory=list)
+
+
+ class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
+     index: int
+     delta: DeltaMessage
+     logprobs: Optional[ChatCompletionLogProbs] = None
+     finish_reason: Optional[str] = None
+     stop_reason: Optional[Union[int, str]] = None
+     # not part of the OpenAI spec but for tracing the tokens
+     token_ids: Optional[list[int]] = None
+
+
+ class ChatCompletionStreamResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+     object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     choices: list[ChatCompletionResponseStreamChoice]
+     usage: Optional[UsageInfo] = Field(default=None)
+     # not part of the OpenAI spec but for tracing the tokens
+     prompt_token_ids: Optional[list[int]] = None
+
+
+ class TranscriptionResponseStreamChoice(OpenAIBaseModel):
+     delta: DeltaMessage
+     finish_reason: Optional[str] = None
+     stop_reason: Optional[Union[int, str]] = None
+
+
+ class TranscriptionStreamResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"trsc-{random_uuid()}")
+     object: Literal["transcription.chunk"] = "transcription.chunk"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     choices: list[TranscriptionResponseStreamChoice]
+     usage: Optional[UsageInfo] = Field(default=None)
+
+
+ class InputTokensDetails(OpenAIBaseModel):
+     cached_tokens: int
+
+
+ class OutputTokensDetails(OpenAIBaseModel):
+     reasoning_tokens: int = 0
+     tool_output_tokens: int = 0
+
+
+ class ResponseUsage(OpenAIBaseModel):
+     input_tokens: int
+     input_tokens_details: InputTokensDetails
+     output_tokens: int
+     output_tokens_details: OutputTokensDetails
+     total_tokens: int
+
+
+ class ResponsesResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
+     created_at: int = Field(default_factory=lambda: int(time.time()))
+     # error: Optional[ResponseError] = None
+     incomplete_details: Optional[IncompleteDetails] = None
+     instructions: Optional[str] = None
+     metadata: Optional[Metadata] = None
+     model: str
+     object: Literal["response"] = "response"
+     output: list[ResponseOutputItem]
+     # These are populated when enable_response_messages is set to True
+     # TODO: there is currently an issue where the content of harmony messages
+     # is not available when these are serialized; only metadata is available.
+     input_messages: Optional[list[ChatCompletionMessageParam]] = None
+     output_messages: Optional[list[ChatCompletionMessageParam]] = None
+     parallel_tool_calls: bool
+     temperature: float
+     tool_choice: ToolChoice
+     tools: list[Tool]
+     top_p: float
+     background: bool
+     max_output_tokens: int
+     max_tool_calls: Optional[int] = None
+     previous_response_id: Optional[str] = None
+     prompt: Optional[ResponsePrompt] = None
+     reasoning: Optional[Reasoning] = None
+     service_tier: Literal["auto", "default", "flex", "scale", "priority"]
+     status: ResponseStatus
+     text: Optional[ResponseTextConfig] = None
+     top_logprobs: Optional[int] = None
+     truncation: Literal["auto", "disabled"]
+     usage: Optional[ResponseUsage] = None
+     user: Optional[str] = None
+
+     @classmethod
+     def from_request(
+         cls,
+         request: ResponsesRequest,
+         sampling_params: SamplingParams,
+         model_name: str,
+         created_time: int,
+         output: list[ResponseOutputItem],
+         status: ResponseStatus,
+         usage: Optional[ResponseUsage] = None,
+         input_messages: Optional[list[ChatCompletionMessageParam]] = None,
+         output_messages: Optional[list[ChatCompletionMessageParam]] = None,
+     ) -> "ResponsesResponse":
+
+         incomplete_details: Optional[IncompleteDetails] = None
+         if status == 'incomplete':
+             incomplete_details = IncompleteDetails(reason='max_output_tokens')
+         # TODO: implement the other reason for incomplete_details,
+         # which is content_filter
+         # incomplete_details = IncompleteDetails(reason='content_filter')
+         return cls(
+             id=request.request_id,
+             created_at=created_time,
+             incomplete_details=incomplete_details,
+             instructions=request.instructions,
+             metadata=request.metadata,
+             model=model_name,
+             output=output,
+             input_messages=input_messages,
+             output_messages=output_messages,
+             parallel_tool_calls=request.parallel_tool_calls,
+             temperature=sampling_params.temperature,
+             tool_choice=request.tool_choice,
+             tools=request.tools,
+             top_p=sampling_params.top_p,
+             background=request.background,
+             max_output_tokens=sampling_params.max_tokens,
+             max_tool_calls=request.max_tool_calls,
+             previous_response_id=request.previous_response_id,
+             prompt=request.prompt,
+             reasoning=request.reasoning,
+             service_tier=request.service_tier,
+             status=status,
+             text=request.text,
+             top_logprobs=sampling_params.logprobs,
+             truncation=request.truncation,
+             user=request.user,
+             usage=usage,
+         )
+
+
+ # TODO: this code can be removed once
+ # https://github.com/openai/openai-python/issues/2634 has been resolved
+ class ResponseReasoningPartDoneEvent(OpenAIBaseModel):
+     content_index: int
+     """The index of the content part that is done."""
+
+     item_id: str
+     """The ID of the output item that the content part was added to."""
+
+     output_index: int
+     """The index of the output item that the content part was added to."""
+
+     part: ResponseReasoningTextContent
+     """The content part that is done."""
+
+     sequence_number: int
+     """The sequence number of this event."""
+
+     type: Literal["response.reasoning_part.done"]
+     """The type of the event. Always `response.reasoning_part.done`."""
+
+
+ # TODO: this code can be removed once
+ # https://github.com/openai/openai-python/issues/2634 has been resolved
+ class ResponseReasoningPartAddedEvent(OpenAIBaseModel):
+     content_index: int
+     """The index of the content part that was added."""
+
+     item_id: str
+     """The ID of the output item that the content part was added to."""
+
+     output_index: int
+     """The index of the output item that the content part was added to."""
+
+     part: ResponseReasoningTextContent
+     """The content part that was added."""
+
+     sequence_number: int
+     """The sequence number of this event."""
+
+     type: Literal["response.reasoning_part.added"]
+     """The type of the event. Always `response.reasoning_part.added`."""
+
+
+ StreamingResponsesResponse: TypeAlias = Union[
+     ResponseCreatedEvent,
+     ResponseInProgressEvent,
+     ResponseCompletedEvent,
+     ResponseOutputItemAddedEvent,
+     ResponseOutputItemDoneEvent,
+     ResponseContentPartAddedEvent,
+     ResponseContentPartDoneEvent,
+     ResponseReasoningTextDeltaEvent,
+     ResponseReasoningTextDoneEvent,
+     ResponseReasoningPartAddedEvent,
+     ResponseReasoningPartDoneEvent,
+     ResponseCodeInterpreterCallInProgressEvent,
+     ResponseCodeInterpreterCallCodeDeltaEvent,
+     ResponseWebSearchCallInProgressEvent,
+     ResponseWebSearchCallSearchingEvent,
+     ResponseWebSearchCallCompletedEvent,
+     ResponseCodeInterpreterCallCodeDoneEvent,
+     ResponseCodeInterpreterCallInterpretingEvent,
+     ResponseCodeInterpreterCallCompletedEvent,
+ ]
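+
+ # Parsing sketch: every member of this union carries a Literal `type`
+ # string, so a raw streamed event dict can be validated against the union
+ # with a pydantic TypeAdapter (illustrative; vLLM may construct these
+ # objects directly instead):
+ #
+ #     adapter = TypeAdapter(StreamingResponsesResponse)
+ #     event = adapter.validate_python(raw_event_dict)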
+
+
+ BatchRequestInputBody = Union[ChatCompletionRequest, EmbeddingRequest,
+                               ScoreRequest, RerankRequest]
+
+
+ class BatchRequestInput(OpenAIBaseModel):
+     """
+     The per-line object of the batch input file.
+
+     NOTE: Currently the `/v1/chat/completions`, `/v1/embeddings`, `/score`
+     and `/rerank` endpoints are supported.
+     """
+
+     # A developer-provided per-request id that will be used to match outputs to
+     # inputs. Must be unique for each request in a batch.
+     custom_id: str
+
+     # The HTTP method to be used for the request. Currently only POST is
+     # supported.
+     method: str
+
+     # The OpenAI API relative URL to be used for the request, e.g.
+     # /v1/chat/completions.
+     url: str
+
+     # The parameters of the request.
+     body: BatchRequestInputBody
+
+     @field_validator('body', mode='plain')
+     @classmethod
+     def check_type_for_url(cls, value: Any, info: ValidationInfo):
+         # Use url to disambiguate models
+         url: str = info.data["url"]
+         if url == "/v1/chat/completions":
+             return ChatCompletionRequest.model_validate(value)
+         if url == "/v1/embeddings":
+             return TypeAdapter(EmbeddingRequest).validate_python(value)
+         if url.endswith("/score"):
+             return ScoreRequest.model_validate(value)
+         if url.endswith("/rerank"):
+             return RerankRequest.model_validate(value)
+         return TypeAdapter(BatchRequestInputBody).validate_python(value)
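+
+     # Illustrative batch input line (JSONL, one object per line; values are
+     # made up). The `url` field selects which request model `body` is
+     # validated against:
+     #
+     #     {"custom_id": "req-1", "method": "POST",
+     #      "url": "/v1/chat/completions",
+     #      "body": {"model": "m",
+     #               "messages": [{"role": "user", "content": "hi"}]}}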
+
+
+ class BatchResponseData(OpenAIBaseModel):
+     # HTTP status code of the response.
+     status_code: int = 200
+
+     # A unique identifier for the API request.
+     request_id: str
+
+     # The body of the response.
+     body: Optional[Union[ChatCompletionResponse, EmbeddingResponse,
+                          ScoreResponse, RerankResponse]] = None
+
+
+ class BatchRequestOutput(OpenAIBaseModel):
+     """
+     The per-line object of the batch output and error files
+     """
+
+     id: str
+
+     # A developer-provided per-request id that will be used to match outputs to
+     # inputs.
+     custom_id: str
+
+     response: Optional[BatchResponseData]
+
+     # For requests that failed with a non-HTTP error, this will contain more
+     # information on the cause of the failure.
+     error: Optional[Any]
+
+
+ class TokenizeCompletionRequest(OpenAIBaseModel):
+     model: Optional[str] = None
+     prompt: str
+
+     add_special_tokens: bool = Field(
+         default=True,
+         description=(
+             "If true (the default), special tokens (e.g. BOS) will be added to "
+             "the prompt."),
+     )
+     return_token_strs: Optional[bool] = Field(
+         default=False,
+         description=("If true, also return the token strings "
+                      "corresponding to the token ids."),
+     )
+
+
+ class TokenizeChatRequest(OpenAIBaseModel):
+     model: Optional[str] = None
+     messages: list[ChatCompletionMessageParam]
+
+     add_generation_prompt: bool = Field(
+         default=True,
+         description=
+         ("If true, the generation prompt will be added to the chat template. "
+          "This is a parameter used by the chat template in the tokenizer "
+          "config of the model."),
+     )
+     return_token_strs: Optional[bool] = Field(
+         default=False,
+         description=("If true, also return the token strings "
+                      "corresponding to the token ids."),
+     )
+     continue_final_message: bool = Field(
+         default=False,
+         description=
+         ("If this is set, the chat will be formatted so that the final "
+          "message in the chat is open-ended, without any EOS tokens. The "
+          "model will continue this message rather than starting a new one. "
+          "This allows you to \"prefill\" part of the model's response for it. "
+          "Cannot be used at the same time as `add_generation_prompt`."),
+     )
+     add_special_tokens: bool = Field(
+         default=False,
+         description=(
+             "If true, special tokens (e.g. BOS) will be added to the prompt "
+             "on top of what is added by the chat template. "
+             "For most models, the chat template takes care of adding the "
+             "special tokens so this should be set to false (as is the "
+             "default)."),
+     )
+     chat_template: Optional[str] = Field(
+         default=None,
+         description=(
+             "A Jinja template to use for this conversion. "
+             "As of transformers v4.44, a default chat template is no longer "
+             "allowed, so you must provide a chat template if the tokenizer "
+             "does not define one."),
+     )
+     chat_template_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=(
+             "Additional keyword args to pass to the template renderer. "
+             "Will be accessible by the chat template."),
+     )
+     mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     tools: Optional[list[ChatCompletionToolsParam]] = Field(
+         default=None,
+         description=("A list of tools the model may call."),
+     )
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_generation_prompt(cls, data):
+         if data.get("continue_final_message") and data.get(
+                 "add_generation_prompt"):
+             raise ValueError("Cannot set both `continue_final_message` and "
+                              "`add_generation_prompt` to True.")
+         return data
+
+
+ TokenizeRequest = Union[TokenizeCompletionRequest, TokenizeChatRequest]
+
+
+ class TokenizeResponse(OpenAIBaseModel):
+     count: int
+     max_model_len: int
+     tokens: list[int]
+     token_strs: Optional[list[str]] = None
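+
+     # Illustrative round trip against the tokenize endpoint (token ids and
+     # limits are made up):
+     #
+     #     request:  {"prompt": "Hello world", "add_special_tokens": true}
+     #     response: {"count": 3, "max_model_len": 4096,
+     #                "tokens": [1, 15043, 3186], "token_strs": null}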
+
+
+ class DetokenizeRequest(OpenAIBaseModel):
+     model: Optional[str] = None
+     tokens: list[int]
+
+
+ class DetokenizeResponse(OpenAIBaseModel):
+     prompt: str
+
+
+ class TokenizerInfoResponse(OpenAIBaseModel):
+     """
+     Response containing tokenizer configuration
+     equivalent to tokenizer_config.json
+     """
+
+     model_config = ConfigDict(extra="allow")
+     tokenizer_class: str
+
+
+ class LoadLoRAAdapterRequest(BaseModel):
+     lora_name: str
+     lora_path: str
+
+
+ class UnloadLoRAAdapterRequest(BaseModel):
+     lora_name: str
+     lora_int_id: Optional[int] = Field(default=None)
+
+
+ ## Protocols for Audio
+ AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json",
+                                          "vtt"]
+
+
+ class TranscriptionRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/audio/createTranscription
+
+     file: UploadFile
+     """
+     The audio file object (not file name) to transcribe, in one of these
+     formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+     """
+
+     model: Optional[str] = None
+     """ID of the model to use.
+     """
+
+     language: Optional[str] = None
+     """The language of the input audio.
+
+     Supplying the input language in
+     [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
+     will improve accuracy and latency.
+     """
+
+     prompt: str = Field(default="")
+     """An optional text to guide the model's style or continue a previous audio
+     segment.
+
+     The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+     should match the audio language.
+     """
+
+     response_format: AudioResponseFormat = Field(default="json")
+     """
+     The format of the output, in one of these options: `json`, `text`, `srt`,
+     `verbose_json`, or `vtt`.
+     """
+
+     # TODO (varun): support automatically raising the temperature when it is
+     # set to 0 and certain thresholds are met.
+
+     timestamp_granularities: list[Literal["word", "segment"]] = Field(
+         alias="timestamp_granularities[]", default=[])
+     """The timestamp granularities to populate for this transcription.
+
+     `response_format` must be set to `verbose_json` to use timestamp
+     granularities. Either or both of these options are supported: `word`, or
+     `segment`. Note: There is no additional latency for segment timestamps,
+     but generating word timestamps incurs additional latency.
+     """
+
+     stream: Optional[bool] = False
+     """When set, it will enable output to be streamed in a similar fashion
+     as the Chat Completion endpoint.
+     """
+     # --8<-- [start:transcription-extra-params]
+     # Flattened stream option to simplify form data.
+     stream_include_usage: Optional[bool] = False
+     stream_continuous_usage_stats: Optional[bool] = False
+
+     vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
+         default=None,
+         description=("Additional request parameters with string or "
+                      "numeric values, used by custom extensions."),
+     )
+     # --8<-- [end:transcription-extra-params]
+
+     to_language: Optional[str] = None
+     """The language of the output audio we transcribe to.
+
+     Please note that this is not currently used by supported models, but it
+     is a placeholder for future use, matching the translation API.
+     """
+
+     # --8<-- [start:transcription-sampling-params]
+     temperature: float = Field(default=0.0)
+     """The sampling temperature, between 0 and 1.
+
+     Higher values like 0.8 will make the output more random, while lower values
+     like 0.2 will make it more focused / deterministic. If set to 0, the model
+     will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
+     to automatically increase the temperature until certain thresholds are hit.
+     """
+
+     top_p: Optional[float] = None
+     """Enables nucleus (top-p) sampling, where tokens are selected from the
+     smallest possible set whose cumulative probability exceeds `p`.
+     """
+
+     top_k: Optional[int] = None
+     """Limits sampling to the `k` most probable tokens at each step."""
+
+     min_p: Optional[float] = None
+     """Filters out tokens with a probability lower than `min_p`, ensuring a
+     minimum likelihood threshold during sampling.
+     """
+
+     seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+     """The seed to use for sampling."""
+
+     frequency_penalty: Optional[float] = 0.0
+     """The frequency penalty to use for sampling."""
+
+     repetition_penalty: Optional[float] = None
+     """The repetition penalty to use for sampling."""
+
+     presence_penalty: Optional[float] = 0.0
+     """The presence penalty to use for sampling."""
+     # --8<-- [end:transcription-sampling-params]
+
+     # Default sampling parameters for transcription requests.
+     _DEFAULT_SAMPLING_PARAMS: dict = {
+         "repetition_penalty": 1.0,
+         "temperature": 1.0,
+         "top_p": 1.0,
+         "top_k": 0,
+         "min_p": 0.0,
+     }
+
+     def to_sampling_params(
+             self,
+             default_max_tokens: int,
+             default_sampling_params: Optional[dict] = None) -> SamplingParams:
+
+         max_tokens = default_max_tokens
+
+         if default_sampling_params is None:
+             default_sampling_params = {}
+
+         # Default parameters
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+         if (top_p := self.top_p) is None:
+             top_p = default_sampling_params.get(
+                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+         if (top_k := self.top_k) is None:
+             top_k = default_sampling_params.get(
+                 "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+         if (min_p := self.min_p) is None:
+             min_p = default_sampling_params.get(
+                 "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+
+         if (repetition_penalty := self.repetition_penalty) is None:
+             repetition_penalty = default_sampling_params.get(
+                 "repetition_penalty",
+                 self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"])
+
+         return SamplingParams.from_optional(
+             temperature=temperature,
+             max_tokens=max_tokens,
+             seed=self.seed,
+             top_p=top_p,
+             top_k=top_k,
+             min_p=min_p,
+             frequency_penalty=self.frequency_penalty,
+             repetition_penalty=repetition_penalty,
+             presence_penalty=self.presence_penalty,
+             output_kind=RequestOutputKind.DELTA
+             if self.stream else RequestOutputKind.FINAL_ONLY,
+             extra_args=self.vllm_xargs)
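+
+     # Resolution sketch: each sampling knob falls back from the request
+     # value to the server's default_sampling_params, then to the hard-coded
+     # _DEFAULT_SAMPLING_PARAMS above. Illustrative values:
+     #
+     #     params = req.to_sampling_params(
+     #         default_max_tokens=448, default_sampling_params={"top_p": 0.9})
+     #     # a request with top_p=None resolves to 0.9 (server default) and
+     #     # top_k to 0 (hard-coded fallback)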
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_transcription_request(cls, data):
+         if isinstance(data.get("file"), str):
+             raise HTTPException(
+                 status_code=HTTPStatus.UNPROCESSABLE_ENTITY,
+                 detail="Expected 'file' to be a file-like object, not 'str'.",
+             )
+
+         stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
+         stream = data.get("stream", False)
+         if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
+             raise ValueError(
+                 "Stream options can only be defined when `stream=True`.")
+
+         return data
+
+
+ # Transcription response objects
+ class TranscriptionUsageAudio(OpenAIBaseModel):
+     type: Literal["duration"] = "duration"
+     seconds: int
+
+
+ class TranscriptionResponse(OpenAIBaseModel):
+     text: str
+     """The transcribed text."""
+     usage: TranscriptionUsageAudio
+
+
+ class TranscriptionWord(OpenAIBaseModel):
+     end: float
+     """End time of the word in seconds."""
+
+     start: float
+     """Start time of the word in seconds."""
+
+     word: str
+     """The text content of the word."""
+
+
+ class TranscriptionSegment(OpenAIBaseModel):
+     id: int
+     """Unique identifier of the segment."""
+
+     avg_logprob: float
+     """Average logprob of the segment.
+
+     If the value is lower than -1, consider the logprobs failed.
+     """
+
+     compression_ratio: float
+     """Compression ratio of the segment.
+
+     If the value is greater than 2.4, consider the compression failed.
+     """
+
+     end: float
+     """End time of the segment in seconds."""
+
+     no_speech_prob: float
+     """Probability of no speech in the segment.
+
+     If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
+     this segment silent.
+     """
+
+     seek: int
+     """Seek offset of the segment."""
+
+     start: float
+     """Start time of the segment in seconds."""
+
+     temperature: float
+     """Temperature parameter used for generating the segment."""
+
+     text: str
+     """Text content of the segment."""
+
+     tokens: list[int]
+     """Array of token IDs for the text content."""
+
+
+ class TranscriptionResponseVerbose(OpenAIBaseModel):
+     duration: str
+     """The duration of the input audio."""
+
+     language: str
+     """The language of the input audio."""
+
+     text: str
+     """The transcribed text."""
+
+     segments: Optional[list[TranscriptionSegment]] = None
+     """Segments of the transcribed text and their corresponding details."""
+
+     words: Optional[list[TranscriptionWord]] = None
+     """Extracted words and their corresponding timestamps."""
+
+
+ class TranslationResponseStreamChoice(OpenAIBaseModel):
+     delta: DeltaMessage
+     finish_reason: Optional[str] = None
+     stop_reason: Optional[Union[int, str]] = None
+
+
+ class TranslationStreamResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"trsl-{random_uuid()}")
+     object: Literal["translation.chunk"] = "translation.chunk"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     choices: list[TranslationResponseStreamChoice]
+     usage: Optional[UsageInfo] = Field(default=None)
+
+
+ class TranslationRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/audio/createTranslation
+
+     file: UploadFile
+     """
+     The audio file object (not file name) to translate, in one of these
+     formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+     """
+
+     model: Optional[str] = None
+     """ID of the model to use.
+     """
+
+     prompt: str = Field(default="")
+     """An optional text to guide the model's style or continue a previous audio
+     segment.
+
+     The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+     should match the audio language.
+     """
+
+     response_format: AudioResponseFormat = Field(default="json")
+     """
+     The format of the output, in one of these options: `json`, `text`, `srt`,
+     `verbose_json`, or `vtt`.
+     """
+
+     # TODO: support additional sampling parameters
+     # --8<-- [start:translation-sampling-params]
+     seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+     """The seed to use for sampling."""
+
+     temperature: float = Field(default=0.0)
+     """The sampling temperature, between 0 and 1.
+
+     Higher values like 0.8 will make the output more random, while lower values
+     like 0.2 will make it more focused / deterministic. If set to 0, the model
+     will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
+     to automatically increase the temperature until certain thresholds are hit.
+     """
+     # --8<-- [end:translation-sampling-params]
+
+     # --8<-- [start:translation-extra-params]
+     language: Optional[str] = None
+     """The language of the input audio we translate from.
+
+     Supplying the input language in
+     [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
+     will improve accuracy.
+     """
+
+     to_language: Optional[str] = None
+     """The language we translate the input audio into.
+
+     Please note that this is not supported by all models; refer to the specific
+     model documentation for more details.
+     For instance, Whisper only supports `to_language=en`.
+     """
+
+     stream: Optional[bool] = False
+     """Custom field not present in the original OpenAI definition. When set,
+     it will enable output to be streamed in a similar fashion as the Chat
+     Completion endpoint.
+     """
+     # Flattened stream option to simplify form data.
+     stream_include_usage: Optional[bool] = False
+     stream_continuous_usage_stats: Optional[bool] = False
+     # --8<-- [end:translation-extra-params]
+
+     # Default sampling parameters for translation requests.
+     _DEFAULT_SAMPLING_PARAMS: dict = {
+         "temperature": 0,
+     }
+
+     def to_sampling_params(
+             self,
+             default_max_tokens: int,
+             default_sampling_params: Optional[dict] = None) -> SamplingParams:
+
+         max_tokens = default_max_tokens
+
+         if default_sampling_params is None:
+             default_sampling_params = {}
+         # Default parameters
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+
+         return SamplingParams.from_optional(
+             temperature=temperature,
+             max_tokens=max_tokens,
+             seed=self.seed,
+             output_kind=RequestOutputKind.DELTA
+             if self.stream else RequestOutputKind.FINAL_ONLY)
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_stream_options(cls, data):
+         stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
+         stream = data.get("stream", False)
+         if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
+             raise ValueError(
+                 "Stream options can only be defined when `stream=True`.")
+
+         return data
+
+
+ # Translation response objects
+ class TranslationResponse(OpenAIBaseModel):
+     text: str
+     """The translated text."""
+
+
+ class TranslationWord(OpenAIBaseModel):
+     end: float
+     """End time of the word in seconds."""
+
+     start: float
+     """Start time of the word in seconds."""
+
+     word: str
+     """The text content of the word."""
+
+
+ class TranslationSegment(OpenAIBaseModel):
+     id: int
+     """Unique identifier of the segment."""
+
+     avg_logprob: float
+     """Average logprob of the segment.
+
+     If the value is lower than -1, consider the logprobs failed.
+     """
+
+     compression_ratio: float
+     """Compression ratio of the segment.
+
+     If the value is greater than 2.4, consider the compression failed.
+     """
+
+     end: float
+     """End time of the segment in seconds."""
+
+     no_speech_prob: float
+     """Probability of no speech in the segment.
+
+     If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
+     this segment silent.
+     """
+
+     seek: int
+     """Seek offset of the segment."""
+
+     start: float
+     """Start time of the segment in seconds."""
+
+     temperature: float
+     """Temperature parameter used for generating the segment."""
+
+     text: str
+     """Text content of the segment."""
+
+     tokens: list[int]
+     """Array of token IDs for the text content."""
+
+
+ class TranslationResponseVerbose(OpenAIBaseModel):
+     duration: str
+     """The duration of the input audio."""
+
+     language: str
+     """The language of the input audio."""
+
+     text: str
+     """The translated text."""
+
+     segments: Optional[list[TranslationSegment]] = None
+     """Segments of the translated text and their corresponding details."""
+
+     words: Optional[list[TranslationWord]] = None
+     """Extracted words and their corresponding timestamps."""