vllm-cpu-avx512vnni 0.10.2.post2 (cp312-cp312-manylinux_2_17_x86_64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of vllm-cpu-avx512vnni might be problematic.

Files changed (1395)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +220 -0
  3. vllm/_bc_linter.py +59 -0
  4. vllm/_custom_ops.py +2022 -0
  5. vllm/_ipex_ops.py +404 -0
  6. vllm/_version.py +34 -0
  7. vllm/adapter_commons/__init__.py +0 -0
  8. vllm/adapter_commons/layers.py +16 -0
  9. vllm/adapter_commons/models.py +106 -0
  10. vllm/adapter_commons/request.py +26 -0
  11. vllm/adapter_commons/utils.py +93 -0
  12. vllm/adapter_commons/worker_manager.py +39 -0
  13. vllm/assets/__init__.py +0 -0
  14. vllm/assets/audio.py +45 -0
  15. vllm/assets/base.py +41 -0
  16. vllm/assets/image.py +50 -0
  17. vllm/assets/video.py +138 -0
  18. vllm/attention/__init__.py +19 -0
  19. vllm/attention/backends/__init__.py +0 -0
  20. vllm/attention/backends/abstract.py +348 -0
  21. vllm/attention/backends/differential_flash_attn.py +935 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1499 -0
  23. vllm/attention/backends/flash_attn.py +933 -0
  24. vllm/attention/backends/flashmla.py +238 -0
  25. vllm/attention/backends/mla/__init__.py +0 -0
  26. vllm/attention/backends/mla/common.py +1310 -0
  27. vllm/attention/backends/placeholder_attn.py +340 -0
  28. vllm/attention/backends/rocm_aiter_mla.py +410 -0
  29. vllm/attention/backends/rocm_flash_attn.py +953 -0
  30. vllm/attention/backends/triton_mla.py +111 -0
  31. vllm/attention/backends/utils.py +610 -0
  32. vllm/attention/backends/xformers.py +805 -0
  33. vllm/attention/layer.py +552 -0
  34. vllm/attention/layers/__init__.py +0 -0
  35. vllm/attention/layers/chunked_local_attention.py +91 -0
  36. vllm/attention/layers/cross_attention.py +159 -0
  37. vllm/attention/layers/encoder_only_attention.py +86 -0
  38. vllm/attention/ops/__init__.py +0 -0
  39. vllm/attention/ops/chunked_prefill_paged_decode.py +405 -0
  40. vllm/attention/ops/common.py +139 -0
  41. vllm/attention/ops/flashmla.py +123 -0
  42. vllm/attention/ops/merge_attn_states.py +43 -0
  43. vllm/attention/ops/paged_attn.py +261 -0
  44. vllm/attention/ops/pallas_kv_cache_update.py +124 -0
  45. vllm/attention/ops/prefix_prefill.py +928 -0
  46. vllm/attention/ops/rocm_aiter_mla.py +104 -0
  47. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  48. vllm/attention/ops/triton_decode_attention.py +676 -0
  49. vllm/attention/ops/triton_flash_attention.py +984 -0
  50. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  51. vllm/attention/ops/triton_unified_attention.py +854 -0
  52. vllm/attention/selector.py +243 -0
  53. vllm/attention/utils/__init__.py +0 -0
  54. vllm/attention/utils/fa_utils.py +85 -0
  55. vllm/attention/utils/kv_sharing_utils.py +33 -0
  56. vllm/beam_search.py +87 -0
  57. vllm/benchmarks/__init__.py +0 -0
  58. vllm/benchmarks/datasets.py +2651 -0
  59. vllm/benchmarks/latency.py +170 -0
  60. vllm/benchmarks/lib/__init__.py +3 -0
  61. vllm/benchmarks/lib/endpoint_request_func.py +510 -0
  62. vllm/benchmarks/lib/ready_checker.py +72 -0
  63. vllm/benchmarks/lib/utils.py +80 -0
  64. vllm/benchmarks/serve.py +1247 -0
  65. vllm/benchmarks/throughput.py +696 -0
  66. vllm/collect_env.py +823 -0
  67. vllm/compilation/__init__.py +0 -0
  68. vllm/compilation/activation_quant_fusion.py +193 -0
  69. vllm/compilation/backends.py +641 -0
  70. vllm/compilation/base_static_graph.py +51 -0
  71. vllm/compilation/collective_fusion.py +1190 -0
  72. vllm/compilation/compiler_interface.py +572 -0
  73. vllm/compilation/counter.py +47 -0
  74. vllm/compilation/cuda_graph.py +193 -0
  75. vllm/compilation/cuda_piecewise_backend.py +117 -0
  76. vllm/compilation/decorators.py +316 -0
  77. vllm/compilation/fix_functionalization.py +208 -0
  78. vllm/compilation/fusion.py +600 -0
  79. vllm/compilation/fusion_attn.py +303 -0
  80. vllm/compilation/fx_utils.py +84 -0
  81. vllm/compilation/inductor_pass.py +136 -0
  82. vllm/compilation/monitor.py +57 -0
  83. vllm/compilation/multi_output_match.py +109 -0
  84. vllm/compilation/noop_elimination.py +165 -0
  85. vllm/compilation/pass_manager.py +88 -0
  86. vllm/compilation/sequence_parallelism.py +484 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  88. vllm/compilation/vllm_inductor_pass.py +50 -0
  89. vllm/compilation/wrapper.py +138 -0
  90. vllm/config/__init__.py +3921 -0
  91. vllm/config/cache.py +214 -0
  92. vllm/config/compilation.py +580 -0
  93. vllm/config/kv_events.py +50 -0
  94. vllm/config/kv_transfer.py +111 -0
  95. vllm/config/load.py +113 -0
  96. vllm/config/lora.py +132 -0
  97. vllm/config/parallel.py +446 -0
  98. vllm/config/scheduler.py +304 -0
  99. vllm/config/utils.py +29 -0
  100. vllm/connections.py +174 -0
  101. vllm/core/__init__.py +0 -0
  102. vllm/core/block/__init__.py +0 -0
  103. vllm/core/block/block_table.py +399 -0
  104. vllm/core/block/common.py +371 -0
  105. vllm/core/block/cpu_gpu_block_allocator.py +439 -0
  106. vllm/core/block/interfaces.py +319 -0
  107. vllm/core/block/naive_block.py +466 -0
  108. vllm/core/block/prefix_caching_block.py +1135 -0
  109. vllm/core/block/utils.py +28 -0
  110. vllm/core/block_manager.py +523 -0
  111. vllm/core/evictor.py +157 -0
  112. vllm/core/interfaces.py +139 -0
  113. vllm/core/placeholder_block_space_manager.py +103 -0
  114. vllm/core/scheduler.py +2028 -0
  115. vllm/device_allocator/__init__.py +0 -0
  116. vllm/device_allocator/cumem.py +286 -0
  117. vllm/distributed/__init__.py +6 -0
  118. vllm/distributed/communication_op.py +41 -0
  119. vllm/distributed/device_communicators/__init__.py +0 -0
  120. vllm/distributed/device_communicators/all2all.py +259 -0
  121. vllm/distributed/device_communicators/all_reduce_utils.py +292 -0
  122. vllm/distributed/device_communicators/base_device_communicator.py +277 -0
  123. vllm/distributed/device_communicators/cpu_communicator.py +201 -0
  124. vllm/distributed/device_communicators/cuda_communicator.py +294 -0
  125. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  126. vllm/distributed/device_communicators/custom_all_reduce.py +311 -0
  127. vllm/distributed/device_communicators/pynccl.py +290 -0
  128. vllm/distributed/device_communicators/pynccl_wrapper.py +382 -0
  129. vllm/distributed/device_communicators/quick_all_reduce.py +278 -0
  130. vllm/distributed/device_communicators/ray_communicator.py +258 -0
  131. vllm/distributed/device_communicators/shm_broadcast.py +585 -0
  132. vllm/distributed/device_communicators/symm_mem.py +136 -0
  133. vllm/distributed/device_communicators/tpu_communicator.py +102 -0
  134. vllm/distributed/device_communicators/xpu_communicator.py +69 -0
  135. vllm/distributed/eplb/__init__.py +8 -0
  136. vllm/distributed/eplb/eplb_state.py +619 -0
  137. vllm/distributed/eplb/rebalance_algo.py +234 -0
  138. vllm/distributed/eplb/rebalance_execute.py +424 -0
  139. vllm/distributed/kv_events.py +362 -0
  140. vllm/distributed/kv_transfer/README.md +29 -0
  141. vllm/distributed/kv_transfer/__init__.py +13 -0
  142. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  143. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  144. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  145. vllm/distributed/kv_transfer/kv_connector/factory.py +108 -0
  146. vllm/distributed/kv_transfer/kv_connector/utils.py +246 -0
  147. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  148. vllm/distributed/kv_transfer/kv_connector/v1/base.py +356 -0
  149. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +167 -0
  150. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +266 -0
  151. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1319 -0
  152. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  153. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +484 -0
  154. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +542 -0
  155. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +266 -0
  156. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +414 -0
  157. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  158. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  159. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  160. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  161. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  162. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  163. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +290 -0
  164. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  165. vllm/distributed/kv_transfer/kv_transfer_state.py +73 -0
  166. vllm/distributed/parallel_state.py +1489 -0
  167. vllm/distributed/tpu_distributed_utils.py +178 -0
  168. vllm/distributed/utils.py +536 -0
  169. vllm/engine/__init__.py +0 -0
  170. vllm/engine/arg_utils.py +1857 -0
  171. vllm/engine/async_llm_engine.py +1044 -0
  172. vllm/engine/async_timeout.py +173 -0
  173. vllm/engine/llm_engine.py +1849 -0
  174. vllm/engine/metrics.py +577 -0
  175. vllm/engine/metrics_types.py +84 -0
  176. vllm/engine/multiprocessing/__init__.py +145 -0
  177. vllm/engine/multiprocessing/client.py +643 -0
  178. vllm/engine/multiprocessing/engine.py +470 -0
  179. vllm/engine/output_processor/__init__.py +0 -0
  180. vllm/engine/output_processor/interfaces.py +61 -0
  181. vllm/engine/output_processor/single_step.py +145 -0
  182. vllm/engine/output_processor/stop_checker.py +131 -0
  183. vllm/engine/output_processor/util.py +28 -0
  184. vllm/engine/protocol.py +343 -0
  185. vllm/entrypoints/__init__.py +0 -0
  186. vllm/entrypoints/api_server.py +178 -0
  187. vllm/entrypoints/chat_utils.py +1535 -0
  188. vllm/entrypoints/cli/__init__.py +12 -0
  189. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  190. vllm/entrypoints/cli/benchmark/base.py +25 -0
  191. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  192. vllm/entrypoints/cli/benchmark/main.py +58 -0
  193. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  194. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  195. vllm/entrypoints/cli/collect_env.py +36 -0
  196. vllm/entrypoints/cli/main.py +60 -0
  197. vllm/entrypoints/cli/openai.py +214 -0
  198. vllm/entrypoints/cli/run_batch.py +69 -0
  199. vllm/entrypoints/cli/serve.py +232 -0
  200. vllm/entrypoints/cli/types.py +29 -0
  201. vllm/entrypoints/constants.py +10 -0
  202. vllm/entrypoints/context.py +444 -0
  203. vllm/entrypoints/harmony_utils.py +431 -0
  204. vllm/entrypoints/launcher.py +168 -0
  205. vllm/entrypoints/llm.py +1579 -0
  206. vllm/entrypoints/logger.py +79 -0
  207. vllm/entrypoints/openai/__init__.py +0 -0
  208. vllm/entrypoints/openai/api_server.py +2011 -0
  209. vllm/entrypoints/openai/cli_args.py +281 -0
  210. vllm/entrypoints/openai/logits_processors.py +90 -0
  211. vllm/entrypoints/openai/protocol.py +2590 -0
  212. vllm/entrypoints/openai/run_batch.py +497 -0
  213. vllm/entrypoints/openai/serving_chat.py +1591 -0
  214. vllm/entrypoints/openai/serving_classification.py +176 -0
  215. vllm/entrypoints/openai/serving_completion.py +688 -0
  216. vllm/entrypoints/openai/serving_embedding.py +632 -0
  217. vllm/entrypoints/openai/serving_engine.py +996 -0
  218. vllm/entrypoints/openai/serving_models.py +288 -0
  219. vllm/entrypoints/openai/serving_pooling.py +277 -0
  220. vllm/entrypoints/openai/serving_responses.py +1690 -0
  221. vllm/entrypoints/openai/serving_score.py +479 -0
  222. vllm/entrypoints/openai/serving_tokenization.py +196 -0
  223. vllm/entrypoints/openai/serving_transcription.py +136 -0
  224. vllm/entrypoints/openai/speech_to_text.py +388 -0
  225. vllm/entrypoints/openai/tool_parsers/__init__.py +51 -0
  226. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  227. vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py +367 -0
  228. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  229. vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +185 -0
  230. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  231. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  232. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +418 -0
  233. vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py +372 -0
  234. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  235. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  236. vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +377 -0
  237. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  238. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +269 -0
  239. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +816 -0
  240. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  241. vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +73 -0
  242. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  243. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  244. vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +707 -0
  245. vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +679 -0
  246. vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py +296 -0
  247. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  248. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +524 -0
  249. vllm/entrypoints/renderer.py +395 -0
  250. vllm/entrypoints/score_utils.py +232 -0
  251. vllm/entrypoints/ssl.py +75 -0
  252. vllm/entrypoints/tool.py +139 -0
  253. vllm/entrypoints/tool_server.py +195 -0
  254. vllm/entrypoints/utils.py +328 -0
  255. vllm/env_override.py +23 -0
  256. vllm/envs.py +1354 -0
  257. vllm/executor/__init__.py +0 -0
  258. vllm/executor/executor_base.py +378 -0
  259. vllm/executor/mp_distributed_executor.py +244 -0
  260. vllm/executor/msgspec_utils.py +35 -0
  261. vllm/executor/multiproc_worker_utils.py +279 -0
  262. vllm/executor/ray_distributed_executor.py +699 -0
  263. vllm/executor/ray_utils.py +410 -0
  264. vllm/executor/uniproc_executor.py +152 -0
  265. vllm/forward_context.py +273 -0
  266. vllm/inputs/__init__.py +44 -0
  267. vllm/inputs/data.py +356 -0
  268. vllm/inputs/parse.py +151 -0
  269. vllm/inputs/preprocess.py +973 -0
  270. vllm/inputs/registry.py +251 -0
  271. vllm/logger.py +229 -0
  272. vllm/logging_utils/__init__.py +8 -0
  273. vllm/logging_utils/dump_input.py +81 -0
  274. vllm/logging_utils/formatter.py +79 -0
  275. vllm/logits_process.py +119 -0
  276. vllm/logprobs.py +28 -0
  277. vllm/lora/__init__.py +0 -0
  278. vllm/lora/layers/__init__.py +34 -0
  279. vllm/lora/layers/base.py +69 -0
  280. vllm/lora/layers/base_linear.py +184 -0
  281. vllm/lora/layers/column_parallel_linear.py +622 -0
  282. vllm/lora/layers/logits_processor.py +247 -0
  283. vllm/lora/layers/qkv_x_parallel_linear.py +8 -0
  284. vllm/lora/layers/replicated_linear.py +61 -0
  285. vllm/lora/layers/row_parallel_linear.py +201 -0
  286. vllm/lora/layers/utils.py +60 -0
  287. vllm/lora/layers/vocal_parallel_embedding.py +172 -0
  288. vllm/lora/lora.py +199 -0
  289. vllm/lora/models.py +792 -0
  290. vllm/lora/ops/__init__.py +0 -0
  291. vllm/lora/ops/ipex_ops/__init__.py +7 -0
  292. vllm/lora/ops/ipex_ops/lora_ops.py +44 -0
  293. vllm/lora/ops/torch_ops/__init__.py +16 -0
  294. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  295. vllm/lora/ops/triton_ops/__init__.py +12 -0
  296. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  297. vllm/lora/ops/triton_ops/lora_expand_op.py +291 -0
  298. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  299. vllm/lora/ops/triton_ops/lora_shrink_op.py +245 -0
  300. vllm/lora/ops/triton_ops/utils.py +126 -0
  301. vllm/lora/ops/xla_ops/__init__.py +7 -0
  302. vllm/lora/ops/xla_ops/lora_ops.py +145 -0
  303. vllm/lora/peft_helper.py +127 -0
  304. vllm/lora/punica_wrapper/__init__.py +10 -0
  305. vllm/lora/punica_wrapper/punica_base.py +458 -0
  306. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  307. vllm/lora/punica_wrapper/punica_gpu.py +279 -0
  308. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  309. vllm/lora/punica_wrapper/punica_tpu.py +391 -0
  310. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  311. vllm/lora/punica_wrapper/utils.py +136 -0
  312. vllm/lora/request.py +99 -0
  313. vllm/lora/resolver.py +85 -0
  314. vllm/lora/utils.py +246 -0
  315. vllm/lora/worker_manager.py +256 -0
  316. vllm/model_executor/__init__.py +16 -0
  317. vllm/model_executor/custom_op.py +194 -0
  318. vllm/model_executor/layers/__init__.py +0 -0
  319. vllm/model_executor/layers/activation.py +575 -0
  320. vllm/model_executor/layers/attention_layer_base.py +23 -0
  321. vllm/model_executor/layers/fla/__init__.py +8 -0
  322. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  323. vllm/model_executor/layers/fla/ops/chunk.py +225 -0
  324. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +290 -0
  325. vllm/model_executor/layers/fla/ops/chunk_o.py +177 -0
  326. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +140 -0
  327. vllm/model_executor/layers/fla/ops/cumsum.py +226 -0
  328. vllm/model_executor/layers/fla/ops/fused_recurrent.py +366 -0
  329. vllm/model_executor/layers/fla/ops/index.py +39 -0
  330. vllm/model_executor/layers/fla/ops/l2norm.py +143 -0
  331. vllm/model_executor/layers/fla/ops/layernorm_guard.py +337 -0
  332. vllm/model_executor/layers/fla/ops/op.py +39 -0
  333. vllm/model_executor/layers/fla/ops/solve_tril.py +365 -0
  334. vllm/model_executor/layers/fla/ops/utils.py +180 -0
  335. vllm/model_executor/layers/fla/ops/wy_fast.py +114 -0
  336. vllm/model_executor/layers/fused_moe/__init__.py +80 -0
  337. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +304 -0
  338. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +164 -0
  339. vllm/model_executor/layers/fused_moe/config.py +497 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  559. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  560. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +297 -0
  561. vllm/model_executor/layers/fused_moe/cutlass_moe.py +996 -0
  562. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +370 -0
  563. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +413 -0
  564. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +280 -0
  565. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +229 -0
  566. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +243 -0
  567. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +97 -0
  568. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1042 -0
  569. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +240 -0
  570. vllm/model_executor/layers/fused_moe/fused_moe.py +2081 -0
  571. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +247 -0
  572. vllm/model_executor/layers/fused_moe/layer.py +1951 -0
  573. vllm/model_executor/layers/fused_moe/modular_kernel.py +892 -0
  574. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +87 -0
  575. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  576. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +205 -0
  577. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  578. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +321 -0
  579. vllm/model_executor/layers/fused_moe/prepare_finalize.py +72 -0
  580. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +431 -0
  581. vllm/model_executor/layers/fused_moe/routing_simulator.py +291 -0
  582. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +146 -0
  583. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +171 -0
  584. vllm/model_executor/layers/fused_moe/trtllm_moe.py +197 -0
  585. vllm/model_executor/layers/fused_moe/utils.py +270 -0
  586. vllm/model_executor/layers/layernorm.py +381 -0
  587. vllm/model_executor/layers/lightning_attn.py +661 -0
  588. vllm/model_executor/layers/linear.py +1567 -0
  589. vllm/model_executor/layers/logits_processor.py +199 -0
  590. vllm/model_executor/layers/mamba/__init__.py +0 -0
  591. vllm/model_executor/layers/mamba/abstract.py +45 -0
  592. vllm/model_executor/layers/mamba/linear_attn.py +432 -0
  593. vllm/model_executor/layers/mamba/mamba2_metadata.py +186 -0
  594. vllm/model_executor/layers/mamba/mamba_mixer.py +517 -0
  595. vllm/model_executor/layers/mamba/mamba_mixer2.py +803 -0
  596. vllm/model_executor/layers/mamba/mamba_utils.py +202 -0
  597. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  598. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +982 -0
  599. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +168 -0
  600. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  601. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +262 -0
  602. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +574 -0
  603. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +751 -0
  604. vllm/model_executor/layers/mamba/ops/ssd_combined.py +248 -0
  605. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +248 -0
  606. vllm/model_executor/layers/mamba/short_conv.py +270 -0
  607. vllm/model_executor/layers/mla.py +158 -0
  608. vllm/model_executor/layers/pooler.py +732 -0
  609. vllm/model_executor/layers/quantization/__init__.py +157 -0
  610. vllm/model_executor/layers/quantization/auto_round.py +388 -0
  611. vllm/model_executor/layers/quantization/awq.py +228 -0
  612. vllm/model_executor/layers/quantization/awq_marlin.py +548 -0
  613. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  614. vllm/model_executor/layers/quantization/base_config.py +164 -0
  615. vllm/model_executor/layers/quantization/bitblas.py +464 -0
  616. vllm/model_executor/layers/quantization/bitsandbytes.py +621 -0
  617. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  618. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +795 -0
  619. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1651 -0
  620. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +27 -0
  621. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +366 -0
  622. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  623. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  624. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +105 -0
  625. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +161 -0
  626. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +169 -0
  627. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +135 -0
  628. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  629. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +156 -0
  630. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  631. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  632. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +227 -0
  633. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +135 -0
  634. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +21 -0
  635. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  636. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  637. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  638. vllm/model_executor/layers/quantization/deepgemm.py +81 -0
  639. vllm/model_executor/layers/quantization/deepspeedfp.py +196 -0
  640. vllm/model_executor/layers/quantization/experts_int8.py +215 -0
  641. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  642. vllm/model_executor/layers/quantization/fp8.py +1179 -0
  643. vllm/model_executor/layers/quantization/gguf.py +597 -0
  644. vllm/model_executor/layers/quantization/gptq.py +300 -0
  645. vllm/model_executor/layers/quantization/gptq_bitblas.py +448 -0
  646. vllm/model_executor/layers/quantization/gptq_marlin.py +700 -0
  647. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  648. vllm/model_executor/layers/quantization/hqq_marlin.py +333 -0
  649. vllm/model_executor/layers/quantization/inc.py +61 -0
  650. vllm/model_executor/layers/quantization/input_quant_fp8.py +103 -0
  651. vllm/model_executor/layers/quantization/ipex_quant.py +410 -0
  652. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  653. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +91 -0
  654. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +93 -0
  655. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  656. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +302 -0
  657. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +92 -0
  658. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +117 -0
  659. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +92 -0
  660. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  661. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +144 -0
  662. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +139 -0
  663. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  664. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +89 -0
  665. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +163 -0
  666. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +206 -0
  667. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  668. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  669. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
  670. vllm/model_executor/layers/quantization/kv_cache.py +139 -0
  671. vllm/model_executor/layers/quantization/modelopt.py +1548 -0
  672. vllm/model_executor/layers/quantization/moe_wna16.py +473 -0
  673. vllm/model_executor/layers/quantization/mxfp4.py +951 -0
  674. vllm/model_executor/layers/quantization/petit.py +306 -0
  675. vllm/model_executor/layers/quantization/ptpc_fp8.py +129 -0
  676. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  677. vllm/model_executor/layers/quantization/quark/quark.py +431 -0
  678. vllm/model_executor/layers/quantization/quark/quark_moe.py +434 -0
  679. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  680. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  681. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +112 -0
  682. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +163 -0
  683. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  684. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  685. vllm/model_executor/layers/quantization/rtn.py +456 -0
  686. vllm/model_executor/layers/quantization/schema.py +86 -0
  687. vllm/model_executor/layers/quantization/torchao.py +214 -0
  688. vllm/model_executor/layers/quantization/tpu_int8.py +125 -0
  689. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  690. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  691. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +210 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  763. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  764. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  765. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  766. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  767. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  768. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  769. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  770. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  771. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  772. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  773. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  774. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  775. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  776. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  777. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  778. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  779. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  780. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  781. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  786. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  902. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  903. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +85 -0
  904. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +258 -0
  905. vllm/model_executor/layers/quantization/utils/fp8_utils.py +795 -0
  906. vllm/model_executor/layers/quantization/utils/gptq_utils.py +96 -0
  907. vllm/model_executor/layers/quantization/utils/int8_utils.py +492 -0
  908. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  909. vllm/model_executor/layers/quantization/utils/machete_utils.py +50 -0
  910. vllm/model_executor/layers/quantization/utils/marlin_utils.py +479 -0
  911. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +396 -0
  912. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +345 -0
  913. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  914. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  915. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +132 -0
  916. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +20 -0
  917. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +137 -0
  918. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +59 -0
  919. vllm/model_executor/layers/quantization/utils/petit_utils.py +122 -0
  920. vllm/model_executor/layers/quantization/utils/quant_utils.py +627 -0
  921. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +458 -0
  922. vllm/model_executor/layers/resampler.py +270 -0
  923. vllm/model_executor/layers/rotary_embedding/__init__.py +190 -0
  924. vllm/model_executor/layers/rotary_embedding/base.py +156 -0
  925. vllm/model_executor/layers/rotary_embedding/common.py +105 -0
  926. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +140 -0
  927. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +197 -0
  928. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +41 -0
  929. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +67 -0
  930. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +80 -0
  931. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  932. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  933. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +81 -0
  934. vllm/model_executor/layers/rotary_embedding/mrope.py +1140 -0
  935. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +42 -0
  936. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +129 -0
  937. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +68 -0
  938. vllm/model_executor/layers/sampler.py +1198 -0
  939. vllm/model_executor/layers/shared_fused_moe/__init__.py +6 -0
  940. vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py +56 -0
  941. vllm/model_executor/layers/utils.py +196 -0
  942. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  943. vllm/model_executor/model_loader/__init__.py +138 -0
  944. vllm/model_executor/model_loader/base_loader.py +52 -0
  945. vllm/model_executor/model_loader/bitsandbytes_loader.py +787 -0
  946. vllm/model_executor/model_loader/default_loader.py +278 -0
  947. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  948. vllm/model_executor/model_loader/gguf_loader.py +155 -0
  949. vllm/model_executor/model_loader/runai_streamer_loader.py +104 -0
  950. vllm/model_executor/model_loader/sharded_state_loader.py +199 -0
  951. vllm/model_executor/model_loader/tensorizer.py +743 -0
  952. vllm/model_executor/model_loader/tensorizer_loader.py +143 -0
  953. vllm/model_executor/model_loader/tpu.py +114 -0
  954. vllm/model_executor/model_loader/utils.py +271 -0
  955. vllm/model_executor/model_loader/weight_utils.py +946 -0
  956. vllm/model_executor/models/__init__.py +30 -0
  957. vllm/model_executor/models/adapters.py +542 -0
  958. vllm/model_executor/models/aimv2.py +246 -0
  959. vllm/model_executor/models/apertus.py +582 -0
  960. vllm/model_executor/models/arcee.py +423 -0
  961. vllm/model_executor/models/arctic.py +560 -0
  962. vllm/model_executor/models/aria.py +662 -0
  963. vllm/model_executor/models/aya_vision.py +470 -0
  964. vllm/model_executor/models/baichuan.py +475 -0
  965. vllm/model_executor/models/bailing_moe.py +529 -0
  966. vllm/model_executor/models/bamba.py +582 -0
  967. vllm/model_executor/models/bart.py +1343 -0
  968. vllm/model_executor/models/bert.py +613 -0
  969. vllm/model_executor/models/bert_with_rope.py +687 -0
  970. vllm/model_executor/models/blip.py +339 -0
  971. vllm/model_executor/models/blip2.py +716 -0
  972. vllm/model_executor/models/bloom.py +374 -0
  973. vllm/model_executor/models/chameleon.py +1141 -0
  974. vllm/model_executor/models/chatglm.py +479 -0
  975. vllm/model_executor/models/clip.py +407 -0
  976. vllm/model_executor/models/cohere2_vision.py +484 -0
  977. vllm/model_executor/models/commandr.py +467 -0
  978. vllm/model_executor/models/config.py +434 -0
  979. vllm/model_executor/models/constant_size_cache.py +137 -0
  980. vllm/model_executor/models/dbrx.py +473 -0
  981. vllm/model_executor/models/deepseek.py +491 -0
  982. vllm/model_executor/models/deepseek_eagle.py +241 -0
  983. vllm/model_executor/models/deepseek_mtp.py +282 -0
  984. vllm/model_executor/models/deepseek_v2.py +1058 -0
  985. vllm/model_executor/models/deepseek_vl2.py +661 -0
  986. vllm/model_executor/models/donut.py +387 -0
  987. vllm/model_executor/models/dots1.py +547 -0
  988. vllm/model_executor/models/ernie45.py +43 -0
  989. vllm/model_executor/models/ernie45_moe.py +608 -0
  990. vllm/model_executor/models/ernie45_vl.py +1510 -0
  991. vllm/model_executor/models/ernie45_vl_moe.py +728 -0
  992. vllm/model_executor/models/ernie_mtp.py +287 -0
  993. vllm/model_executor/models/exaone.py +552 -0
  994. vllm/model_executor/models/exaone4.py +535 -0
  995. vllm/model_executor/models/fairseq2_llama.py +154 -0
  996. vllm/model_executor/models/falcon.py +511 -0
  997. vllm/model_executor/models/falcon_h1.py +739 -0
  998. vllm/model_executor/models/florence2.py +1107 -0
  999. vllm/model_executor/models/fuyu.py +401 -0
  1000. vllm/model_executor/models/gemma.py +428 -0
  1001. vllm/model_executor/models/gemma2.py +425 -0
  1002. vllm/model_executor/models/gemma3.py +542 -0
  1003. vllm/model_executor/models/gemma3_mm.py +723 -0
  1004. vllm/model_executor/models/gemma3n.py +830 -0
  1005. vllm/model_executor/models/gemma3n_mm.py +767 -0
  1006. vllm/model_executor/models/glm.py +23 -0
  1007. vllm/model_executor/models/glm4.py +305 -0
  1008. vllm/model_executor/models/glm4_1v.py +1669 -0
  1009. vllm/model_executor/models/glm4_moe.py +703 -0
  1010. vllm/model_executor/models/glm4_moe_mtp.py +306 -0
  1011. vllm/model_executor/models/glm4v.py +654 -0
  1012. vllm/model_executor/models/gpt2.py +383 -0
  1013. vllm/model_executor/models/gpt_bigcode.py +346 -0
  1014. vllm/model_executor/models/gpt_j.py +340 -0
  1015. vllm/model_executor/models/gpt_neox.py +333 -0
  1016. vllm/model_executor/models/gpt_oss.py +687 -0
  1017. vllm/model_executor/models/granite.py +498 -0
  1018. vllm/model_executor/models/granite_speech.py +799 -0
  1019. vllm/model_executor/models/granitemoe.py +541 -0
  1020. vllm/model_executor/models/granitemoehybrid.py +684 -0
  1021. vllm/model_executor/models/granitemoeshared.py +342 -0
  1022. vllm/model_executor/models/gritlm.py +262 -0
  1023. vllm/model_executor/models/grok1.py +550 -0
  1024. vllm/model_executor/models/h2ovl.py +536 -0
  1025. vllm/model_executor/models/hunyuan_v1.py +937 -0
  1026. vllm/model_executor/models/hyperclovax_vision.py +1206 -0
  1027. vllm/model_executor/models/idefics2_vision_model.py +416 -0
  1028. vllm/model_executor/models/idefics3.py +758 -0
  1029. vllm/model_executor/models/interfaces.py +854 -0
  1030. vllm/model_executor/models/interfaces_base.py +195 -0
  1031. vllm/model_executor/models/intern_vit.py +481 -0
  1032. vllm/model_executor/models/internlm2.py +453 -0
  1033. vllm/model_executor/models/internlm2_ve.py +148 -0
  1034. vllm/model_executor/models/interns1.py +832 -0
  1035. vllm/model_executor/models/interns1_vit.py +418 -0
  1036. vllm/model_executor/models/internvl.py +1423 -0
  1037. vllm/model_executor/models/jais.py +374 -0
  1038. vllm/model_executor/models/jamba.py +630 -0
  1039. vllm/model_executor/models/jina_vl.py +144 -0
  1040. vllm/model_executor/models/keye.py +1684 -0
  1041. vllm/model_executor/models/keye_vl1_5.py +601 -0
  1042. vllm/model_executor/models/kimi_vl.py +620 -0
  1043. vllm/model_executor/models/lfm2.py +558 -0
  1044. vllm/model_executor/models/llama.py +671 -0
  1045. vllm/model_executor/models/llama4.py +732 -0
  1046. vllm/model_executor/models/llama4_eagle.py +241 -0
  1047. vllm/model_executor/models/llama_eagle.py +171 -0
  1048. vllm/model_executor/models/llama_eagle3.py +292 -0
  1049. vllm/model_executor/models/llava.py +872 -0
  1050. vllm/model_executor/models/llava_next.py +572 -0
  1051. vllm/model_executor/models/llava_next_video.py +479 -0
  1052. vllm/model_executor/models/llava_onevision.py +945 -0
  1053. vllm/model_executor/models/mamba.py +310 -0
  1054. vllm/model_executor/models/mamba2.py +346 -0
  1055. vllm/model_executor/models/mamba_cache.py +83 -0
  1056. vllm/model_executor/models/medusa.py +219 -0
  1057. vllm/model_executor/models/midashenglm.py +788 -0
  1058. vllm/model_executor/models/mimo.py +191 -0
  1059. vllm/model_executor/models/mimo_mtp.py +273 -0
  1060. vllm/model_executor/models/minicpm.py +593 -0
  1061. vllm/model_executor/models/minicpm3.py +230 -0
  1062. vllm/model_executor/models/minicpm_eagle.py +391 -0
  1063. vllm/model_executor/models/minicpmo.py +804 -0
  1064. vllm/model_executor/models/minicpmv.py +1786 -0
  1065. vllm/model_executor/models/minimax_cache.py +36 -0
  1066. vllm/model_executor/models/minimax_text_01.py +1027 -0
  1067. vllm/model_executor/models/minimax_vl_01.py +431 -0
  1068. vllm/model_executor/models/mistral3.py +628 -0
  1069. vllm/model_executor/models/mixtral.py +494 -0
  1070. vllm/model_executor/models/mllama.py +1697 -0
  1071. vllm/model_executor/models/mllama4.py +1079 -0
  1072. vllm/model_executor/models/mlp_speculator.py +206 -0
  1073. vllm/model_executor/models/modernbert.py +374 -0
  1074. vllm/model_executor/models/module_mapping.py +72 -0
  1075. vllm/model_executor/models/molmo.py +1569 -0
  1076. vllm/model_executor/models/moonvit.py +663 -0
  1077. vllm/model_executor/models/motif.py +345 -0
  1078. vllm/model_executor/models/mpt.py +332 -0
  1079. vllm/model_executor/models/nano_nemotron_vl.py +1395 -0
  1080. vllm/model_executor/models/nemotron.py +509 -0
  1081. vllm/model_executor/models/nemotron_h.py +633 -0
  1082. vllm/model_executor/models/nemotron_nas.py +484 -0
  1083. vllm/model_executor/models/nemotron_vl.py +655 -0
  1084. vllm/model_executor/models/nvlm_d.py +203 -0
  1085. vllm/model_executor/models/olmo.py +406 -0
  1086. vllm/model_executor/models/olmo2.py +428 -0
  1087. vllm/model_executor/models/olmoe.py +485 -0
  1088. vllm/model_executor/models/opt.py +413 -0
  1089. vllm/model_executor/models/orion.py +350 -0
  1090. vllm/model_executor/models/ovis.py +572 -0
  1091. vllm/model_executor/models/ovis2_5.py +644 -0
  1092. vllm/model_executor/models/paligemma.py +414 -0
  1093. vllm/model_executor/models/persimmon.py +345 -0
  1094. vllm/model_executor/models/phi.py +357 -0
  1095. vllm/model_executor/models/phi3.py +19 -0
  1096. vllm/model_executor/models/phi3v.py +701 -0
  1097. vllm/model_executor/models/phi4_multimodal.py +1478 -0
  1098. vllm/model_executor/models/phi4flash.py +737 -0
  1099. vllm/model_executor/models/phi4mm.py +1281 -0
  1100. vllm/model_executor/models/phi4mm_audio.py +1254 -0
  1101. vllm/model_executor/models/phi4mm_utils.py +1875 -0
  1102. vllm/model_executor/models/phimoe.py +681 -0
  1103. vllm/model_executor/models/pixtral.py +1348 -0
  1104. vllm/model_executor/models/plamo2.py +1126 -0
  1105. vllm/model_executor/models/qwen.py +363 -0
  1106. vllm/model_executor/models/qwen2.py +526 -0
  1107. vllm/model_executor/models/qwen2_5_omni_thinker.py +985 -0
  1108. vllm/model_executor/models/qwen2_5_vl.py +1256 -0
  1109. vllm/model_executor/models/qwen2_audio.py +492 -0
  1110. vllm/model_executor/models/qwen2_moe.py +558 -0
  1111. vllm/model_executor/models/qwen2_rm.py +122 -0
  1112. vllm/model_executor/models/qwen2_vl.py +1512 -0
  1113. vllm/model_executor/models/qwen3.py +344 -0
  1114. vllm/model_executor/models/qwen3_moe.py +704 -0
  1115. vllm/model_executor/models/qwen3_next.py +1298 -0
  1116. vllm/model_executor/models/qwen3_next_mtp.py +285 -0
  1117. vllm/model_executor/models/qwen_vl.py +795 -0
  1118. vllm/model_executor/models/registry.py +891 -0
  1119. vllm/model_executor/models/roberta.py +252 -0
  1120. vllm/model_executor/models/rvl.py +103 -0
  1121. vllm/model_executor/models/seed_oss.py +488 -0
  1122. vllm/model_executor/models/siglip.py +524 -0
  1123. vllm/model_executor/models/siglip2navit.py +688 -0
  1124. vllm/model_executor/models/skyworkr1v.py +914 -0
  1125. vllm/model_executor/models/smolvlm.py +44 -0
  1126. vllm/model_executor/models/solar.py +506 -0
  1127. vllm/model_executor/models/stablelm.py +344 -0
  1128. vllm/model_executor/models/starcoder2.py +357 -0
  1129. vllm/model_executor/models/step3_text.py +521 -0
  1130. vllm/model_executor/models/step3_vl.py +1091 -0
  1131. vllm/model_executor/models/swin.py +475 -0
  1132. vllm/model_executor/models/tarsier.py +649 -0
  1133. vllm/model_executor/models/telechat2.py +151 -0
  1134. vllm/model_executor/models/teleflm.py +79 -0
  1135. vllm/model_executor/models/terratorch.py +294 -0
  1136. vllm/model_executor/models/transformers.py +883 -0
  1137. vllm/model_executor/models/ultravox.py +667 -0
  1138. vllm/model_executor/models/utils.py +770 -0
  1139. vllm/model_executor/models/vision.py +125 -0
  1140. vllm/model_executor/models/voxtral.py +789 -0
  1141. vllm/model_executor/models/whisper.py +966 -0
  1142. vllm/model_executor/models/zamba2.py +1056 -0
  1143. vllm/model_executor/parameter.py +599 -0
  1144. vllm/model_executor/sampling_metadata.py +597 -0
  1145. vllm/model_executor/utils.py +97 -0
  1146. vllm/model_executor/warmup/__init__.py +0 -0
  1147. vllm/model_executor/warmup/deep_gemm_warmup.py +223 -0
  1148. vllm/model_executor/warmup/kernel_warmup.py +83 -0
  1149. vllm/multimodal/__init__.py +35 -0
  1150. vllm/multimodal/audio.py +116 -0
  1151. vllm/multimodal/base.py +219 -0
  1152. vllm/multimodal/cache.py +507 -0
  1153. vllm/multimodal/hasher.py +110 -0
  1154. vllm/multimodal/image.py +130 -0
  1155. vllm/multimodal/inputs.py +979 -0
  1156. vllm/multimodal/parse.py +496 -0
  1157. vllm/multimodal/processing.py +1921 -0
  1158. vllm/multimodal/profiling.py +313 -0
  1159. vllm/multimodal/registry.py +375 -0
  1160. vllm/multimodal/utils.py +754 -0
  1161. vllm/multimodal/video.py +312 -0
  1162. vllm/outputs.py +517 -0
  1163. vllm/platforms/__init__.py +263 -0
  1164. vllm/platforms/cpu.py +353 -0
  1165. vllm/platforms/cuda.py +731 -0
  1166. vllm/platforms/interface.py +599 -0
  1167. vllm/platforms/rocm.py +504 -0
  1168. vllm/platforms/tpu.py +236 -0
  1169. vllm/platforms/xpu.py +243 -0
  1170. vllm/plugins/__init__.py +72 -0
  1171. vllm/plugins/io_processors/__init__.py +68 -0
  1172. vllm/plugins/io_processors/interface.py +67 -0
  1173. vllm/plugins/lora_resolvers/README.md +16 -0
  1174. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1175. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  1176. vllm/pooling_params.py +183 -0
  1177. vllm/profiler/__init__.py +0 -0
  1178. vllm/profiler/layerwise_profile.py +375 -0
  1179. vllm/profiler/utils.py +148 -0
  1180. vllm/py.typed +2 -0
  1181. vllm/ray/__init__.py +0 -0
  1182. vllm/ray/lazy_utils.py +22 -0
  1183. vllm/ray/ray_env.py +72 -0
  1184. vllm/reasoning/__init__.py +25 -0
  1185. vllm/reasoning/abs_reasoning_parsers.py +202 -0
  1186. vllm/reasoning/deepseek_r1_reasoning_parser.py +173 -0
  1187. vllm/reasoning/glm4_moe_reasoning_parser.py +151 -0
  1188. vllm/reasoning/gptoss_reasoning_parser.py +87 -0
  1189. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1190. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +245 -0
  1191. vllm/reasoning/mistral_reasoning_parser.py +47 -0
  1192. vllm/reasoning/qwen3_reasoning_parser.py +151 -0
  1193. vllm/reasoning/step3_reasoning_parser.py +109 -0
  1194. vllm/sampling_params.py +577 -0
  1195. vllm/scalar_type.py +349 -0
  1196. vllm/scripts.py +15 -0
  1197. vllm/sequence.py +1465 -0
  1198. vllm/tasks.py +11 -0
  1199. vllm/test_utils.py +130 -0
  1200. vllm/third_party/__init__.py +0 -0
  1201. vllm/third_party/pynvml.py +6140 -0
  1202. vllm/tracing.py +136 -0
  1203. vllm/transformers_utils/__init__.py +24 -0
  1204. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1205. vllm/transformers_utils/chat_templates/registry.py +71 -0
  1206. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1207. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1208. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1209. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1210. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1211. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1212. vllm/transformers_utils/config.py +1043 -0
  1213. vllm/transformers_utils/config_parser_base.py +20 -0
  1214. vllm/transformers_utils/configs/__init__.py +55 -0
  1215. vllm/transformers_utils/configs/arctic.py +207 -0
  1216. vllm/transformers_utils/configs/chatglm.py +72 -0
  1217. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1218. vllm/transformers_utils/configs/eagle.py +84 -0
  1219. vllm/transformers_utils/configs/falcon.py +90 -0
  1220. vllm/transformers_utils/configs/jais.py +238 -0
  1221. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1222. vllm/transformers_utils/configs/medusa.py +63 -0
  1223. vllm/transformers_utils/configs/midashenglm.py +101 -0
  1224. vllm/transformers_utils/configs/mistral.py +165 -0
  1225. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1226. vllm/transformers_utils/configs/moonvit.py +33 -0
  1227. vllm/transformers_utils/configs/nemotron.py +205 -0
  1228. vllm/transformers_utils/configs/nemotron_h.py +259 -0
  1229. vllm/transformers_utils/configs/nemotron_vl.py +56 -0
  1230. vllm/transformers_utils/configs/ovis.py +176 -0
  1231. vllm/transformers_utils/configs/qwen3_next.py +275 -0
  1232. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1233. vllm/transformers_utils/configs/speculators/algos.py +32 -0
  1234. vllm/transformers_utils/configs/speculators/base.py +91 -0
  1235. vllm/transformers_utils/configs/step3_vl.py +123 -0
  1236. vllm/transformers_utils/configs/ultravox.py +120 -0
  1237. vllm/transformers_utils/detokenizer.py +169 -0
  1238. vllm/transformers_utils/detokenizer_utils.py +199 -0
  1239. vllm/transformers_utils/dynamic_module.py +60 -0
  1240. vllm/transformers_utils/processor.py +245 -0
  1241. vllm/transformers_utils/processors/__init__.py +16 -0
  1242. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  1243. vllm/transformers_utils/processors/ovis.py +420 -0
  1244. vllm/transformers_utils/processors/ovis2_5.py +458 -0
  1245. vllm/transformers_utils/runai_utils.py +99 -0
  1246. vllm/transformers_utils/s3_utils.py +90 -0
  1247. vllm/transformers_utils/tokenizer.py +293 -0
  1248. vllm/transformers_utils/tokenizer_base.py +149 -0
  1249. vllm/transformers_utils/tokenizer_group.py +132 -0
  1250. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1251. vllm/transformers_utils/tokenizers/mistral.py +520 -0
  1252. vllm/transformers_utils/utils.py +99 -0
  1253. vllm/triton_utils/__init__.py +16 -0
  1254. vllm/triton_utils/importing.py +95 -0
  1255. vllm/usage/__init__.py +0 -0
  1256. vllm/usage/usage_lib.py +259 -0
  1257. vllm/utils/__init__.py +3438 -0
  1258. vllm/utils/deep_gemm.py +212 -0
  1259. vllm/utils/flashinfer.py +372 -0
  1260. vllm/utils/jsontree.py +90 -0
  1261. vllm/utils/tensor_schema.py +236 -0
  1262. vllm/v1/__init__.py +0 -0
  1263. vllm/v1/attention/__init__.py +0 -0
  1264. vllm/v1/attention/backends/__init__.py +0 -0
  1265. vllm/v1/attention/backends/cpu_attn.py +922 -0
  1266. vllm/v1/attention/backends/flash_attn.py +800 -0
  1267. vllm/v1/attention/backends/flashinfer.py +1128 -0
  1268. vllm/v1/attention/backends/flex_attention.py +796 -0
  1269. vllm/v1/attention/backends/gdn_attn.py +320 -0
  1270. vllm/v1/attention/backends/linear_attn.py +68 -0
  1271. vllm/v1/attention/backends/mamba1_attn.py +81 -0
  1272. vllm/v1/attention/backends/mamba2_attn.py +224 -0
  1273. vllm/v1/attention/backends/mamba_attn.py +52 -0
  1274. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1275. vllm/v1/attention/backends/mla/common.py +1608 -0
  1276. vllm/v1/attention/backends/mla/cutlass_mla.py +301 -0
  1277. vllm/v1/attention/backends/mla/flashattn_mla.py +273 -0
  1278. vllm/v1/attention/backends/mla/flashinfer_mla.py +110 -0
  1279. vllm/v1/attention/backends/mla/flashmla.py +213 -0
  1280. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +255 -0
  1281. vllm/v1/attention/backends/mla/triton_mla.py +175 -0
  1282. vllm/v1/attention/backends/pallas.py +413 -0
  1283. vllm/v1/attention/backends/rocm_aiter_fa.py +548 -0
  1284. vllm/v1/attention/backends/short_conv_attn.py +82 -0
  1285. vllm/v1/attention/backends/tree_attn.py +450 -0
  1286. vllm/v1/attention/backends/triton_attn.py +430 -0
  1287. vllm/v1/attention/backends/utils.py +834 -0
  1288. vllm/v1/attention/backends/xformers.py +437 -0
  1289. vllm/v1/core/__init__.py +0 -0
  1290. vllm/v1/core/block_pool.py +330 -0
  1291. vllm/v1/core/encoder_cache_manager.py +333 -0
  1292. vllm/v1/core/kv_cache_coordinator.py +440 -0
  1293. vllm/v1/core/kv_cache_manager.py +398 -0
  1294. vllm/v1/core/kv_cache_utils.py +1169 -0
  1295. vllm/v1/core/sched/__init__.py +0 -0
  1296. vllm/v1/core/sched/async_scheduler.py +47 -0
  1297. vllm/v1/core/sched/interface.py +158 -0
  1298. vllm/v1/core/sched/output.py +162 -0
  1299. vllm/v1/core/sched/request_queue.py +224 -0
  1300. vllm/v1/core/sched/scheduler.py +1287 -0
  1301. vllm/v1/core/sched/utils.py +69 -0
  1302. vllm/v1/core/single_type_kv_cache_manager.py +670 -0
  1303. vllm/v1/cudagraph_dispatcher.py +121 -0
  1304. vllm/v1/engine/__init__.py +202 -0
  1305. vllm/v1/engine/async_llm.py +757 -0
  1306. vllm/v1/engine/coordinator.py +357 -0
  1307. vllm/v1/engine/core.py +1245 -0
  1308. vllm/v1/engine/core_client.py +1333 -0
  1309. vllm/v1/engine/detokenizer.py +300 -0
  1310. vllm/v1/engine/exceptions.py +17 -0
  1311. vllm/v1/engine/llm_engine.py +332 -0
  1312. vllm/v1/engine/logprobs.py +201 -0
  1313. vllm/v1/engine/output_processor.py +558 -0
  1314. vllm/v1/engine/parallel_sampling.py +133 -0
  1315. vllm/v1/engine/processor.py +524 -0
  1316. vllm/v1/engine/utils.py +857 -0
  1317. vllm/v1/executor/__init__.py +0 -0
  1318. vllm/v1/executor/abstract.py +126 -0
  1319. vllm/v1/executor/multiproc_executor.py +683 -0
  1320. vllm/v1/executor/ray_distributed_executor.py +109 -0
  1321. vllm/v1/kv_cache_interface.py +275 -0
  1322. vllm/v1/metrics/__init__.py +0 -0
  1323. vllm/v1/metrics/loggers.py +717 -0
  1324. vllm/v1/metrics/prometheus.py +82 -0
  1325. vllm/v1/metrics/ray_wrappers.py +133 -0
  1326. vllm/v1/metrics/reader.py +246 -0
  1327. vllm/v1/metrics/stats.py +248 -0
  1328. vllm/v1/outputs.py +147 -0
  1329. vllm/v1/pool/__init__.py +0 -0
  1330. vllm/v1/pool/metadata.py +77 -0
  1331. vllm/v1/request.py +237 -0
  1332. vllm/v1/sample/__init__.py +0 -0
  1333. vllm/v1/sample/logits_processor/__init__.py +294 -0
  1334. vllm/v1/sample/logits_processor/builtin.py +273 -0
  1335. vllm/v1/sample/logits_processor/interface.py +97 -0
  1336. vllm/v1/sample/logits_processor/state.py +161 -0
  1337. vllm/v1/sample/metadata.py +43 -0
  1338. vllm/v1/sample/ops/__init__.py +0 -0
  1339. vllm/v1/sample/ops/bad_words.py +39 -0
  1340. vllm/v1/sample/ops/logprobs.py +26 -0
  1341. vllm/v1/sample/ops/penalties.py +43 -0
  1342. vllm/v1/sample/ops/topk_topp_sampler.py +254 -0
  1343. vllm/v1/sample/rejection_sampler.py +623 -0
  1344. vllm/v1/sample/sampler.py +281 -0
  1345. vllm/v1/sample/tpu/__init__.py +0 -0
  1346. vllm/v1/sample/tpu/metadata.py +124 -0
  1347. vllm/v1/sample/tpu/sampler.py +213 -0
  1348. vllm/v1/serial_utils.py +395 -0
  1349. vllm/v1/spec_decode/__init__.py +0 -0
  1350. vllm/v1/spec_decode/eagle.py +740 -0
  1351. vllm/v1/spec_decode/medusa.py +66 -0
  1352. vllm/v1/spec_decode/metadata.py +62 -0
  1353. vllm/v1/spec_decode/metrics.py +191 -0
  1354. vllm/v1/spec_decode/ngram_proposer.py +157 -0
  1355. vllm/v1/spec_decode/utils.py +14 -0
  1356. vllm/v1/structured_output/__init__.py +297 -0
  1357. vllm/v1/structured_output/backend_guidance.py +245 -0
  1358. vllm/v1/structured_output/backend_lm_format_enforcer.py +167 -0
  1359. vllm/v1/structured_output/backend_outlines.py +320 -0
  1360. vllm/v1/structured_output/backend_types.py +134 -0
  1361. vllm/v1/structured_output/backend_xgrammar.py +323 -0
  1362. vllm/v1/structured_output/request.py +86 -0
  1363. vllm/v1/structured_output/utils.py +373 -0
  1364. vllm/v1/utils.py +382 -0
  1365. vllm/v1/worker/__init__.py +0 -0
  1366. vllm/v1/worker/block_table.py +221 -0
  1367. vllm/v1/worker/cpu_model_runner.py +163 -0
  1368. vllm/v1/worker/cpu_worker.py +183 -0
  1369. vllm/v1/worker/gpu_input_batch.py +821 -0
  1370. vllm/v1/worker/gpu_model_runner.py +3743 -0
  1371. vllm/v1/worker/gpu_worker.py +697 -0
  1372. vllm/v1/worker/kv_connector_model_runner_mixin.py +122 -0
  1373. vllm/v1/worker/lora_model_runner_mixin.py +192 -0
  1374. vllm/v1/worker/tpu_input_batch.py +585 -0
  1375. vllm/v1/worker/tpu_model_runner.py +1947 -0
  1376. vllm/v1/worker/tpu_worker.py +340 -0
  1377. vllm/v1/worker/utils.py +290 -0
  1378. vllm/v1/worker/worker_base.py +65 -0
  1379. vllm/v1/worker/xpu_model_runner.py +53 -0
  1380. vllm/v1/worker/xpu_worker.py +179 -0
  1381. vllm/version.py +41 -0
  1382. vllm/vllm_flash_attn/.gitkeep +0 -0
  1383. vllm/worker/__init__.py +0 -0
  1384. vllm/worker/cache_engine.py +145 -0
  1385. vllm/worker/enc_dec_model_runner.py +553 -0
  1386. vllm/worker/model_runner.py +2016 -0
  1387. vllm/worker/model_runner_base.py +307 -0
  1388. vllm/worker/utils.py +49 -0
  1389. vllm/worker/worker.py +670 -0
  1390. vllm/worker/worker_base.py +651 -0
  1391. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/METADATA +326 -0
  1392. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/RECORD +1395 -0
  1393. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/WHEEL +5 -0
  1394. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/entry_points.txt +5 -0
  1395. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2590 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+ # Adapted from
+ # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
+ import json
+ import time
+ from http import HTTPStatus
+ from typing import (Annotated, Any, ClassVar, Generic, Literal, Optional,
+                     TypeVar, Union)
+
+ import regex as re
+ import torch
+ from fastapi import HTTPException, UploadFile
+ # yapf: disable
+ from openai.types.chat.chat_completion_audio import (
+     ChatCompletionAudio as OpenAIChatCompletionAudio)
+ from openai.types.chat.chat_completion_message import (
+     Annotation as OpenAIAnnotation)
+ # yapf: enable
+ from openai.types.responses import (ResponseFunctionToolCall,
+                                     ResponseInputItemParam, ResponseOutputItem,
+                                     ResponsePrompt, ResponseReasoningItem,
+                                     ResponseStatus)
+
+ # Backward compatibility for OpenAI client versions
+ try:  # For older openai versions (< 1.100.0)
+     from openai.types.responses import ResponseTextConfig
+ except ImportError:  # For newer openai versions (>= 1.100.0)
+     from openai.types.responses import (ResponseFormatTextConfig as
+                                         ResponseTextConfig)
+
+ from openai.types.responses.response import ToolChoice
+ from openai.types.responses.tool import Tool
+ from openai.types.shared import Metadata, Reasoning
+ from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
+                       ValidationInfo, field_validator, model_validator)
+ from typing_extensions import TypeAlias
+
+ from vllm import envs
+ from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
+                                          make_tool_call_id)
+ from vllm.entrypoints.score_utils import (ScoreContentPartParam,
+                                           ScoreMultiModalParam)
+ from vllm.logger import init_logger
+ from vllm.logprobs import Logprob
+ from vllm.pooling_params import PoolingParams
+ from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
+                                   RequestOutputKind, SamplingParams)
+ from vllm.utils import random_uuid, resolve_obj_by_qualname
+
+ logger = init_logger(__name__)
+
+ _LONG_INFO = torch.iinfo(torch.long)
+
+
+ class OpenAIBaseModel(BaseModel):
+     # OpenAI API does allow extra fields
+     model_config = ConfigDict(extra="allow")
+
+     # Cache class field names
+     field_names: ClassVar[Optional[set[str]]] = None
+
+     @model_validator(mode="wrap")
+     @classmethod
+     def __log_extra_fields__(cls, data, handler):
+         result = handler(data)
+         if not isinstance(data, dict):
+             return result
+         field_names = cls.field_names
+         if field_names is None:
+             # Get all class field names and their potential aliases
+             field_names = set()
+             for field_name, field in cls.model_fields.items():
+                 field_names.add(field_name)
+                 if alias := getattr(field, "alias", None):
+                     field_names.add(alias)
+             cls.field_names = field_names
+
+         # Compare against both field names and aliases
+         if any(k not in field_names for k in data):
+             logger.warning(
+                 "The following fields were present in the request "
+                 "but ignored: %s",
+                 data.keys() - field_names,
+             )
+         return result
+
+
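Note on the base model above: `extra="allow"` means unknown request fields are accepted and silently ignored, and the wrap validator only logs a warning for them. A minimal standalone sketch of the same pattern, using plain pydantic v2 (the class and field names here are illustrative, not part of vLLM):

    import logging
    from pydantic import BaseModel, ConfigDict, model_validator

    logging.basicConfig(level=logging.WARNING)
    logger = logging.getLogger("protocol-demo")

    class LenientModel(BaseModel):
        # Accept unknown fields instead of raising a validation error
        model_config = ConfigDict(extra="allow")

        @model_validator(mode="wrap")
        @classmethod
        def _log_extra_fields(cls, data, handler):
            result = handler(data)
            if isinstance(data, dict):
                extra = data.keys() - set(cls.model_fields)
                if extra:
                    logger.warning("Ignored unknown fields: %s", extra)
            return result

    class Demo(LenientModel):
        name: str

    Demo.model_validate({"name": "x", "tempersture": 0.7})  # warns, still parses
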
+ class ErrorInfo(OpenAIBaseModel):
+     message: str
+     type: str
+     param: Optional[str] = None
+     code: int
+
+
+ class ErrorResponse(OpenAIBaseModel):
+     error: ErrorInfo
+
+
+ class ModelPermission(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
+     object: str = "model_permission"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     allow_create_engine: bool = False
+     allow_sampling: bool = True
+     allow_logprobs: bool = True
+     allow_search_indices: bool = False
+     allow_view: bool = True
+     allow_fine_tuning: bool = False
+     organization: str = "*"
+     group: Optional[str] = None
+     is_blocking: bool = False
+
+
+ class ModelCard(OpenAIBaseModel):
+     id: str
+     object: str = "model"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     owned_by: str = "vllm"
+     root: Optional[str] = None
+     parent: Optional[str] = None
+     max_model_len: Optional[int] = None
+     permission: list[ModelPermission] = Field(default_factory=list)
+
+
+ class ModelList(OpenAIBaseModel):
+     object: str = "list"
+     data: list[ModelCard] = Field(default_factory=list)
+
+
+ class PromptTokenUsageInfo(OpenAIBaseModel):
+     cached_tokens: Optional[int] = None
+
+
+ class UsageInfo(OpenAIBaseModel):
+     prompt_tokens: int = 0
+     total_tokens: int = 0
+     completion_tokens: Optional[int] = 0
+     prompt_tokens_details: Optional[PromptTokenUsageInfo] = None
+
+
+ class RequestResponseMetadata(BaseModel):
+     request_id: str
+     final_usage_info: Optional[UsageInfo] = None
+
+
+ class JsonSchemaResponseFormat(OpenAIBaseModel):
+     name: str
+     description: Optional[str] = None
+     # "schema" is the field name in the OpenAI API, but it conflicts with
+     # pydantic, so use json_schema with an alias instead
+     json_schema: Optional[dict[str, Any]] = Field(default=None, alias='schema')
+     strict: Optional[bool] = None
+
+
+ class StructuralTag(OpenAIBaseModel):
+     begin: str
+     # "schema" is the field name, but it conflicts with pydantic, so use
+     # structural_tag_schema with an alias instead
+     structural_tag_schema: Optional[dict[str, Any]] = Field(default=None,
+                                                             alias="schema")
+     end: str
+
+
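Both models above rename the wire-level `schema` field because it collides with pydantic's own namespace; the alias keeps the JSON contract unchanged. A self-contained sketch of the alias round-trip (plain pydantic; the model name is made up):

    from typing import Any, Optional
    from pydantic import BaseModel, Field

    class AliasDemo(BaseModel):
        json_schema: Optional[dict[str, Any]] = Field(default=None,
                                                      alias="schema")

    m = AliasDemo.model_validate({"schema": {"type": "object"}})
    print(m.json_schema)                # {'type': 'object'}
    print(m.model_dump(by_alias=True))  # {'schema': {'type': 'object'}}
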
+ class StructuralTagResponseFormat(OpenAIBaseModel):
+     type: Literal["structural_tag"]
+     structures: list[StructuralTag]
+     triggers: list[str]
+
+
+ class ResponseFormat(OpenAIBaseModel):
+     # type must be "json_schema", "json_object", or "text"
+     type: Literal["text", "json_object", "json_schema"]
+     json_schema: Optional[JsonSchemaResponseFormat] = None
+
+
+ AnyResponseFormat = Union[ResponseFormat, StructuralTagResponseFormat]
+
+
+ class StreamOptions(OpenAIBaseModel):
+     include_usage: Optional[bool] = True
+     continuous_usage_stats: Optional[bool] = False
+
+
+ class FunctionDefinition(OpenAIBaseModel):
+     name: str
+     description: Optional[str] = None
+     parameters: Optional[dict[str, Any]] = None
+
+
+ class ChatCompletionToolsParam(OpenAIBaseModel):
+     type: Literal["function"] = "function"
+     function: FunctionDefinition
+
+
+ class ChatCompletionNamedFunction(OpenAIBaseModel):
+     name: str
+
+
+ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
+     function: ChatCompletionNamedFunction
+     type: Literal["function"] = "function"
+
+
+ # extra="forbid" is a workaround to have kwargs as a field,
+ # see https://github.com/pydantic/pydantic/issues/3125
+ class LogitsProcessorConstructor(BaseModel):
+     qualname: str
+     args: Optional[list[Any]] = None
+     kwargs: Optional[dict[str, Any]] = None
+
+     model_config = ConfigDict(extra="forbid")
+
+
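The `extra="forbid"` config is what lets this model declare a field literally named `kwargs`: with extra fields forbidden, pydantic treats `kwargs` as an ordinary field instead of conflating it with keyword arguments (see the linked pydantic issue). A minimal illustration (the class name is invented for the demo):

    from typing import Any, Optional
    from pydantic import BaseModel, ConfigDict

    class CtorSpec(BaseModel):
        model_config = ConfigDict(extra="forbid")
        qualname: str
        args: Optional[list[Any]] = None
        kwargs: Optional[dict[str, Any]] = None

    spec = CtorSpec.model_validate({
        "qualname": "my_module.MyLogitsProcessor",  # illustrative qualname
        "args": [1, 2],
        "kwargs": {"param": "value"},
    })
    print(spec.kwargs)  # {'param': 'value'}
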
+ LogitsProcessors = list[Union[str, LogitsProcessorConstructor]]
+
+
+ def get_logits_processors(processors: Optional[LogitsProcessors],
+                           pattern: Optional[str]) -> Optional[list[Any]]:
+     if processors and pattern:
+         logits_processors = []
+         for processor in processors:
+             qualname = processor if isinstance(processor,
+                                                str) else processor.qualname
+             if not re.match(pattern, qualname):
+                 raise ValueError(
+                     f"Logits processor '{qualname}' is not allowed by this "
+                     "server. See --logits-processor-pattern engine argument "
+                     "for more information.")
+             try:
+                 logits_processor = resolve_obj_by_qualname(qualname)
+             except Exception as e:
+                 raise ValueError(
+                     f"Logits processor '{qualname}' could not be resolved: {e}"
+                 ) from e
+             if isinstance(processor, LogitsProcessorConstructor):
+                 logits_processor = logits_processor(*processor.args or [],
+                                                     **processor.kwargs or {})
+             logits_processors.append(logits_processor)
+         return logits_processors
+     elif processors:
+         raise ValueError(
+             "The `logits_processors` argument is not supported by this "
+             "server. See --logits-processor-pattern engine argument "
+             "for more information.")
+     return None
+
+
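`get_logits_processors` gates request-supplied processors behind a server-side regex before resolving them by qualified name, so callers can only load classes the operator has allow-listed. A standalone sketch of the same gate-then-resolve flow, using importlib in place of vLLM's `resolve_obj_by_qualname` (the pattern and qualname are illustrative):

    import importlib
    import re

    def resolve_by_qualname(qualname: str):
        # "pkg.mod.Attr" -> import pkg.mod, then fetch Attr
        module_name, _, attr = qualname.rpartition(".")
        return getattr(importlib.import_module(module_name), attr)

    ALLOWED_PATTERN = r"json\.(?:encoder|decoder)\."  # server-configured

    qualname = "json.encoder.JSONEncoder"
    if not re.match(ALLOWED_PATTERN, qualname):
        raise ValueError(f"{qualname!r} is not allowed by this server")
    print(resolve_by_qualname(qualname))  # <class 'json.encoder.JSONEncoder'>
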
+ ResponseInputOutputItem: TypeAlias = Union[ResponseInputItemParam,
+                                            ResponseReasoningItem,
+                                            ResponseFunctionToolCall]
+
+
+ class ResponsesRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/responses/create
+     background: Optional[bool] = False
+     include: Optional[list[
+         Literal[
+             "code_interpreter_call.outputs",
+             "computer_call_output.output.image_url",
+             "file_search_call.results",
+             "message.input_image.image_url",
+             "message.output_text.logprobs",
+             "reasoning.encrypted_content",
+         ],
+     ]] = None
+     input: Union[str, list[ResponseInputOutputItem]]
+     instructions: Optional[str] = None
+     max_output_tokens: Optional[int] = None
+     max_tool_calls: Optional[int] = None
+     metadata: Optional[Metadata] = None
+     model: Optional[str] = None
+     parallel_tool_calls: Optional[bool] = True
+     previous_response_id: Optional[str] = None
+     prompt: Optional[ResponsePrompt] = None
+     reasoning: Optional[Reasoning] = None
+     service_tier: Literal["auto", "default", "flex", "scale",
+                           "priority"] = "auto"
+     store: Optional[bool] = True
+     stream: Optional[bool] = False
+     temperature: Optional[float] = None
+     text: Optional[ResponseTextConfig] = None
+     tool_choice: ToolChoice = "auto"
+     tools: list[Tool] = Field(default_factory=list)
+     top_logprobs: Optional[int] = 0
+     top_p: Optional[float] = None
+     truncation: Optional[Literal["auto", "disabled"]] = "disabled"
+     user: Optional[str] = None
+
+     # --8<-- [start:responses-extra-params]
+     request_id: str = Field(
+         default_factory=lambda: f"resp_{random_uuid()}",
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the response."),
+     )
+     mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+     cache_salt: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the prefix cache will be salted with the provided "
+             "string to prevent an attacker from guessing prompts in "
+             "multi-user environments. The salt should be random, protected "
+             "from access by 3rd parties, and long enough to be unpredictable "
+             "(e.g., 43 characters base64-encoded, corresponding to 256 "
+             "bits). Not supported by vLLM engine V0."))
+     # --8<-- [end:responses-extra-params]
+
+     _DEFAULT_SAMPLING_PARAMS = {
+         "temperature": 1.0,
+         "top_p": 1.0,
+     }
+
+     def to_sampling_params(
+         self,
+         default_max_tokens: int,
+         default_sampling_params: Optional[dict] = None,
+     ) -> SamplingParams:
+         if self.max_output_tokens is None:
+             max_tokens = default_max_tokens
+         else:
+             max_tokens = min(self.max_output_tokens, default_max_tokens)
+
+         default_sampling_params = default_sampling_params or {}
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+         if (top_p := self.top_p) is None:
+             top_p = default_sampling_params.get(
+                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+         stop_token_ids = default_sampling_params.get("stop_token_ids")
+
+         # Structured output
+         guided_decoding = None
+         if self.text is not None and self.text.format is not None:
+             response_format = self.text.format
+             if response_format.type == "json_schema":
+                 guided_decoding = GuidedDecodingParams.from_optional(
+                     json=response_format.schema_)
+             elif response_format.type == "json_object":
+                 raise NotImplementedError("json_object is not supported")
+
+         # TODO: add more parameters
+         return SamplingParams.from_optional(
+             temperature=temperature,
+             top_p=top_p,
+             max_tokens=max_tokens,
+             logprobs=self.top_logprobs
+             if self.is_include_output_logprobs() else None,
+             stop_token_ids=stop_token_ids,
+             output_kind=(RequestOutputKind.DELTA
+                          if self.stream else RequestOutputKind.FINAL_ONLY),
+             guided_decoding=guided_decoding,
+         )
+
+     def is_include_output_logprobs(self) -> bool:
+         """Check if the request includes output logprobs."""
+         if self.include is None:
+             return False
+         return isinstance(
+             self.include,
+             list) and "message.output_text.logprobs" in self.include
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_background(cls, data):
+         if not data.get("background"):
+             return data
+         if not data.get("store", True):
+             raise ValueError(
+                 "background can only be used when `store` is true")
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_prompt(cls, data):
+         if data.get("prompt") is not None:
+             raise ValueError("prompt template is not supported")
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_cache_salt_support(cls, data):
+         if data.get("cache_salt") is not None:
+             if not envs.VLLM_USE_V1:
+                 raise ValueError(
+                     "Parameter 'cache_salt' is not supported with "
+                     "this instance of vLLM, which uses engine V0.")
+             if not isinstance(data["cache_salt"],
+                               str) or not data["cache_salt"]:
+                 raise ValueError("Parameter 'cache_salt' must be a "
+                                  "non-empty string if provided.")
+         return data
+
+
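`to_sampling_params` resolves each sampling knob with a three-level precedence: the request value wins, then the server's per-model defaults, then the hard-coded `_DEFAULT_SAMPLING_PARAMS` fallback. A standalone sketch of that chain (function and dict names are illustrative):

    from typing import Optional

    _FALLBACKS = {"temperature": 1.0, "top_p": 1.0}

    def resolve(name: str, request_value: Optional[float],
                server_defaults: dict) -> float:
        # request > server default > hard-coded fallback
        if request_value is not None:
            return request_value
        return server_defaults.get(name, _FALLBACKS[name])

    print(resolve("temperature", None, {"temperature": 0.6}))  # 0.6
    print(resolve("temperature", 0.2, {"temperature": 0.6}))   # 0.2
    print(resolve("top_p", None, {}))                          # 1.0
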
+ class ChatCompletionRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/chat/create
+     messages: list[ChatCompletionMessageParam]
+     model: Optional[str] = None
+     frequency_penalty: Optional[float] = 0.0
+     logit_bias: Optional[dict[str, float]] = None
+     logprobs: Optional[bool] = False
+     top_logprobs: Optional[int] = 0
+     max_tokens: Optional[int] = Field(
+         default=None,
+         deprecated=
+         'max_tokens is deprecated in favor of the max_completion_tokens field')
+     max_completion_tokens: Optional[int] = None
+     n: Optional[int] = 1
+     presence_penalty: Optional[float] = 0.0
+     response_format: Optional[AnyResponseFormat] = None
+     seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+     stop: Optional[Union[str, list[str]]] = []
+     stream: Optional[bool] = False
+     stream_options: Optional[StreamOptions] = None
+     temperature: Optional[float] = None
+     top_p: Optional[float] = None
+     tools: Optional[list[ChatCompletionToolsParam]] = None
+     tool_choice: Optional[Union[
+         Literal["none"],
+         Literal["auto"],
+         Literal["required"],
+         ChatCompletionNamedToolChoiceParam,
+     ]] = "none"
+     reasoning_effort: Optional[Literal["low", "medium", "high"]] = None
+     include_reasoning: bool = True
+
+     # NOTE this will be ignored by vLLM -- the model determines the behavior
+     parallel_tool_calls: Optional[bool] = False
+     user: Optional[str] = None
+
+     # --8<-- [start:chat-completion-sampling-params]
+     best_of: Optional[int] = None
+     use_beam_search: bool = False
+     top_k: Optional[int] = None
+     min_p: Optional[float] = None
+     repetition_penalty: Optional[float] = None
+     length_penalty: float = 1.0
+     stop_token_ids: Optional[list[int]] = []
+     include_stop_str_in_output: bool = False
+     ignore_eos: bool = False
+     min_tokens: int = 0
+     skip_special_tokens: bool = True
+     spaces_between_special_tokens: bool = True
+     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+     prompt_logprobs: Optional[int] = None
+     allowed_token_ids: Optional[list[int]] = None
+     bad_words: list[str] = Field(default_factory=list)
+     # --8<-- [end:chat-completion-sampling-params]
+
+     # --8<-- [start:chat-completion-extra-params]
+     echo: bool = Field(
+         default=False,
+         description=(
+             "If true, the new message will be prepended with the last message "
+             "if they belong to the same role."),
+     )
+     add_generation_prompt: bool = Field(
+         default=True,
+         description=
+         ("If true, the generation prompt will be added to the chat template. "
+          "This is a parameter used by the chat template in the tokenizer "
+          "config of the model."),
+     )
+     continue_final_message: bool = Field(
+         default=False,
+         description=
+         ("If this is set, the chat will be formatted so that the final "
+          "message in the chat is open-ended, without any EOS tokens. The "
+          "model will continue this message rather than starting a new one. "
+          "This allows you to \"prefill\" part of the model's response for it. "
+          "Cannot be used at the same time as `add_generation_prompt`."),
+     )
+     add_special_tokens: bool = Field(
+         default=False,
+         description=(
+             "If true, special tokens (e.g. BOS) will be added to the prompt "
+             "on top of what is added by the chat template. "
+             "For most models, the chat template takes care of adding the "
+             "special tokens so this should be set to false (as is the "
+             "default)."),
+     )
+     documents: Optional[list[dict[str, str]]] = Field(
+         default=None,
+         description=
+         ("A list of dicts representing documents that will be accessible to "
+          "the model if it is performing RAG (retrieval-augmented generation)."
+          " If the template does not support RAG, this argument will have no "
+          "effect. We recommend that each document should be a dict containing "
+          "\"title\" and \"text\" keys."),
+     )
+     chat_template: Optional[str] = Field(
+         default=None,
+         description=(
+             "A Jinja template to use for this conversion. "
+             "As of transformers v4.44, default chat template is no longer "
+             "allowed, so you must provide a chat template if the tokenizer "
+             "does not define one."),
+     )
+     chat_template_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=(
+             "Additional keyword args to pass to the template renderer. "
+             "Will be accessible by the chat template."),
+     )
+     mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     guided_json: Optional[Union[str, dict, BaseModel]] = Field(
+         default=None,
+         description=("If specified, the output will follow the JSON schema."),
+     )
+     guided_regex: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the output will follow the regex pattern."),
+     )
+     guided_choice: Optional[list[str]] = Field(
+         default=None,
+         description=(
+             "If specified, the output will be exactly one of the choices."),
+     )
+     guided_grammar: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the output will follow the context-free grammar."),
+     )
+     structural_tag: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the output will follow the structural tag schema."),
+     )
+     guided_decoding_backend: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, will override the default guided decoding backend "
+             "of the server for this specific request. If set, must be one of "
+             "'outlines' / 'lm-format-enforcer'"),
+     )
+     guided_whitespace_pattern: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, will override the default whitespace pattern "
+             "for guided json decoding."),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+     request_id: str = Field(
+         default_factory=lambda: f"{random_uuid()}",
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the response."),
+     )
+     logits_processors: Optional[LogitsProcessors] = Field(
+         default=None,
+         description=(
+             "A list of either qualified names of logits processors, or "
+             "constructor objects, to apply when sampling. A constructor is "
+             "a JSON object with a required 'qualname' field specifying the "
+             "qualified name of the processor class/factory, and optional "
+             "'args' and 'kwargs' fields containing positional and keyword "
+             "arguments. For example: {'qualname': "
+             "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
+             "{'param': 'value'}}."))
+     return_tokens_as_token_ids: Optional[bool] = Field(
+         default=None,
+         description=(
+             "If specified with 'logprobs', tokens are represented "
+             "as strings of the form 'token_id:{token_id}' so that tokens "
+             "that are not JSON-encodable can be identified."))
+     return_token_ids: Optional[bool] = Field(
+         default=None,
+         description=(
+             "If specified, the result will include token IDs alongside the "
+             "generated text. In streaming mode, prompt_token_ids is included "
+             "only in the first chunk, and token_ids contains the delta tokens "
+             "for each chunk. This is useful for debugging or when you "
+             "need to map generated text back to input tokens."))
+     cache_salt: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the prefix cache will be salted with the provided "
+             "string to prevent an attacker from guessing prompts in "
+             "multi-user environments. The salt should be random, protected "
+             "from access by 3rd parties, and long enough to be unpredictable "
+             "(e.g., 43 characters base64-encoded, corresponding to 256 "
+             "bits). Not supported by vLLM engine V0."))
+     kv_transfer_params: Optional[dict[str, Any]] = Field(
+         default=None,
+         description="KVTransfer parameters used for disaggregated serving.")
+
+     vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
+         default=None,
+         description=("Additional request parameters with string or "
+                      "numeric values, used by custom extensions."),
+     )
+
+     # --8<-- [end:chat-completion-extra-params]
+
+     # Default sampling parameters for chat completion requests
+     _DEFAULT_SAMPLING_PARAMS: dict = {
+         "repetition_penalty": 1.0,
+         "temperature": 1.0,
+         "top_p": 1.0,
+         "top_k": 0,
+         "min_p": 0.0,
+     }
+
+     def to_beam_search_params(
+             self, max_tokens: int,
+             default_sampling_params: dict) -> BeamSearchParams:
+
+         n = self.n if self.n is not None else 1
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+
+         return BeamSearchParams(
+             beam_width=n,
+             max_tokens=max_tokens,
+             ignore_eos=self.ignore_eos,
+             temperature=temperature,
+             length_penalty=self.length_penalty,
+             include_stop_str_in_output=self.include_stop_str_in_output,
+         )
+
+     def to_sampling_params(
+         self,
+         max_tokens: int,
+         logits_processor_pattern: Optional[str],
+         default_sampling_params: dict,
+     ) -> SamplingParams:
+
+         # Default parameters
+         if (repetition_penalty := self.repetition_penalty) is None:
+             repetition_penalty = default_sampling_params.get(
+                 "repetition_penalty",
+                 self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+             )
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+         if (top_p := self.top_p) is None:
+             top_p = default_sampling_params.get(
+                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+         if (top_k := self.top_k) is None:
+             top_k = default_sampling_params.get(
+                 "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+         if (min_p := self.min_p) is None:
+             min_p = default_sampling_params.get(
+                 "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+
+         prompt_logprobs = self.prompt_logprobs
+         if prompt_logprobs is None and self.echo:
+             prompt_logprobs = self.top_logprobs
+
+         guided_json_object = None
+         if self.response_format is not None:
+             if self.response_format.type == "json_object":
+                 guided_json_object = True
+             elif self.response_format.type == "json_schema":
+                 json_schema = self.response_format.json_schema
+                 assert json_schema is not None
+                 self.guided_json = json_schema.json_schema
+             elif self.response_format.type == "structural_tag":
+                 structural_tag = self.response_format
+                 assert structural_tag is not None and isinstance(
+                     structural_tag, StructuralTagResponseFormat)
+                 s_tag_obj = structural_tag.model_dump(by_alias=True)
+                 self.structural_tag = json.dumps(s_tag_obj)
+
+         guided_decoding = GuidedDecodingParams.from_optional(
+             json=self._get_guided_json_from_tool() or self.guided_json,
+             regex=self.guided_regex,
+             choice=self.guided_choice,
+             grammar=self.guided_grammar,
+             json_object=guided_json_object,
+             backend=self.guided_decoding_backend,
+             whitespace_pattern=self.guided_whitespace_pattern,
+             structural_tag=self.structural_tag,
+         )
+
+         extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
+         if self.kv_transfer_params:
+             # Pass in kv_transfer_params via extra_args
+             extra_args["kv_transfer_params"] = self.kv_transfer_params
+         return SamplingParams.from_optional(
+             n=self.n,
+             best_of=self.best_of,
+             presence_penalty=self.presence_penalty,
+             frequency_penalty=self.frequency_penalty,
+             repetition_penalty=repetition_penalty,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             min_p=min_p,
+             seed=self.seed,
+             stop=self.stop,
+             stop_token_ids=self.stop_token_ids,
+             logprobs=self.top_logprobs if self.logprobs else None,
+             prompt_logprobs=prompt_logprobs,
+             ignore_eos=self.ignore_eos,
+             max_tokens=max_tokens,
+             min_tokens=self.min_tokens,
+             skip_special_tokens=self.skip_special_tokens,
+             spaces_between_special_tokens=self.spaces_between_special_tokens,
+             logits_processors=get_logits_processors(self.logits_processors,
+                                                     logits_processor_pattern),
+             include_stop_str_in_output=self.include_stop_str_in_output,
+             truncate_prompt_tokens=self.truncate_prompt_tokens,
+             output_kind=RequestOutputKind.DELTA if self.stream \
+             else RequestOutputKind.FINAL_ONLY,
+             guided_decoding=guided_decoding,
+             logit_bias=self.logit_bias,
+             bad_words=self.bad_words,
+             allowed_token_ids=self.allowed_token_ids,
+             extra_args=extra_args or None,
+         )
+
+     def _get_guided_json_from_tool(
+             self) -> Optional[Union[str, dict, BaseModel]]:
+         # user has chosen to not use any tool
+         if self.tool_choice == "none" or self.tools is None:
+             return None
+
+         # user has chosen to use a named tool
+         if type(self.tool_choice) is ChatCompletionNamedToolChoiceParam:
+             tool_name = self.tool_choice.function.name
+             tools = {tool.function.name: tool.function for tool in self.tools}
+             if tool_name not in tools:
+                 raise ValueError(
+                     f"Tool '{tool_name}' has not been passed in `tools`.")
+             tool = tools[tool_name]
+             return tool.parameters
+
+         if self.tool_choice == "required":
+             # Pydantic schema generation cannot be used since the JSON schema
+             # has to be constructed for a specific instantiation of a tool list
+             # so that parameters of a function are correctly generated
+             # based on the chosen function name
+             def get_tool_schema(tool: ChatCompletionToolsParam) -> dict:
+                 return {
+                     "properties": {
+                         "name": {
+                             "type": "string",
+                             "enum": [tool.function.name]
+                         },
+                         # parameters are always generated as '{}' in the final
+                         # output if they are missing from the request
+                         # (i.e. are None or '{}') so the schema is
+                         # updated to produce an empty object in that case
+                         "parameters": tool.function.parameters
+                         if tool.function.parameters else {
+                             "type": "object",
+                             "properties": {}
+                         }
+                     },
+                     "required": ["name", "parameters"]
+                 }
+
+             def get_tool_schema_defs(
+                     tools: list[ChatCompletionToolsParam]) -> dict:
+                 all_defs = dict[str, dict[str, Any]]()
+                 for tool in tools:
+                     if tool.function.parameters is None:
+                         continue
+                     defs = tool.function.parameters.pop("$defs", {})
+                     for def_name, def_schema in defs.items():
+                         if def_name in all_defs and all_defs[
+                                 def_name] != def_schema:
+                             raise ValueError(
+                                 f"Tool definition '{def_name}' has "
+                                 "multiple schemas, which is not "
+                                 "supported.")
+                         else:
+                             all_defs[def_name] = def_schema
+                 return all_defs
+
+             json_schema = {
+                 "type": "array",
+                 "minItems": 1,
+                 "items": {
+                     "type": "object",
+                     "anyOf": [get_tool_schema(tool) for tool in self.tools]
+                 }
+             }
+             json_schema_defs = get_tool_schema_defs(self.tools)
+             if json_schema_defs:
+                 json_schema["$defs"] = json_schema_defs
+             return json_schema
+
+         return None
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_stream_options(cls, data):
+         if data.get("stream_options") and not data.get("stream"):
+             raise ValueError(
+                 "Stream options can only be defined when `stream=True`.")
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_logprobs(cls, data):
+         if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
+             if data.get("stream") and prompt_logprobs > 0:
+                 raise ValueError(
+                     "`prompt_logprobs` are not available when `stream=True`.")
+
+             if prompt_logprobs < 0:
+                 raise ValueError(
+                     "`prompt_logprobs` must be a non-negative value.")
+
+         if (top_logprobs := data.get("top_logprobs")) is not None:
+             if top_logprobs < 0:
+                 raise ValueError(
+                     "`top_logprobs` must be a non-negative value.")
+
+             if top_logprobs > 0 and not data.get("logprobs"):
+                 raise ValueError(
+                     "when using `top_logprobs`, `logprobs` must be set to true."
+                 )
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_guided_decoding_count(cls, data):
+         if isinstance(data, ValueError):
+             raise data
+
+         guide_count = sum([
+             "guided_json" in data and data["guided_json"] is not None,
+             "guided_regex" in data and data["guided_regex"] is not None,
+             "guided_choice" in data and data["guided_choice"] is not None
+         ])
+         # you can only use one kind of guided decoding
+         if guide_count > 1:
+             raise ValueError(
+                 "You can only use one kind of guided decoding "
+                 "('guided_json', 'guided_regex' or 'guided_choice').")
+         # you can only either use guided decoding or tools, not both
+         if guide_count > 1 and data.get("tool_choice", "none") not in (
+                 "none",
+                 "auto",
+                 "required",
+         ):
+             raise ValueError(
+                 "You can only either use guided decoding or tools, not both.")
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_tool_usage(cls, data):
+
+         # if "tool_choice" is not specified but tools are provided,
+         # default to "auto" tool_choice
+         if "tool_choice" not in data and data.get("tools"):
+             data["tool_choice"] = "auto"
+
+         # if "tool_choice" is "none" -- no validation is needed for tools
+         if "tool_choice" in data and data["tool_choice"] == "none":
+             return data
+
+         # if "tool_choice" is specified -- validation
+         if "tool_choice" in data and data["tool_choice"] is not None:
+
+             # ensure that if "tool_choice" is specified, tools are present
+             if "tools" not in data or data["tools"] is None:
+                 raise ValueError(
+                     "When using `tool_choice`, `tools` must be set.")
+
+             # make sure that tool choice is either a named tool
+             # OR that it's set to "auto" or "required"
+             if data["tool_choice"] not in [
+                     "auto", "required"
+             ] and not isinstance(data["tool_choice"], dict):
+                 raise ValueError(
+                     f'Invalid value for `tool_choice`: {data["tool_choice"]}! '\
+                     'Only named tools, "none", "auto" or "required" '\
+                     'are supported.'
+                 )
+
+             # if tool_choice is "required" but the "tools" list is empty,
+             # override the data to behave like "none" to align with
+             # OpenAI’s behavior.
+             if data["tool_choice"] == "required" and isinstance(
+                     data["tools"], list) and len(data["tools"]) == 0:
+                 data["tool_choice"] = "none"
+                 del data["tools"]
+                 return data
+
+             # ensure that if "tool_choice" is specified as an object,
+             # it matches a valid tool
+             correct_usage_message = 'Correct usage: `{"type": "function",' \
+                                     ' "function": {"name": "my_function"}}`'
+             if isinstance(data["tool_choice"], dict):
+                 valid_tool = False
+                 function = data["tool_choice"].get("function")
+                 if not isinstance(function, dict):
+                     raise ValueError(
+                         f"Invalid value for `function`: `{function}` in "
+                         f"`tool_choice`! {correct_usage_message}")
+                 if "name" not in function:
+                     raise ValueError(f"Expected field `name` in `function` in "
+                                      f"`tool_choice`! {correct_usage_message}")
+                 function_name = function["name"]
+                 if not isinstance(function_name,
+                                   str) or len(function_name) == 0:
+                     raise ValueError(
+                         f"Invalid `name` in `function`: `{function_name}`"
+                         f" in `tool_choice`! {correct_usage_message}")
+                 for tool in data["tools"]:
+                     if tool["function"]["name"] == function_name:
+                         valid_tool = True
+                         break
+                 if not valid_tool:
+                     raise ValueError(
+                         "The tool specified in `tool_choice` does not match any"
+                         " of the specified `tools`")
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_generation_prompt(cls, data):
+         if data.get("continue_final_message") and data.get(
+                 "add_generation_prompt"):
+             raise ValueError("Cannot set both `continue_final_message` and "
+                              "`add_generation_prompt` to True.")
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_cache_salt_support(cls, data):
+         if data.get("cache_salt") is not None:
+             if not envs.VLLM_USE_V1:
+                 raise ValueError(
+                     "Parameter 'cache_salt' is not supported with "
+                     "this instance of vLLM, which uses engine V0.")
+             if not isinstance(data["cache_salt"],
+                               str) or not data["cache_salt"]:
+                 raise ValueError("Parameter 'cache_salt' must be a "
+                                  "non-empty string if provided.")
+         return data
+
+
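For `tool_choice="required"`, `_get_guided_json_from_tool` forces the model to emit a non-empty JSON array of `{name, parameters}` objects, with one `anyOf` branch per declared tool. A standalone sketch that builds the same schema shape from two invented tool definitions:

    import json

    tools = [
        {"name": "get_weather",
         "parameters": {"type": "object",
                        "properties": {"city": {"type": "string"}}}},
        {"name": "get_time", "parameters": None},  # no parameters declared
    ]

    def branch(tool: dict) -> dict:
        return {
            "properties": {
                "name": {"type": "string", "enum": [tool["name"]]},
                # Fall back to an empty object schema when the tool
                # declares no parameters
                "parameters": tool["parameters"]
                or {"type": "object", "properties": {}},
            },
            "required": ["name", "parameters"],
        }

    schema = {
        "type": "array",
        "minItems": 1,
        "items": {"type": "object", "anyOf": [branch(t) for t in tools]},
    }
    print(json.dumps(schema, indent=2))
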
+ class CompletionRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/completions/create
+     model: Optional[str] = None
+     prompt: Optional[Union[list[int], list[list[int]], str, list[str]]] = None
+     prompt_embeds: Optional[Union[bytes, list[bytes]]] = None
+     best_of: Optional[int] = None
+     echo: Optional[bool] = False
+     frequency_penalty: Optional[float] = 0.0
+     logit_bias: Optional[dict[str, float]] = None
+     logprobs: Optional[int] = None
+     max_tokens: Optional[int] = 16
+     n: int = 1
+     presence_penalty: Optional[float] = 0.0
+     seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+     stop: Optional[Union[str, list[str]]] = []
+     stream: Optional[bool] = False
+     stream_options: Optional[StreamOptions] = None
+     suffix: Optional[str] = None
+     temperature: Optional[float] = None
+     top_p: Optional[float] = None
+     user: Optional[str] = None
+
+     # --8<-- [start:completion-sampling-params]
+     use_beam_search: bool = False
+     top_k: Optional[int] = None
+     min_p: Optional[float] = None
+     repetition_penalty: Optional[float] = None
+     length_penalty: float = 1.0
+     stop_token_ids: Optional[list[int]] = []
+     include_stop_str_in_output: bool = False
+     ignore_eos: bool = False
+     min_tokens: int = 0
+     skip_special_tokens: bool = True
+     spaces_between_special_tokens: bool = True
+     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+     allowed_token_ids: Optional[list[int]] = None
+     prompt_logprobs: Optional[int] = None
+     # --8<-- [end:completion-sampling-params]
+
+     # --8<-- [start:completion-extra-params]
+     add_special_tokens: bool = Field(
+         default=True,
+         description=(
+             "If true (the default), special tokens (e.g. BOS) will be added to "
+             "the prompt."),
+     )
+     response_format: Optional[AnyResponseFormat] = Field(
+         default=None,
+         description=(
+             "Similar to chat completion, this parameter specifies the format "
+             "of output. Only {'type': 'json_object'}, {'type': 'json_schema'}"
+             ", {'type': 'structural_tag'}, or {'type': 'text' } is supported."
+         ),
+     )
+     guided_json: Optional[Union[str, dict, BaseModel]] = Field(
+         default=None,
+         description="If specified, the output will follow the JSON schema.",
+     )
+     guided_regex: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the output will follow the regex pattern."),
+     )
+     guided_choice: Optional[list[str]] = Field(
+         default=None,
+         description=(
+             "If specified, the output will be exactly one of the choices."),
+     )
+     guided_grammar: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the output will follow the context-free grammar."),
+     )
+     guided_decoding_backend: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, will override the default guided decoding backend "
+             "of the server for this specific request. If set, must be one of "
+             "'outlines' / 'lm-format-enforcer'"),
+     )
+     guided_whitespace_pattern: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, will override the default whitespace pattern "
+             "for guided json decoding."),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+     request_id: str = Field(
+         default_factory=lambda: f"{random_uuid()}",
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the response."),
+     )
+     logits_processors: Optional[LogitsProcessors] = Field(
+         default=None,
+         description=(
+             "A list of either qualified names of logits processors, or "
+             "constructor objects, to apply when sampling. A constructor is "
+             "a JSON object with a required 'qualname' field specifying the "
+             "qualified name of the processor class/factory, and optional "
+             "'args' and 'kwargs' fields containing positional and keyword "
+             "arguments. For example: {'qualname': "
+             "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
+             "{'param': 'value'}}."))
+
+     return_tokens_as_token_ids: Optional[bool] = Field(
+         default=None,
+         description=(
+             "If specified with 'logprobs', tokens are represented "
+             "as strings of the form 'token_id:{token_id}' so that tokens "
+             "that are not JSON-encodable can be identified."))
+     return_token_ids: Optional[bool] = Field(
+         default=None,
+         description=(
+             "If specified, the result will include token IDs alongside the "
+             "generated text. In streaming mode, prompt_token_ids is included "
+             "only in the first chunk, and token_ids contains the delta tokens "
+             "for each chunk. This is useful for debugging or when you "
+             "need to map generated text back to input tokens."))
+
+     cache_salt: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the prefix cache will be salted with the provided "
+             "string to prevent an attacker from guessing prompts in "
+             "multi-user environments. The salt should be random, protected "
+             "from access by 3rd parties, and long enough to be unpredictable "
+             "(e.g., 43 characters base64-encoded, corresponding to 256 "
+             "bits). Not supported by vLLM engine V0."))
+
+     kv_transfer_params: Optional[dict[str, Any]] = Field(
+         default=None,
+         description="KVTransfer parameters used for disaggregated serving.")
+
+     vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
+         default=None,
+         description=("Additional request parameters with string or "
+                      "numeric values, used by custom extensions."),
+     )
+
+     # --8<-- [end:completion-extra-params]
+
+     # Default sampling parameters for completion requests
+     _DEFAULT_SAMPLING_PARAMS: dict = {
+         "repetition_penalty": 1.0,
+         "temperature": 1.0,
+         "top_p": 1.0,
+         "top_k": 0,
+         "min_p": 0.0,
+     }
+
+     def to_beam_search_params(
+         self,
+         max_tokens: int,
+         default_sampling_params: Optional[dict] = None,
+     ) -> BeamSearchParams:
+
+         if default_sampling_params is None:
+             default_sampling_params = {}
+         n = self.n if self.n is not None else 1
+
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get("temperature", 1.0)
+
+         return BeamSearchParams(
+             beam_width=n,
+             max_tokens=max_tokens,
+             ignore_eos=self.ignore_eos,
+             temperature=temperature,
+             length_penalty=self.length_penalty,
+             include_stop_str_in_output=self.include_stop_str_in_output,
+         )
+
+     def to_sampling_params(
+         self,
+         max_tokens: int,
+         logits_processor_pattern: Optional[str],
+         default_sampling_params: Optional[dict] = None,
+     ) -> SamplingParams:
+
+         if default_sampling_params is None:
+             default_sampling_params = {}
+
+         # Default parameters
+         if (repetition_penalty := self.repetition_penalty) is None:
+             repetition_penalty = default_sampling_params.get(
+                 "repetition_penalty",
+                 self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+             )
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+         if (top_p := self.top_p) is None:
+             top_p = default_sampling_params.get(
+                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+         if (top_k := self.top_k) is None:
+             top_k = default_sampling_params.get(
+                 "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+         if (min_p := self.min_p) is None:
+             min_p = default_sampling_params.get(
+                 "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+
+         prompt_logprobs = self.prompt_logprobs
+         if prompt_logprobs is None and self.echo:
+             prompt_logprobs = self.logprobs
+
+         echo_without_generation = self.echo and self.max_tokens == 0
+
+         guided_json_object = None
+         if (self.response_format is not None
+                 and self.response_format.type == "json_object"):
+             guided_json_object = True
+
+         guided_decoding = GuidedDecodingParams.from_optional(
+             json=self.guided_json,
+             regex=self.guided_regex,
+             choice=self.guided_choice,
+             grammar=self.guided_grammar,
+             json_object=guided_json_object,
+             backend=self.guided_decoding_backend,
+             whitespace_pattern=self.guided_whitespace_pattern,
+         )
+
+         extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
+         if self.kv_transfer_params:
+             # Pass in kv_transfer_params via extra_args
+             extra_args["kv_transfer_params"] = self.kv_transfer_params
+         return SamplingParams.from_optional(
+             n=self.n,
+             best_of=self.best_of,
+             presence_penalty=self.presence_penalty,
+             frequency_penalty=self.frequency_penalty,
+             repetition_penalty=repetition_penalty,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             min_p=min_p,
+             seed=self.seed,
+             stop=self.stop,
+             stop_token_ids=self.stop_token_ids,
+             logprobs=self.logprobs,
+             ignore_eos=self.ignore_eos,
+             max_tokens=max_tokens if not echo_without_generation else 1,
+             min_tokens=self.min_tokens,
+             prompt_logprobs=prompt_logprobs,
+             skip_special_tokens=self.skip_special_tokens,
+             spaces_between_special_tokens=self.spaces_between_special_tokens,
+             include_stop_str_in_output=self.include_stop_str_in_output,
+             logits_processors=get_logits_processors(self.logits_processors,
+                                                     logits_processor_pattern),
+             truncate_prompt_tokens=self.truncate_prompt_tokens,
+             output_kind=RequestOutputKind.DELTA if self.stream \
+             else RequestOutputKind.FINAL_ONLY,
+             guided_decoding=guided_decoding,
+             logit_bias=self.logit_bias,
+             allowed_token_ids=self.allowed_token_ids,
+             extra_args=extra_args or None,
+         )
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_guided_decoding_count(cls, data):
+         guide_count = sum([
+             "guided_json" in data and data["guided_json"] is not None,
+             "guided_regex" in data and data["guided_regex"] is not None,
+             "guided_choice" in data and data["guided_choice"] is not None
+         ])
+         if guide_count > 1:
+             raise ValueError(
+                 "You can only use one kind of guided decoding "
+                 "('guided_json', 'guided_regex' or 'guided_choice').")
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_logprobs(cls, data):
+         if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
+             if data.get("stream") and prompt_logprobs > 0:
+                 raise ValueError(
+                     "`prompt_logprobs` are not available when `stream=True`.")
+
+             if prompt_logprobs < 0:
+                 raise ValueError(
+                     "`prompt_logprobs` must be a non-negative value.")
+
+         if (logprobs := data.get("logprobs")) is not None and logprobs < 0:
+             raise ValueError("`logprobs` must be a non-negative value.")
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_stream_options(cls, data):
+         if data.get("stream_options") and not data.get("stream"):
+             raise ValueError(
+                 "Stream options can only be defined when `stream=True`.")
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_prompt_and_prompt_embeds(cls, data):
+         prompt = data.get("prompt")
+         prompt_embeds = data.get("prompt_embeds")
+
+         prompt_is_empty = (prompt is None
+                            or (isinstance(prompt, str) and prompt == ""))
+         embeds_is_empty = (prompt_embeds is None
+                            or (isinstance(prompt_embeds, list)
+                                and len(prompt_embeds) == 0))
+
+         if prompt_is_empty and embeds_is_empty:
+             raise ValueError(
+                 "Either prompt or prompt_embeds must be provided and non-empty."
+             )
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_cache_salt_support(cls, data):
+         if data.get("cache_salt") is not None:
+             if not envs.VLLM_USE_V1:
+                 raise ValueError(
+                     "Parameter 'cache_salt' is not supported with "
+                     "this instance of vLLM, which uses engine V0.")
+             if not isinstance(data["cache_salt"],
+                               str) or not data["cache_salt"]:
+                 raise ValueError("Parameter 'cache_salt' must be a "
+                                  "non-empty string if provided.")
+         return data
+
+
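`CompletionRequest` accepts either a textual `prompt` or binary `prompt_embeds`, and the validator above rejects requests in which both are missing or empty. A compact standalone sketch of that either-or check (plain pydantic; the model is illustrative):

    from typing import Optional, Union
    from pydantic import BaseModel, model_validator

    class PromptOrEmbeds(BaseModel):
        prompt: Optional[Union[str, list[str]]] = None
        prompt_embeds: Optional[list[bytes]] = None

        @model_validator(mode="before")
        @classmethod
        def require_one(cls, data):
            if data.get("prompt") in (None, "") and not data.get("prompt_embeds"):
                raise ValueError("Either prompt or prompt_embeds must be "
                                 "provided and non-empty.")
            return data

    PromptOrEmbeds(prompt="Hello")  # ok
    # PromptOrEmbeds()              # raises: neither field provided
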
+ class EmbeddingCompletionRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/embeddings
+     model: Optional[str] = None
+     input: Union[list[int], list[list[int]], str, list[str]]
+     encoding_format: Literal["float", "base64"] = "float"
+     dimensions: Optional[int] = None
+     user: Optional[str] = None
+     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+
+     # --8<-- [start:embedding-extra-params]
+     add_special_tokens: bool = Field(
+         default=True,
+         description=(
+             "If true (the default), special tokens (e.g. BOS) will be added to "
+             "the prompt."),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+     request_id: str = Field(
+         default_factory=lambda: f"{random_uuid()}",
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the response."),
+     )
+     normalize: Optional[bool] = None
+
+     # --8<-- [end:embedding-extra-params]
+
+     def to_pooling_params(self):
+         return PoolingParams(
+             truncate_prompt_tokens=self.truncate_prompt_tokens,
+             dimensions=self.dimensions,
+             normalize=self.normalize)
+
+
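As a usage sketch, an embeddings request needs only `input`; the extra knobs above are optional and fall back to server defaults. An example payload (the model name is invented; the endpoint shown is the OpenAI-compatible embeddings route):

    import json

    payload = {
        "model": "my-embedding-model",           # served model name (example)
        "input": ["first text", "second text"],  # str, list[str], or token ids
        "encoding_format": "float",              # or "base64"
        "dimensions": 256,                       # optional output size
    }
    # e.g. POST /v1/embeddings with this JSON body
    print(json.dumps(payload, indent=2))
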
+ class EmbeddingChatRequest(OpenAIBaseModel):
+     model: Optional[str] = None
+     messages: list[ChatCompletionMessageParam]
+
+     encoding_format: Literal["float", "base64"] = "float"
+     dimensions: Optional[int] = None
+     user: Optional[str] = None
+     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+
+     # --8<-- [start:chat-embedding-extra-params]
+     add_generation_prompt: bool = Field(
+         default=False,
+         description=
+         ("If true, the generation prompt will be added to the chat template. "
+          "This is a parameter used by the chat template in the tokenizer "
+          "config of the model."),
+     )
+
+     add_special_tokens: bool = Field(
+         default=False,
+         description=(
+             "If true, special tokens (e.g. BOS) will be added to the prompt "
+             "on top of what is added by the chat template. "
+             "For most models, the chat template takes care of adding the "
+             "special tokens so this should be set to false (as is the "
+             "default)."),
+     )
+     chat_template: Optional[str] = Field(
+         default=None,
+         description=(
+             "A Jinja template to use for this conversion. "
+             "As of transformers v4.44, default chat template is no longer "
+             "allowed, so you must provide a chat template if the tokenizer "
+             "does not define one."),
+     )
+     chat_template_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=(
+             "Additional keyword args to pass to the template renderer. "
+             "Will be accessible by the chat template."),
+     )
+     mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+     request_id: str = Field(
+         default_factory=lambda: f"{random_uuid()}",
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the response."),
+     )
+     normalize: Optional[bool] = None
+     # --8<-- [end:chat-embedding-extra-params]
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_generation_prompt(cls, data):
+         if data.get("continue_final_message") and data.get(
+                 "add_generation_prompt"):
+             raise ValueError("Cannot set both `continue_final_message` and "
+                              "`add_generation_prompt` to True.")
+         return data
+
+     def to_pooling_params(self):
+         return PoolingParams(
+             truncate_prompt_tokens=self.truncate_prompt_tokens,
+             dimensions=self.dimensions,
+             normalize=self.normalize)
+
+
+ EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest]
+
+ PoolingCompletionRequest = EmbeddingCompletionRequest
+ PoolingChatRequest = EmbeddingChatRequest
+
+ T = TypeVar("T")
+
+
+ class IOProcessorRequest(OpenAIBaseModel, Generic[T]):
+     model: Optional[str] = None
+
+     priority: int = Field(default=0)
+     """
+     The priority of the request (lower means earlier handling;
+     default: 0). Any priority other than 0 will raise an error
+     if the served model does not use priority scheduling.
+     """
+     data: T
+     """
+     When using IOProcessor plugins, the actual input is processed
+     by the plugin itself. Hence, we use a generic type for the request data.
+     """
+     softmax: bool = True
+
+     def to_pooling_params(self):
+         return PoolingParams(task="encode", softmax=self.softmax)
+
+
+ class IOProcessorResponse(OpenAIBaseModel, Generic[T]):
+
+     request_id: Optional[str] = None
+     """
+     The request_id associated with this response.
+     """
+     created_at: int = Field(default_factory=lambda: int(time.time()))
+
+     data: T
+     """
+     When using IOProcessor plugins, the actual output is generated
+     by the plugin itself. Hence, we use a generic type for the response data.
+     """
+
+
+ PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest,
+                        IOProcessorRequest]
+
+
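`IOProcessorRequest` and `IOProcessorResponse` are generic pydantic models: each plugin fixes the payload type by parametrizing `T`. A minimal standalone sketch of the pattern (all names and values invented):

    from typing import Generic, TypeVar
    from pydantic import BaseModel

    T = TypeVar("T")

    class PluginRequest(BaseModel, Generic[T]):
        data: T

    # One plugin may expect a vector payload...
    req = PluginRequest[list[float]](data=[0.1, 0.2, 0.3])
    print(req.data)

    # ...while another expects a structured payload.
    class ImageRef(BaseModel):
        url: str

    req2 = PluginRequest[ImageRef](data=ImageRef(url="https://example.com/x.png"))
    print(req2.data.url)
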
1471
class ScoreRequest(OpenAIBaseModel):
    model: Optional[str] = None
    text_1: Union[list[str], str, ScoreMultiModalParam]
    text_2: Union[list[str], str, ScoreMultiModalParam]
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None

    # --8<-- [start:score-extra-params]

    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )

    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )

    activation: Optional[bool] = None

    # --8<-- [end:score-extra-params]

    def to_pooling_params(self):
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            activation=self.activation)


class RerankRequest(OpenAIBaseModel):
    model: Optional[str] = None
    query: Union[str, ScoreMultiModalParam]
    documents: Union[list[str], ScoreMultiModalParam]
    top_n: int = Field(default_factory=lambda: 0)
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None

    # --8<-- [start:rerank-extra-params]

    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )

    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )

    activation: Optional[bool] = None

    # --8<-- [end:rerank-extra-params]

    def to_pooling_params(self):
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            activation=self.activation)


class RerankDocument(BaseModel):
    text: Optional[str] = None
    multi_modal: Optional[ScoreContentPartParam] = None


class RerankResult(BaseModel):
    index: int
    document: RerankDocument
    relevance_score: float


class RerankUsage(BaseModel):
    total_tokens: int


class RerankResponse(OpenAIBaseModel):
    id: str
    model: str
    usage: RerankUsage
    results: list[RerankResult]


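# Editor's sketch (not part of the package source): a /rerank payload in the
# shape `RerankRequest` accepts, with the response models it pairs with. The
# strings and scores are hypothetical.
def _example_rerank_roundtrip():
    req = RerankRequest(
        query="What is the capital of France?",
        documents=["Paris is the capital of France.",
                   "Berlin is in Germany."],
        top_n=1,
    )
    resp = RerankResponse(
        id="rerank-123",
        model="my-reranker",
        usage=RerankUsage(total_tokens=42),
        results=[
            RerankResult(index=0,
                         document=RerankDocument(text=req.documents[0]),
                         relevance_score=0.98)
        ],
    )
    return req, resp
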
class CompletionLogProbs(OpenAIBaseModel):
    text_offset: list[int] = Field(default_factory=list)
    token_logprobs: list[Optional[float]] = Field(default_factory=list)
    tokens: list[str] = Field(default_factory=list)
    top_logprobs: list[Optional[dict[str,
                                     float]]] = Field(default_factory=list)


class CompletionResponseChoice(OpenAIBaseModel):
    index: int
    text: str
    logprobs: Optional[CompletionLogProbs] = None
    finish_reason: Optional[str] = None
    stop_reason: Optional[Union[int, str]] = Field(
        default=None,
        description=(
            "The stop string or token id that caused the completion "
            "to stop, None if the completion finished for some other reason "
            "including encountering the EOS token"),
    )
    token_ids: Optional[list[int]] = None  # For response
    prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
    prompt_token_ids: Optional[list[int]] = None  # For prompt


class CompletionResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
    object: Literal["text_completion"] = "text_completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[CompletionResponseChoice]
    service_tier: Optional[Literal["auto", "default", "flex", "scale",
                                   "priority"]] = None
    system_fingerprint: Optional[str] = None
    usage: UsageInfo

    # vLLM-specific fields that are not in OpenAI spec
    kv_transfer_params: Optional[dict[str, Any]] = Field(
        default=None, description="KVTransfer parameters.")


class CompletionResponseStreamChoice(OpenAIBaseModel):
    index: int
    text: str
    logprobs: Optional[CompletionLogProbs] = None
    finish_reason: Optional[str] = None
    stop_reason: Optional[Union[int, str]] = Field(
        default=None,
        description=(
            "The stop string or token id that caused the completion "
            "to stop, None if the completion finished for some other reason "
            "including encountering the EOS token"),
    )
    # not part of the OpenAI spec but used for tracing the tokens;
    # prompt tokens are put into the choice to align with
    # CompletionResponseChoice
    prompt_token_ids: Optional[list[int]] = None
    token_ids: Optional[list[int]] = None


class CompletionStreamResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
    object: str = "text_completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[CompletionResponseStreamChoice]
    usage: Optional[UsageInfo] = Field(default=None)


class EmbeddingResponseData(OpenAIBaseModel):
    index: int
    object: str = "embedding"
    embedding: Union[list[float], str]


class EmbeddingResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
    object: str = "list"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    data: list[EmbeddingResponseData]
    usage: UsageInfo


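# Editor's note (not part of the package source): `embedding` is typed
# `Union[list[float], str]` because the vector can be returned either as a
# JSON list of floats or, when base64 encoding is requested, as a base64
# string of packed little-endian float32 values. A client-side decoding
# sketch under that assumption:
def _example_decode_embedding(data: EmbeddingResponseData) -> list[float]:
    import base64
    import struct
    if isinstance(data.embedding, str):
        raw = base64.b64decode(data.embedding)
        return list(struct.unpack(f"<{len(raw) // 4}f", raw))
    return data.embedding
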
class PoolingResponseData(OpenAIBaseModel):
    index: int
    object: str = "pooling"
    data: Union[list[list[float]], list[float], str]


class PoolingResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"pool-{random_uuid()}")
    object: str = "list"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    data: list[PoolingResponseData]
    usage: UsageInfo


class ScoreResponseData(OpenAIBaseModel):
    index: int
    object: str = "score"
    score: float


class ScoreResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
    object: str = "list"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    data: list[ScoreResponseData]
    usage: UsageInfo


class ClassificationRequest(OpenAIBaseModel):
    model: Optional[str] = None
    input: Union[list[str], str]
    truncate_prompt_tokens: Optional[int] = None
    user: Optional[str] = None

    # --8<-- [start:classification-extra-params]
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )

    activation: Optional[bool] = None

    # --8<-- [end:classification-extra-params]

    def to_pooling_params(self):
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            activation=self.activation)


class ClassificationData(OpenAIBaseModel):
    index: int
    label: Optional[str]
    probs: list[float]
    num_classes: int


class ClassificationResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"classify-{random_uuid()}")
    object: str = "list"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    data: list[ClassificationData]
    usage: UsageInfo


class FunctionCall(OpenAIBaseModel):
    name: str
    arguments: str


class ToolCall(OpenAIBaseModel):
    id: str = Field(default_factory=make_tool_call_id)
    type: Literal["function"] = "function"
    function: FunctionCall


class DeltaFunctionCall(BaseModel):
    name: Optional[str] = None
    arguments: Optional[str] = None


# a tool call delta where everything is optional
class DeltaToolCall(OpenAIBaseModel):
    id: Optional[str] = None
    type: Optional[Literal["function"]] = None
    index: int
    function: Optional[DeltaFunctionCall] = None


class ExtractedToolCallInformation(BaseModel):
    # indicate if tools were called
    tools_called: bool

    # extracted tool calls
    tool_calls: list[ToolCall]

    # content - per the OpenAI spec, content AND tool calls are only rarely
    # returned together, but some models will do this intentionally
    content: Optional[str] = None


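# Editor's sketch (not part of the package source): during streaming, tool
# calls arrive as DeltaToolCall fragments keyed by `index`, with the JSON
# `arguments` string split across chunks. A client can merge them back into
# complete ToolCall objects roughly like this.
def _example_merge_tool_call_deltas(
        deltas: list[DeltaToolCall]) -> list[ToolCall]:
    ids: dict[int, str] = {}
    names: dict[int, str] = {}
    args: dict[int, str] = {}
    for delta in deltas:
        if delta.id is not None:
            ids[delta.index] = delta.id
        if delta.function is not None:
            if delta.function.name is not None:
                names[delta.index] = delta.function.name
            if delta.function.arguments is not None:
                # Argument JSON arrives as string fragments; concatenate them.
                args[delta.index] = (args.get(delta.index, "") +
                                     delta.function.arguments)
    merged = []
    for i in sorted(names):
        kwargs = {"function": FunctionCall(name=names[i],
                                           arguments=args.get(i, ""))}
        if i in ids:
            kwargs["id"] = ids[i]
        merged.append(ToolCall(**kwargs))
    return merged
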
class ChatMessage(OpenAIBaseModel):
    role: str
    content: Optional[str] = None
    refusal: Optional[str] = None
    annotations: Optional[OpenAIAnnotation] = None
    audio: Optional[OpenAIChatCompletionAudio] = None
    function_call: Optional[FunctionCall] = None
    tool_calls: list[ToolCall] = Field(default_factory=list)

    # vLLM-specific fields that are not in OpenAI spec
    reasoning_content: Optional[str] = None


class ChatCompletionLogProb(OpenAIBaseModel):
    token: str
    logprob: float = -9999.0
    bytes: Optional[list[int]] = None


class ChatCompletionLogProbsContent(ChatCompletionLogProb):
    # Workaround: redefine the field-name cache so that it is not
    # shared with the superclass.
    field_names: ClassVar[Optional[set[str]]] = None
    top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)


class ChatCompletionLogProbs(OpenAIBaseModel):
    content: Optional[list[ChatCompletionLogProbsContent]] = None


class ChatCompletionResponseChoice(OpenAIBaseModel):
    index: int
    message: ChatMessage
    logprobs: Optional[ChatCompletionLogProbs] = None
    # per OpenAI spec this is the default
    finish_reason: Optional[str] = "stop"
    # not part of the OpenAI spec but included in vLLM for legacy reasons
    stop_reason: Optional[Union[int, str]] = None
    # not part of the OpenAI spec but is useful for tracing the tokens
    # in agent scenarios
    token_ids: Optional[list[int]] = None


class ChatCompletionResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
    object: Literal["chat.completion"] = "chat.completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[ChatCompletionResponseChoice]
    service_tier: Optional[Literal["auto", "default", "flex", "scale",
                                   "priority"]] = None
    system_fingerprint: Optional[str] = None
    usage: UsageInfo

    # vLLM-specific fields that are not in OpenAI spec
    prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
    prompt_token_ids: Optional[list[int]] = None
    kv_transfer_params: Optional[dict[str, Any]] = Field(
        default=None, description="KVTransfer parameters.")


class DeltaMessage(OpenAIBaseModel):
    role: Optional[str] = None
    content: Optional[str] = None
    reasoning_content: Optional[str] = None
    tool_calls: list[DeltaToolCall] = Field(default_factory=list)


class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
    index: int
    delta: DeltaMessage
    logprobs: Optional[ChatCompletionLogProbs] = None
    finish_reason: Optional[str] = None
    stop_reason: Optional[Union[int, str]] = None
    # not part of the OpenAI spec but for tracing the tokens
    token_ids: Optional[list[int]] = None


class ChatCompletionStreamResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[ChatCompletionResponseStreamChoice]
    usage: Optional[UsageInfo] = Field(default=None)
    # not part of the OpenAI spec but for tracing the tokens
    prompt_token_ids: Optional[list[int]] = None


class TranscriptionResponseStreamChoice(OpenAIBaseModel):
    delta: DeltaMessage
    finish_reason: Optional[str] = None
    stop_reason: Optional[Union[int, str]] = None


class TranscriptionStreamResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"trsc-{random_uuid()}")
    object: Literal["transcription.chunk"] = "transcription.chunk"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[TranscriptionResponseStreamChoice]
    usage: Optional[UsageInfo] = Field(default=None)


class InputTokensDetails(OpenAIBaseModel):
    cached_tokens: int


class OutputTokensDetails(OpenAIBaseModel):
    reasoning_tokens: int = 0
    tool_output_tokens: int = 0


class ResponseUsage(OpenAIBaseModel):
    input_tokens: int
    input_tokens_details: InputTokensDetails
    output_tokens: int
    output_tokens_details: OutputTokensDetails
    total_tokens: int


class ResponsesResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
    created_at: int = Field(default_factory=lambda: int(time.time()))
    # error: Optional[ResponseError] = None
    # incomplete_details: Optional[IncompleteDetails] = None
    instructions: Optional[str] = None
    metadata: Optional[Metadata] = None
    model: str
    object: Literal["response"] = "response"
    output: list[ResponseOutputItem]
    parallel_tool_calls: bool
    temperature: float
    tool_choice: ToolChoice
    tools: list[Tool]
    top_p: float
    background: bool
    max_output_tokens: int
    max_tool_calls: Optional[int] = None
    previous_response_id: Optional[str] = None
    prompt: Optional[ResponsePrompt] = None
    reasoning: Optional[Reasoning] = None
    service_tier: Literal["auto", "default", "flex", "scale", "priority"]
    status: ResponseStatus
    text: Optional[ResponseTextConfig] = None
    top_logprobs: Optional[int] = None
    truncation: Literal["auto", "disabled"]
    usage: Optional[ResponseUsage] = None
    user: Optional[str] = None

    @classmethod
    def from_request(
        cls,
        request: ResponsesRequest,
        sampling_params: SamplingParams,
        model_name: str,
        created_time: int,
        output: list[ResponseOutputItem],
        status: ResponseStatus,
        usage: Optional[ResponseUsage] = None,
    ) -> "ResponsesResponse":
        return cls(
            id=request.request_id,
            created_at=created_time,
            instructions=request.instructions,
            metadata=request.metadata,
            model=model_name,
            output=output,
            parallel_tool_calls=request.parallel_tool_calls,
            temperature=sampling_params.temperature,
            tool_choice=request.tool_choice,
            tools=request.tools,
            top_p=sampling_params.top_p,
            background=request.background,
            max_output_tokens=sampling_params.max_tokens,
            max_tool_calls=request.max_tool_calls,
            previous_response_id=request.previous_response_id,
            prompt=request.prompt,
            reasoning=request.reasoning,
            service_tier=request.service_tier,
            status=status,
            text=request.text,
            top_logprobs=sampling_params.logprobs,
            truncation=request.truncation,
            user=request.user,
            usage=usage,
        )


BatchRequestInputBody = Union[ChatCompletionRequest, EmbeddingRequest,
                              ScoreRequest, RerankRequest]


class BatchRequestInput(OpenAIBaseModel):
    """
    The per-line object of the batch input file.

    NOTE: Currently only the `/v1/chat/completions` endpoint is supported.
    """

    # A developer-provided per-request id that will be used to match outputs to
    # inputs. Must be unique for each request in a batch.
    custom_id: str

    # The HTTP method to be used for the request. Currently only POST is
    # supported.
    method: str

    # The OpenAI API relative URL to be used for the request. Currently
    # /v1/chat/completions is supported.
    url: str

    # The parameters of the request.
    body: BatchRequestInputBody

    @field_validator('body', mode='plain')
    @classmethod
    def check_type_for_url(cls, value: Any, info: ValidationInfo):
        # Use url to disambiguate models
        url: str = info.data["url"]
        if url == "/v1/chat/completions":
            return ChatCompletionRequest.model_validate(value)
        if url == "/v1/embeddings":
            return TypeAdapter(EmbeddingRequest).validate_python(value)
        if url.endswith("/score"):
            return ScoreRequest.model_validate(value)
        if url.endswith("/rerank"):
            return RerankRequest.model_validate(value)
        return TypeAdapter(BatchRequestInputBody).validate_python(value)


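# Editor's sketch (not part of the package source): one line of a batch
# input file, as `BatchRequestInput` parses it. The `url` field routes
# `body` to the matching request model via `check_type_for_url` above; the
# model name and message content are hypothetical.
def _example_batch_input_line():
    line = {
        "custom_id": "request-1",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "my-model",
            "messages": [{"role": "user", "content": "Hello!"}],
        },
    }
    item = BatchRequestInput.model_validate(line)
    assert isinstance(item.body, ChatCompletionRequest)
    return item
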
class BatchResponseData(OpenAIBaseModel):
    # HTTP status code of the response.
    status_code: int = 200

    # A unique identifier for the API request.
    request_id: str

    # The body of the response.
    body: Optional[Union[ChatCompletionResponse, EmbeddingResponse,
                         ScoreResponse, RerankResponse]] = None


class BatchRequestOutput(OpenAIBaseModel):
    """
    The per-line object of the batch output and error files.
    """

    id: str

    # A developer-provided per-request id that will be used to match outputs to
    # inputs.
    custom_id: str

    response: Optional[BatchResponseData]

    # For requests that failed with a non-HTTP error, this will contain more
    # information on the cause of the failure.
    error: Optional[Any]


class TokenizeCompletionRequest(OpenAIBaseModel):
    model: Optional[str] = None
    prompt: str

    add_special_tokens: bool = Field(
        default=True,
        description=(
            "If true (the default), special tokens (e.g. BOS) will be added to "
            "the prompt."),
    )
    return_token_strs: Optional[bool] = Field(
        default=False,
        description=("If true, also return the token strings "
                     "corresponding to the token ids."),
    )

+
2024
+ class TokenizeChatRequest(OpenAIBaseModel):
2025
+ model: Optional[str] = None
2026
+ messages: list[ChatCompletionMessageParam]
2027
+
2028
+ add_generation_prompt: bool = Field(
2029
+ default=True,
2030
+ description=
2031
+ ("If true, the generation prompt will be added to the chat template. "
2032
+ "This is a parameter used by chat template in tokenizer config of the "
2033
+ "model."),
2034
+ )
2035
+ return_token_strs: Optional[bool] = Field(
2036
+ default=False,
2037
+ description=("If true, also return the token strings "
2038
+ "corresponding to the token ids."),
2039
+ )
2040
+ continue_final_message: bool = Field(
2041
+ default=False,
2042
+ description=
2043
+ ("If this is set, the chat will be formatted so that the final "
2044
+ "message in the chat is open-ended, without any EOS tokens. The "
2045
+ "model will continue this message rather than starting a new one. "
2046
+ "This allows you to \"prefill\" part of the model's response for it. "
2047
+ "Cannot be used at the same time as `add_generation_prompt`."),
2048
+ )
2049
+ add_special_tokens: bool = Field(
2050
+ default=False,
2051
+ description=(
2052
+ "If true, special tokens (e.g. BOS) will be added to the prompt "
2053
+ "on top of what is added by the chat template. "
2054
+ "For most models, the chat template takes care of adding the "
2055
+ "special tokens so this should be set to false (as is the "
2056
+ "default)."),
2057
+ )
2058
+ chat_template: Optional[str] = Field(
2059
+ default=None,
2060
+ description=(
2061
+ "A Jinja template to use for this conversion. "
2062
+ "As of transformers v4.44, default chat template is no longer "
2063
+ "allowed, so you must provide a chat template if the tokenizer "
2064
+ "does not define one."),
2065
+ )
2066
+ chat_template_kwargs: Optional[dict[str, Any]] = Field(
2067
+ default=None,
2068
+ description=(
2069
+ "Additional keyword args to pass to the template renderer. "
2070
+ "Will be accessible by the chat template."),
2071
+ )
2072
+ mm_processor_kwargs: Optional[dict[str, Any]] = Field(
2073
+ default=None,
2074
+ description=("Additional kwargs to pass to the HF processor."),
2075
+ )
2076
+ tools: Optional[list[ChatCompletionToolsParam]] = Field(
2077
+ default=None,
2078
+ description=("A list of tools the model may call."),
2079
+ )
2080
+
2081
+ @model_validator(mode="before")
2082
+ @classmethod
2083
+ def check_generation_prompt(cls, data):
2084
+ if data.get("continue_final_message") and data.get(
2085
+ "add_generation_prompt"):
2086
+ raise ValueError("Cannot set both `continue_final_message` and "
2087
+ "`add_generation_prompt` to True.")
2088
+ return data
2089
+
2090
+
2091
TokenizeRequest = Union[TokenizeCompletionRequest, TokenizeChatRequest]


class TokenizeResponse(OpenAIBaseModel):
    count: int
    max_model_len: int
    tokens: list[int]
    token_strs: Optional[list[str]] = None


class DetokenizeRequest(OpenAIBaseModel):
    model: Optional[str] = None
    tokens: list[int]


class DetokenizeResponse(OpenAIBaseModel):
    prompt: str


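# Editor's sketch (not part of the package source): a /tokenize call using
# the completion-style request, paired with the response model it returns.
# The token ids and counts shown are placeholders, not real tokenizer output.
def _example_tokenize_roundtrip():
    req = TokenizeCompletionRequest(prompt="Hello world",
                                    return_token_strs=True)
    resp = TokenizeResponse(count=2,
                            max_model_len=4096,
                            tokens=[9906, 1917],
                            token_strs=["Hello", " world"])
    return req, resp
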
class TokenizerInfoResponse(OpenAIBaseModel):
    """
    Response containing tokenizer configuration
    equivalent to tokenizer_config.json
    """

    model_config = ConfigDict(extra="allow")
    tokenizer_class: str


class LoadLoRAAdapterRequest(BaseModel):
    lora_name: str
    lora_path: str


class UnloadLoRAAdapterRequest(BaseModel):
    lora_name: str
    lora_int_id: Optional[int] = Field(default=None)


## Protocols for Audio
AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json",
                                         "vtt"]


class TranscriptionRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/audio/createTranscription

    file: UploadFile
    """
    The audio file object (not file name) to transcribe, in one of these
    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    """

    model: Optional[str] = None
    """ID of the model to use.
    """

    language: Optional[str] = None
    """The language of the input audio.

    Supplying the input language in
    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
    will improve accuracy and latency.
    """

    prompt: str = Field(default="")
    """An optional text to guide the model's style or continue a previous audio
    segment.

    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
    should match the audio language.
    """

    response_format: AudioResponseFormat = Field(default="json")
    """
    The format of the output, in one of these options: `json`, `text`, `srt`,
    `verbose_json`, or `vtt`.
    """

    ## TODO (varun): Support temperature=0, in which case the temperature is
    ## raised automatically until certain thresholds are met.

    timestamp_granularities: list[Literal["word", "segment"]] = Field(
        alias="timestamp_granularities[]", default=[])
    """The timestamp granularities to populate for this transcription.

    `response_format` must be set to `verbose_json` to use timestamp
    granularities. Either or both of these options are supported: `word`, or
    `segment`. Note: There is no additional latency for segment timestamps,
    but generating word timestamps incurs additional latency.
    """

    stream: Optional[bool] = False
    """When set, it will enable output to be streamed in a similar fashion
    as the Chat Completion endpoint.
    """
    # --8<-- [start:transcription-extra-params]
    # Flattened stream option to simplify form data.
    stream_include_usage: Optional[bool] = False
    stream_continuous_usage_stats: Optional[bool] = False

    vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
        default=None,
        description=("Additional request parameters with string or "
                     "numeric values, used by custom extensions."),
    )
    # --8<-- [end:transcription-extra-params]

    to_language: Optional[str] = None
    """The language of the output audio we transcribe to.

    Please note that this is not currently used by supported models, but it
    is a placeholder for future use, matching the translation API.
    """

    # --8<-- [start:transcription-sampling-params]
    temperature: float = Field(default=0.0)
    """The sampling temperature, between 0 and 1.

    Higher values like 0.8 will make the output more random, while lower values
    like 0.2 will make it more focused / deterministic. If set to 0, the model
    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
    to automatically increase the temperature until certain thresholds are hit.
    """

    top_p: Optional[float] = None
    """Enables nucleus (top-p) sampling, where tokens are selected from the
    smallest possible set whose cumulative probability exceeds `p`.
    """

    top_k: Optional[int] = None
    """Limits sampling to the `k` most probable tokens at each step."""

    min_p: Optional[float] = None
    """Filters out tokens with a probability lower than `min_p`, ensuring a
    minimum likelihood threshold during sampling.
    """

    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
    """The seed to use for sampling."""

    frequency_penalty: Optional[float] = 0.0
    """The frequency penalty to use for sampling."""

    repetition_penalty: Optional[float] = None
    """The repetition penalty to use for sampling."""

    presence_penalty: Optional[float] = 0.0
    """The presence penalty to use for sampling."""
    # --8<-- [end:transcription-sampling-params]

    # Default sampling parameters for transcription requests.
    _DEFAULT_SAMPLING_PARAMS: dict = {
        "repetition_penalty": 1.0,
        "temperature": 1.0,
        "top_p": 1.0,
        "top_k": 0,
        "min_p": 0.0,
    }

    def to_sampling_params(
            self,
            default_max_tokens: int,
            default_sampling_params: Optional[dict] = None) -> SamplingParams:

        max_tokens = default_max_tokens

        if default_sampling_params is None:
            default_sampling_params = {}

        # Default parameters
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
        if (top_p := self.top_p) is None:
            top_p = default_sampling_params.get(
                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
        if (top_k := self.top_k) is None:
            top_k = default_sampling_params.get(
                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
        if (min_p := self.min_p) is None:
            min_p = default_sampling_params.get(
                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])

        if (repetition_penalty := self.repetition_penalty) is None:
            repetition_penalty = default_sampling_params.get(
                "repetition_penalty",
                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"])

        return SamplingParams.from_optional(
            temperature=temperature,
            max_tokens=max_tokens,
            seed=self.seed,
            top_p=top_p,
            top_k=top_k,
            min_p=min_p,
            frequency_penalty=self.frequency_penalty,
            repetition_penalty=repetition_penalty,
            presence_penalty=self.presence_penalty,
            output_kind=RequestOutputKind.DELTA
            if self.stream else RequestOutputKind.FINAL_ONLY,
            extra_args=self.vllm_xargs)

    @model_validator(mode="before")
    @classmethod
    def validate_transcription_request(cls, data):
        if isinstance(data.get("file"), str):
            raise HTTPException(
                status_code=HTTPStatus.UNPROCESSABLE_ENTITY,
                detail="Expected 'file' to be a file-like object, not 'str'.",
            )

        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
        stream = data.get("stream", False)
        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
            raise ValueError(
                "Stream options can only be defined when `stream=True`.")

        return data

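# Editor's note (not part of the package source): `to_sampling_params` above
# resolves each sampling knob in three steps: the request value if set, then
# the server-supplied `default_sampling_params`, then the class-level
# `_DEFAULT_SAMPLING_PARAMS` fallback. A sketch of that precedence for
# `top_p` (using `model_construct` to skip the required `file` upload):
def _example_sampling_param_precedence():
    req = TranscriptionRequest.model_construct(top_p=None, temperature=0.2)
    params = req.to_sampling_params(default_max_tokens=128,
                                    default_sampling_params={"top_p": 0.9})
    # The request left top_p unset, so the server default (0.9) wins over
    # the class fallback (1.0); temperature comes from the request itself.
    return params
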

# Transcription response objects
class TranscriptionUsageAudio(OpenAIBaseModel):
    type: Literal["duration"] = "duration"
    seconds: int


class TranscriptionResponse(OpenAIBaseModel):
    text: str
    """The transcribed text."""
    usage: TranscriptionUsageAudio


class TranscriptionWord(OpenAIBaseModel):
    end: float
    """End time of the word in seconds."""

    start: float
    """Start time of the word in seconds."""

    word: str
    """The text content of the word."""


class TranscriptionSegment(OpenAIBaseModel):
    id: int
    """Unique identifier of the segment."""

    avg_logprob: float
    """Average logprob of the segment.

    If the value is lower than -1, consider the logprobs failed.
    """

    compression_ratio: float
    """Compression ratio of the segment.

    If the value is greater than 2.4, consider the compression failed.
    """

    end: float
    """End time of the segment in seconds."""

    no_speech_prob: float
    """Probability of no speech in the segment.

    If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
    this segment silent.
    """

    seek: int
    """Seek offset of the segment."""

    start: float
    """Start time of the segment in seconds."""

    temperature: float
    """Temperature parameter used for generating the segment."""

    text: str
    """Text content of the segment."""

    tokens: list[int]
    """Array of token IDs for the text content."""


class TranscriptionResponseVerbose(OpenAIBaseModel):
    duration: str
    """The duration of the input audio."""

    language: str
    """The language of the input audio."""

    text: str
    """The transcribed text."""

    segments: Optional[list[TranscriptionSegment]] = None
    """Segments of the transcribed text and their corresponding details."""

    words: Optional[list[TranscriptionWord]] = None
    """Extracted words and their corresponding timestamps."""


class TranslationResponseStreamChoice(OpenAIBaseModel):
    delta: DeltaMessage
    finish_reason: Optional[str] = None
    stop_reason: Optional[Union[int, str]] = None


class TranslationStreamResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"trsl-{random_uuid()}")
    object: Literal["translation.chunk"] = "translation.chunk"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[TranslationResponseStreamChoice]
    usage: Optional[UsageInfo] = Field(default=None)


class TranslationRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/audio/createTranslation

    file: UploadFile
    """
    The audio file object (not file name) to translate, in one of these
    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    """

    model: Optional[str] = None
    """ID of the model to use.
    """

    prompt: str = Field(default="")
    """An optional text to guide the model's style or continue a previous audio
    segment.

    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
    should match the audio language.
    """

    response_format: AudioResponseFormat = Field(default="json")
    """
    The format of the output, in one of these options: `json`, `text`, `srt`,
    `verbose_json`, or `vtt`.
    """

    # TODO support additional sampling parameters
    # --8<-- [start:translation-sampling-params]
    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
    """The seed to use for sampling."""

    temperature: float = Field(default=0.0)
    """The sampling temperature, between 0 and 1.

    Higher values like 0.8 will make the output more random, while lower values
    like 0.2 will make it more focused / deterministic. If set to 0, the model
    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
    to automatically increase the temperature until certain thresholds are hit.
    """
    # --8<-- [end:translation-sampling-params]

    # --8<-- [start:translation-extra-params]
    language: Optional[str] = None
    """The language of the input audio we translate from.

    Supplying the input language in
    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
    will improve accuracy.
    """

    to_language: Optional[str] = None
    """The language we translate the input audio into.

    Please note that this is not supported by all models; refer to the specific
    model documentation for more details.
    For instance, Whisper only supports `to_language=en`.
    """

    stream: Optional[bool] = False
    """Custom field not present in the original OpenAI definition. When set,
    it will enable output to be streamed in a similar fashion as the Chat
    Completion endpoint.
    """
    # Flattened stream option to simplify form data.
    stream_include_usage: Optional[bool] = False
    stream_continuous_usage_stats: Optional[bool] = False
    # --8<-- [end:translation-extra-params]

    # Default sampling parameters for translation requests.
    _DEFAULT_SAMPLING_PARAMS: dict = {
        "temperature": 0,
    }

    def to_sampling_params(
            self,
            default_max_tokens: int,
            default_sampling_params: Optional[dict] = None) -> SamplingParams:

        max_tokens = default_max_tokens

        if default_sampling_params is None:
            default_sampling_params = {}
        # Default parameters
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])

        return SamplingParams.from_optional(
            temperature=temperature,
            max_tokens=max_tokens,
            seed=self.seed,
            output_kind=RequestOutputKind.DELTA
            if self.stream else RequestOutputKind.FINAL_ONLY)

    @model_validator(mode="before")
    @classmethod
    def validate_stream_options(cls, data):
        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
        stream = data.get("stream", False)
        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
            raise ValueError(
                "Stream options can only be defined when `stream=True`.")

        return data



# Translation response objects
class TranslationResponse(OpenAIBaseModel):
    text: str
    """The translated text."""


class TranslationWord(OpenAIBaseModel):
    end: float
    """End time of the word in seconds."""

    start: float
    """Start time of the word in seconds."""

    word: str
    """The text content of the word."""


class TranslationSegment(OpenAIBaseModel):
    id: int
    """Unique identifier of the segment."""

    avg_logprob: float
    """Average logprob of the segment.

    If the value is lower than -1, consider the logprobs failed.
    """

    compression_ratio: float
    """Compression ratio of the segment.

    If the value is greater than 2.4, consider the compression failed.
    """

    end: float
    """End time of the segment in seconds."""

    no_speech_prob: float
    """Probability of no speech in the segment.

    If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
    this segment silent.
    """

    seek: int
    """Seek offset of the segment."""

    start: float
    """Start time of the segment in seconds."""

    temperature: float
    """Temperature parameter used for generating the segment."""

    text: str
    """Text content of the segment."""

    tokens: list[int]
    """Array of token IDs for the text content."""


class TranslationResponseVerbose(OpenAIBaseModel):
    duration: str
    """The duration of the input audio."""

    language: str
    """The language of the input audio."""

    text: str
    """The translated text."""

    segments: Optional[list[TranslationSegment]] = None
    """Segments of the translated text and their corresponding details."""

    words: Optional[list[TranslationWord]] = None
    """Extracted words and their corresponding timestamps."""