vllm_cpu_amxbf16-0.11.2.post2-cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
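Because a wheel is a plain zip archive, a file-level comparison like the listing below can be reproduced locally. The following is a minimal sketch, not part of any registry tooling: it assumes two wheel files have already been downloaded (e.g. via `pip download`), and the helper name `wheel_files` is illustrative.

    # Minimal sketch: compare the file manifests of two locally downloaded
    # wheels. A .whl file is a zip archive, so zipfile can list its contents.
    import sys
    import zipfile

    def wheel_files(path: str) -> set[str]:
        """Return the set of file paths packed inside a wheel archive."""
        with zipfile.ZipFile(path) as wheel:
            return set(wheel.namelist())

    if __name__ == "__main__":
        # Usage: python wheel_diff.py old.whl new.whl
        old, new = wheel_files(sys.argv[1]), wheel_files(sys.argv[2])
        print(f"Files changed ({len(old ^ new)})")
        for name in sorted(new - old):
            print(f"  added:   {name}")
        for name in sorted(old - new):
            print(f"  removed: {name}")

Note that every entry below shows -0 deletions, consistent with a comparison against an empty baseline (a first release of this package name); a set comparison like the one above would report all files as added.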
Files changed (1536)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +225 -0
  3. vllm/_aiter_ops.py +983 -0
  4. vllm/_bc_linter.py +54 -0
  5. vllm/_custom_ops.py +2863 -0
  6. vllm/_ipex_ops.py +457 -0
  7. vllm/_version.py +34 -0
  8. vllm/assets/__init__.py +0 -0
  9. vllm/assets/audio.py +43 -0
  10. vllm/assets/base.py +40 -0
  11. vllm/assets/image.py +59 -0
  12. vllm/assets/video.py +149 -0
  13. vllm/attention/__init__.py +18 -0
  14. vllm/attention/backends/__init__.py +0 -0
  15. vllm/attention/backends/abstract.py +391 -0
  16. vllm/attention/backends/registry.py +195 -0
  17. vllm/attention/backends/utils.py +33 -0
  18. vllm/attention/layer.py +1052 -0
  19. vllm/attention/layers/__init__.py +0 -0
  20. vllm/attention/layers/chunked_local_attention.py +121 -0
  21. vllm/attention/layers/cross_attention.py +178 -0
  22. vllm/attention/layers/encoder_only_attention.py +103 -0
  23. vllm/attention/ops/__init__.py +0 -0
  24. vllm/attention/ops/chunked_prefill_paged_decode.py +401 -0
  25. vllm/attention/ops/common.py +414 -0
  26. vllm/attention/ops/flashmla.py +251 -0
  27. vllm/attention/ops/merge_attn_states.py +47 -0
  28. vllm/attention/ops/paged_attn.py +262 -0
  29. vllm/attention/ops/pallas_kv_cache_update.py +130 -0
  30. vllm/attention/ops/prefix_prefill.py +814 -0
  31. vllm/attention/ops/rocm_aiter_paged_attn.py +123 -0
  32. vllm/attention/ops/triton_decode_attention.py +712 -0
  33. vllm/attention/ops/triton_merge_attn_states.py +105 -0
  34. vllm/attention/ops/triton_reshape_and_cache_flash.py +184 -0
  35. vllm/attention/ops/triton_unified_attention.py +941 -0
  36. vllm/attention/ops/vit_attn_wrappers.py +178 -0
  37. vllm/attention/selector.py +231 -0
  38. vllm/attention/utils/__init__.py +0 -0
  39. vllm/attention/utils/fa_utils.py +109 -0
  40. vllm/attention/utils/kv_sharing_utils.py +33 -0
  41. vllm/attention/utils/kv_transfer_utils.py +60 -0
  42. vllm/beam_search.py +88 -0
  43. vllm/benchmarks/__init__.py +0 -0
  44. vllm/benchmarks/datasets.py +3222 -0
  45. vllm/benchmarks/latency.py +172 -0
  46. vllm/benchmarks/lib/__init__.py +3 -0
  47. vllm/benchmarks/lib/endpoint_request_func.py +777 -0
  48. vllm/benchmarks/lib/ready_checker.py +72 -0
  49. vllm/benchmarks/lib/utils.py +79 -0
  50. vllm/benchmarks/serve.py +1531 -0
  51. vllm/benchmarks/sweep/__init__.py +0 -0
  52. vllm/benchmarks/sweep/cli.py +38 -0
  53. vllm/benchmarks/sweep/param_sweep.py +91 -0
  54. vllm/benchmarks/sweep/plot.py +580 -0
  55. vllm/benchmarks/sweep/serve.py +416 -0
  56. vllm/benchmarks/sweep/serve_sla.py +492 -0
  57. vllm/benchmarks/sweep/server.py +114 -0
  58. vllm/benchmarks/sweep/sla_sweep.py +132 -0
  59. vllm/benchmarks/sweep/utils.py +4 -0
  60. vllm/benchmarks/throughput.py +799 -0
  61. vllm/collect_env.py +857 -0
  62. vllm/compilation/__init__.py +0 -0
  63. vllm/compilation/activation_quant_fusion.py +209 -0
  64. vllm/compilation/backends.py +759 -0
  65. vllm/compilation/base_static_graph.py +57 -0
  66. vllm/compilation/caching.py +178 -0
  67. vllm/compilation/collective_fusion.py +1234 -0
  68. vllm/compilation/compiler_interface.py +639 -0
  69. vllm/compilation/counter.py +48 -0
  70. vllm/compilation/cuda_graph.py +208 -0
  71. vllm/compilation/decorators.py +571 -0
  72. vllm/compilation/fix_functionalization.py +253 -0
  73. vllm/compilation/fusion.py +374 -0
  74. vllm/compilation/fusion_attn.py +359 -0
  75. vllm/compilation/fx_utils.py +91 -0
  76. vllm/compilation/inductor_pass.py +133 -0
  77. vllm/compilation/matcher_utils.py +317 -0
  78. vllm/compilation/monitor.py +62 -0
  79. vllm/compilation/noop_elimination.py +134 -0
  80. vllm/compilation/partition_rules.py +72 -0
  81. vllm/compilation/pass_manager.py +135 -0
  82. vllm/compilation/piecewise_backend.py +121 -0
  83. vllm/compilation/post_cleanup.py +21 -0
  84. vllm/compilation/qk_norm_rope_fusion.py +238 -0
  85. vllm/compilation/sequence_parallelism.py +363 -0
  86. vllm/compilation/torch25_custom_graph_pass.py +44 -0
  87. vllm/compilation/vllm_inductor_pass.py +173 -0
  88. vllm/compilation/wrapper.py +238 -0
  89. vllm/config/__init__.py +102 -0
  90. vllm/config/cache.py +207 -0
  91. vllm/config/compilation.py +975 -0
  92. vllm/config/device.py +75 -0
  93. vllm/config/ec_transfer.py +110 -0
  94. vllm/config/kv_events.py +56 -0
  95. vllm/config/kv_transfer.py +114 -0
  96. vllm/config/load.py +124 -0
  97. vllm/config/lora.py +112 -0
  98. vllm/config/model.py +2162 -0
  99. vllm/config/multimodal.py +248 -0
  100. vllm/config/observability.py +123 -0
  101. vllm/config/parallel.py +655 -0
  102. vllm/config/pooler.py +122 -0
  103. vllm/config/scheduler.py +298 -0
  104. vllm/config/speculative.py +654 -0
  105. vllm/config/speech_to_text.py +38 -0
  106. vllm/config/structured_outputs.py +92 -0
  107. vllm/config/utils.py +178 -0
  108. vllm/config/vllm.py +1166 -0
  109. vllm/connections.py +189 -0
  110. vllm/device_allocator/__init__.py +0 -0
  111. vllm/device_allocator/cumem.py +327 -0
  112. vllm/distributed/__init__.py +6 -0
  113. vllm/distributed/communication_op.py +43 -0
  114. vllm/distributed/device_communicators/__init__.py +0 -0
  115. vllm/distributed/device_communicators/all2all.py +490 -0
  116. vllm/distributed/device_communicators/all_reduce_utils.py +344 -0
  117. vllm/distributed/device_communicators/base_device_communicator.py +297 -0
  118. vllm/distributed/device_communicators/cpu_communicator.py +209 -0
  119. vllm/distributed/device_communicators/cuda_communicator.py +340 -0
  120. vllm/distributed/device_communicators/cuda_wrapper.py +216 -0
  121. vllm/distributed/device_communicators/custom_all_reduce.py +326 -0
  122. vllm/distributed/device_communicators/mnnvl_compat.py +27 -0
  123. vllm/distributed/device_communicators/pynccl.py +386 -0
  124. vllm/distributed/device_communicators/pynccl_allocator.py +191 -0
  125. vllm/distributed/device_communicators/pynccl_wrapper.py +564 -0
  126. vllm/distributed/device_communicators/quick_all_reduce.py +290 -0
  127. vllm/distributed/device_communicators/ray_communicator.py +259 -0
  128. vllm/distributed/device_communicators/shm_broadcast.py +733 -0
  129. vllm/distributed/device_communicators/shm_object_storage.py +660 -0
  130. vllm/distributed/device_communicators/symm_mem.py +156 -0
  131. vllm/distributed/device_communicators/tpu_communicator.py +107 -0
  132. vllm/distributed/device_communicators/xpu_communicator.py +95 -0
  133. vllm/distributed/ec_transfer/__init__.py +14 -0
  134. vllm/distributed/ec_transfer/ec_connector/__init__.py +0 -0
  135. vllm/distributed/ec_transfer/ec_connector/base.py +247 -0
  136. vllm/distributed/ec_transfer/ec_connector/factory.py +88 -0
  137. vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py +201 -0
  138. vllm/distributed/ec_transfer/ec_transfer_state.py +42 -0
  139. vllm/distributed/eplb/__init__.py +8 -0
  140. vllm/distributed/eplb/eplb_state.py +837 -0
  141. vllm/distributed/eplb/rebalance_algo.py +260 -0
  142. vllm/distributed/eplb/rebalance_execute.py +431 -0
  143. vllm/distributed/kv_events.py +371 -0
  144. vllm/distributed/kv_transfer/README.md +29 -0
  145. vllm/distributed/kv_transfer/__init__.py +20 -0
  146. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  147. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  148. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  149. vllm/distributed/kv_transfer/kv_connector/factory.py +192 -0
  150. vllm/distributed/kv_transfer/kv_connector/utils.py +268 -0
  151. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +19 -0
  152. vllm/distributed/kv_transfer/kv_connector/v1/base.py +546 -0
  153. vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py +419 -0
  154. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +216 -0
  155. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py +18 -0
  156. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +379 -0
  157. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py +221 -0
  158. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +1411 -0
  159. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +867 -0
  160. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +189 -0
  161. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +454 -0
  162. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +2440 -0
  163. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +504 -0
  164. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  165. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +531 -0
  166. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +632 -0
  167. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +273 -0
  168. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +450 -0
  169. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  170. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +179 -0
  171. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +164 -0
  172. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +242 -0
  173. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  174. vllm/distributed/kv_transfer/kv_pipe/base.py +66 -0
  175. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +295 -0
  176. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +285 -0
  177. vllm/distributed/kv_transfer/kv_transfer_state.py +78 -0
  178. vllm/distributed/parallel_state.py +1759 -0
  179. vllm/distributed/tpu_distributed_utils.py +188 -0
  180. vllm/distributed/utils.py +543 -0
  181. vllm/engine/__init__.py +0 -0
  182. vllm/engine/arg_utils.py +2144 -0
  183. vllm/engine/async_llm_engine.py +6 -0
  184. vllm/engine/llm_engine.py +6 -0
  185. vllm/engine/protocol.py +170 -0
  186. vllm/entrypoints/__init__.py +0 -0
  187. vllm/entrypoints/anthropic/__init__.py +0 -0
  188. vllm/entrypoints/anthropic/protocol.py +162 -0
  189. vllm/entrypoints/anthropic/serving_messages.py +460 -0
  190. vllm/entrypoints/api_server.py +184 -0
  191. vllm/entrypoints/chat_utils.py +1690 -0
  192. vllm/entrypoints/cli/__init__.py +13 -0
  193. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  194. vllm/entrypoints/cli/benchmark/base.py +25 -0
  195. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  196. vllm/entrypoints/cli/benchmark/main.py +56 -0
  197. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  198. vllm/entrypoints/cli/benchmark/sweep.py +21 -0
  199. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  200. vllm/entrypoints/cli/collect_env.py +38 -0
  201. vllm/entrypoints/cli/main.py +79 -0
  202. vllm/entrypoints/cli/openai.py +256 -0
  203. vllm/entrypoints/cli/run_batch.py +68 -0
  204. vllm/entrypoints/cli/serve.py +249 -0
  205. vllm/entrypoints/cli/types.py +29 -0
  206. vllm/entrypoints/constants.py +10 -0
  207. vllm/entrypoints/context.py +572 -0
  208. vllm/entrypoints/dynamic_lora.py +57 -0
  209. vllm/entrypoints/harmony_utils.py +535 -0
  210. vllm/entrypoints/launcher.py +175 -0
  211. vllm/entrypoints/llm.py +1768 -0
  212. vllm/entrypoints/logger.py +84 -0
  213. vllm/entrypoints/openai/__init__.py +0 -0
  214. vllm/entrypoints/openai/api_server.py +2096 -0
  215. vllm/entrypoints/openai/cli_args.py +302 -0
  216. vllm/entrypoints/openai/orca_metrics.py +120 -0
  217. vllm/entrypoints/openai/protocol.py +3299 -0
  218. vllm/entrypoints/openai/run_batch.py +547 -0
  219. vllm/entrypoints/openai/serving_chat.py +1772 -0
  220. vllm/entrypoints/openai/serving_classification.py +235 -0
  221. vllm/entrypoints/openai/serving_completion.py +715 -0
  222. vllm/entrypoints/openai/serving_embedding.py +695 -0
  223. vllm/entrypoints/openai/serving_engine.py +1433 -0
  224. vllm/entrypoints/openai/serving_models.py +304 -0
  225. vllm/entrypoints/openai/serving_pooling.py +346 -0
  226. vllm/entrypoints/openai/serving_responses.py +2021 -0
  227. vllm/entrypoints/openai/serving_score.py +503 -0
  228. vllm/entrypoints/openai/serving_tokenization.py +203 -0
  229. vllm/entrypoints/openai/serving_tokens.py +269 -0
  230. vllm/entrypoints/openai/serving_transcription.py +148 -0
  231. vllm/entrypoints/openai/speech_to_text.py +405 -0
  232. vllm/entrypoints/openai/tool_parsers/__init__.py +142 -0
  233. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +273 -0
  234. vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py +390 -0
  235. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +390 -0
  236. vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py +210 -0
  237. vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +200 -0
  238. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +273 -0
  239. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +253 -0
  240. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +494 -0
  241. vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py +420 -0
  242. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +227 -0
  243. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +323 -0
  244. vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +590 -0
  245. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +341 -0
  246. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +290 -0
  247. vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +37 -0
  248. vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py +643 -0
  249. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +849 -0
  250. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +390 -0
  251. vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py +366 -0
  252. vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +97 -0
  253. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +120 -0
  254. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +332 -0
  255. vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +781 -0
  256. vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py +1316 -0
  257. vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +744 -0
  258. vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py +303 -0
  259. vllm/entrypoints/openai/tool_parsers/utils.py +229 -0
  260. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +556 -0
  261. vllm/entrypoints/renderer.py +409 -0
  262. vllm/entrypoints/responses_utils.py +77 -0
  263. vllm/entrypoints/sagemaker/__init__.py +4 -0
  264. vllm/entrypoints/sagemaker/routes.py +72 -0
  265. vllm/entrypoints/score_utils.py +242 -0
  266. vllm/entrypoints/ssl.py +78 -0
  267. vllm/entrypoints/tool.py +143 -0
  268. vllm/entrypoints/tool_server.py +209 -0
  269. vllm/entrypoints/utils.py +319 -0
  270. vllm/env_override.py +378 -0
  271. vllm/envs.py +1659 -0
  272. vllm/forward_context.py +356 -0
  273. vllm/inputs/__init__.py +44 -0
  274. vllm/inputs/data.py +359 -0
  275. vllm/inputs/parse.py +137 -0
  276. vllm/inputs/preprocess.py +727 -0
  277. vllm/logger.py +267 -0
  278. vllm/logging_utils/__init__.py +10 -0
  279. vllm/logging_utils/dump_input.py +83 -0
  280. vllm/logging_utils/formatter.py +77 -0
  281. vllm/logging_utils/log_time.py +34 -0
  282. vllm/logits_process.py +121 -0
  283. vllm/logprobs.py +208 -0
  284. vllm/lora/__init__.py +0 -0
  285. vllm/lora/layers/__init__.py +41 -0
  286. vllm/lora/layers/base.py +67 -0
  287. vllm/lora/layers/base_linear.py +164 -0
  288. vllm/lora/layers/column_parallel_linear.py +578 -0
  289. vllm/lora/layers/fused_moe.py +472 -0
  290. vllm/lora/layers/logits_processor.py +252 -0
  291. vllm/lora/layers/replicated_linear.py +70 -0
  292. vllm/lora/layers/row_parallel_linear.py +181 -0
  293. vllm/lora/layers/utils.py +65 -0
  294. vllm/lora/layers/vocal_parallel_embedding.py +166 -0
  295. vllm/lora/lora_weights.py +198 -0
  296. vllm/lora/models.py +890 -0
  297. vllm/lora/ops/__init__.py +0 -0
  298. vllm/lora/ops/ipex_ops/__init__.py +6 -0
  299. vllm/lora/ops/ipex_ops/lora_ops.py +57 -0
  300. vllm/lora/ops/torch_ops/__init__.py +20 -0
  301. vllm/lora/ops/torch_ops/lora_ops.py +128 -0
  302. vllm/lora/ops/triton_ops/README_TUNING.md +60 -0
  303. vllm/lora/ops/triton_ops/__init__.py +21 -0
  304. vllm/lora/ops/triton_ops/fused_moe_lora_op.py +641 -0
  305. vllm/lora/ops/triton_ops/kernel_utils.py +340 -0
  306. vllm/lora/ops/triton_ops/lora_expand_op.py +310 -0
  307. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +154 -0
  308. vllm/lora/ops/triton_ops/lora_shrink_op.py +287 -0
  309. vllm/lora/ops/triton_ops/utils.py +295 -0
  310. vllm/lora/ops/xla_ops/__init__.py +6 -0
  311. vllm/lora/ops/xla_ops/lora_ops.py +141 -0
  312. vllm/lora/peft_helper.py +128 -0
  313. vllm/lora/punica_wrapper/__init__.py +10 -0
  314. vllm/lora/punica_wrapper/punica_base.py +492 -0
  315. vllm/lora/punica_wrapper/punica_cpu.py +351 -0
  316. vllm/lora/punica_wrapper/punica_gpu.py +411 -0
  317. vllm/lora/punica_wrapper/punica_selector.py +21 -0
  318. vllm/lora/punica_wrapper/punica_tpu.py +359 -0
  319. vllm/lora/punica_wrapper/punica_xpu.py +279 -0
  320. vllm/lora/punica_wrapper/utils.py +150 -0
  321. vllm/lora/request.py +100 -0
  322. vllm/lora/resolver.py +88 -0
  323. vllm/lora/utils.py +293 -0
  324. vllm/lora/worker_manager.py +279 -0
  325. vllm/model_executor/__init__.py +11 -0
  326. vllm/model_executor/custom_op.py +194 -0
  327. vllm/model_executor/layers/__init__.py +0 -0
  328. vllm/model_executor/layers/activation.py +569 -0
  329. vllm/model_executor/layers/attention_layer_base.py +35 -0
  330. vllm/model_executor/layers/batch_invariant.py +854 -0
  331. vllm/model_executor/layers/conv.py +236 -0
  332. vllm/model_executor/layers/fla/__init__.py +8 -0
  333. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  334. vllm/model_executor/layers/fla/ops/chunk.py +240 -0
  335. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +344 -0
  336. vllm/model_executor/layers/fla/ops/chunk_o.py +183 -0
  337. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +154 -0
  338. vllm/model_executor/layers/fla/ops/cumsum.py +280 -0
  339. vllm/model_executor/layers/fla/ops/fused_recurrent.py +390 -0
  340. vllm/model_executor/layers/fla/ops/index.py +41 -0
  341. vllm/model_executor/layers/fla/ops/kda.py +1351 -0
  342. vllm/model_executor/layers/fla/ops/l2norm.py +146 -0
  343. vllm/model_executor/layers/fla/ops/layernorm_guard.py +396 -0
  344. vllm/model_executor/layers/fla/ops/op.py +60 -0
  345. vllm/model_executor/layers/fla/ops/solve_tril.py +556 -0
  346. vllm/model_executor/layers/fla/ops/utils.py +194 -0
  347. vllm/model_executor/layers/fla/ops/wy_fast.py +158 -0
  348. vllm/model_executor/layers/fused_moe/__init__.py +106 -0
  349. vllm/model_executor/layers/fused_moe/all2all_utils.py +160 -0
  350. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +406 -0
  351. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +180 -0
  352. vllm/model_executor/layers/fused_moe/config.py +916 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json +146 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json +147 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json +213 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json +147 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json +201 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json +164 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json +147 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json +147 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  559. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  560. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  561. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  562. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  563. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  564. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  565. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  566. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  567. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  568. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  569. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  570. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  571. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  572. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  573. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  574. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  575. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  576. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  577. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  578. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  579. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  580. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  581. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  582. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  583. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  584. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  585. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  586. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  587. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  588. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  589. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  590. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  591. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  592. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  593. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  594. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  595. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  596. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  597. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  598. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  599. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  600. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  601. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  602. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  603. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  604. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  605. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  606. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  607. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  608. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  609. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  610. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  611. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  612. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  613. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  614. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  615. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  616. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  617. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  618. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  619. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  620. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  621. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  622. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  623. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  624. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  625. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +354 -0
  626. vllm/model_executor/layers/fused_moe/cutlass_moe.py +1052 -0
  627. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +387 -0
  628. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +416 -0
  629. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +420 -0
  630. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +367 -0
  631. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +307 -0
  632. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +362 -0
  633. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +192 -0
  634. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1012 -0
  635. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +792 -0
  636. vllm/model_executor/layers/fused_moe/fused_moe.py +2175 -0
  637. vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +112 -0
  638. vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +164 -0
  639. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +316 -0
  640. vllm/model_executor/layers/fused_moe/layer.py +1944 -0
  641. vllm/model_executor/layers/fused_moe/modular_kernel.py +1222 -0
  642. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +174 -0
  643. vllm/model_executor/layers/fused_moe/moe_pallas.py +83 -0
  644. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +229 -0
  645. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  646. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +362 -0
  647. vllm/model_executor/layers/fused_moe/prepare_finalize.py +77 -0
  648. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +265 -0
  649. vllm/model_executor/layers/fused_moe/routing_simulator.py +310 -0
  650. vllm/model_executor/layers/fused_moe/shared_fused_moe.py +97 -0
  651. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +171 -0
  652. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +163 -0
  653. vllm/model_executor/layers/fused_moe/trtllm_moe.py +143 -0
  654. vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +578 -0
  655. vllm/model_executor/layers/fused_moe/utils.py +332 -0
  656. vllm/model_executor/layers/kda.py +448 -0
  657. vllm/model_executor/layers/layernorm.py +442 -0
  658. vllm/model_executor/layers/lightning_attn.py +729 -0
  659. vllm/model_executor/layers/linear.py +1424 -0
  660. vllm/model_executor/layers/logits_processor.py +106 -0
  661. vllm/model_executor/layers/mamba/__init__.py +0 -0
  662. vllm/model_executor/layers/mamba/abstract.py +71 -0
  663. vllm/model_executor/layers/mamba/linear_attn.py +402 -0
  664. vllm/model_executor/layers/mamba/mamba_mixer.py +535 -0
  665. vllm/model_executor/layers/mamba/mamba_mixer2.py +928 -0
  666. vllm/model_executor/layers/mamba/mamba_utils.py +225 -0
  667. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  668. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1240 -0
  669. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +172 -0
  670. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +478 -0
  671. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +211 -0
  672. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +456 -0
  673. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +700 -0
  674. vllm/model_executor/layers/mamba/ops/ssd_combined.py +230 -0
  675. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +157 -0
  676. vllm/model_executor/layers/mamba/short_conv.py +264 -0
  677. vllm/model_executor/layers/mla.py +168 -0
  678. vllm/model_executor/layers/pooler.py +817 -0
  679. vllm/model_executor/layers/quantization/__init__.py +174 -0
  680. vllm/model_executor/layers/quantization/auto_round.py +454 -0
  681. vllm/model_executor/layers/quantization/awq.py +277 -0
  682. vllm/model_executor/layers/quantization/awq_marlin.py +659 -0
  683. vllm/model_executor/layers/quantization/awq_triton.py +337 -0
  684. vllm/model_executor/layers/quantization/base_config.py +170 -0
  685. vllm/model_executor/layers/quantization/bitblas.py +502 -0
  686. vllm/model_executor/layers/quantization/bitsandbytes.py +658 -0
  687. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +3 -0
  688. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +914 -0
  689. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2284 -0
  690. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +35 -0
  691. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +392 -0
  692. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  693. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +176 -0
  694. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +124 -0
  695. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +218 -0
  696. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +183 -0
  697. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +153 -0
  698. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +138 -0
  699. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +200 -0
  700. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +125 -0
  701. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +219 -0
  702. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  703. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +260 -0
  704. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +173 -0
  705. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  706. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +64 -0
  707. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  708. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +224 -0
  709. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  710. vllm/model_executor/layers/quantization/deepspeedfp.py +218 -0
  711. vllm/model_executor/layers/quantization/experts_int8.py +240 -0
  712. vllm/model_executor/layers/quantization/fbgemm_fp8.py +195 -0
  713. vllm/model_executor/layers/quantization/fp8.py +1333 -0
  714. vllm/model_executor/layers/quantization/fp_quant.py +420 -0
  715. vllm/model_executor/layers/quantization/gguf.py +643 -0
  716. vllm/model_executor/layers/quantization/gptq.py +393 -0
  717. vllm/model_executor/layers/quantization/gptq_bitblas.py +482 -0
  718. vllm/model_executor/layers/quantization/gptq_marlin.py +789 -0
  719. vllm/model_executor/layers/quantization/gptq_marlin_24.py +320 -0
  720. vllm/model_executor/layers/quantization/hqq_marlin.py +371 -0
  721. vllm/model_executor/layers/quantization/inc.py +65 -0
  722. vllm/model_executor/layers/quantization/input_quant_fp8.py +171 -0
  723. vllm/model_executor/layers/quantization/ipex_quant.py +467 -0
  724. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  725. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +94 -0
  726. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +105 -0
  727. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  728. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +323 -0
  729. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +98 -0
  730. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +119 -0
  731. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +111 -0
  732. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +161 -0
  733. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +159 -0
  734. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +166 -0
  735. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +73 -0
  736. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +97 -0
  737. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +120 -0
  738. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +219 -0
  739. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +140 -0
  740. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +42 -0
  741. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +105 -0
  742. vllm/model_executor/layers/quantization/kv_cache.py +146 -0
  743. vllm/model_executor/layers/quantization/modelopt.py +1788 -0
  744. vllm/model_executor/layers/quantization/moe_wna16.py +541 -0
  745. vllm/model_executor/layers/quantization/mxfp4.py +1162 -0
  746. vllm/model_executor/layers/quantization/petit.py +320 -0
  747. vllm/model_executor/layers/quantization/ptpc_fp8.py +137 -0
  748. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  749. vllm/model_executor/layers/quantization/quark/quark.py +528 -0
  750. vllm/model_executor/layers/quantization/quark/quark_moe.py +683 -0
  751. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  752. vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py +306 -0
  753. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  754. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +179 -0
  755. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +139 -0
  756. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  757. vllm/model_executor/layers/quantization/qutlass_utils.py +185 -0
  758. vllm/model_executor/layers/quantization/rtn.py +652 -0
  759. vllm/model_executor/layers/quantization/schema.py +90 -0
  760. vllm/model_executor/layers/quantization/torchao.py +380 -0
  761. vllm/model_executor/layers/quantization/tpu_int8.py +139 -0
  762. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  763. vllm/model_executor/layers/quantization/utils/allspark_utils.py +67 -0
  764. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +229 -0
  765. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  766. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  767. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  768. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  769. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  770. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  771. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  772. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  773. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  774. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  775. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  776. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  777. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  778. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  779. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  780. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  781. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  786. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  902. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  903. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  904. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  905. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  906. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  907. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  908. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  909. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  910. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  911. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  912. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  913. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  914. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  915. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  916. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  917. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  918. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  919. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  920. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  921. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  922. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  923. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  924. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  925. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  926. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  927. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  928. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  929. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  930. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  931. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  932. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  933. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  934. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  935. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  936. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  937. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  938. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  939. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  940. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  941. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  942. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  943. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  944. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  945. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  946. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  947. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  948. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  949. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  950. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  951. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  952. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  953. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  954. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  955. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  956. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  957. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  958. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  959. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  960. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  961. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  962. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  963. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  964. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  965. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  966. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  967. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  968. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  969. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  970. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  971. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  972. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  973. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  974. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  975. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  976. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +89 -0
  977. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +298 -0
  978. vllm/model_executor/layers/quantization/utils/fp8_utils.py +1203 -0
  979. vllm/model_executor/layers/quantization/utils/gptq_utils.py +158 -0
  980. vllm/model_executor/layers/quantization/utils/int8_utils.py +489 -0
  981. vllm/model_executor/layers/quantization/utils/layer_utils.py +41 -0
  982. vllm/model_executor/layers/quantization/utils/machete_utils.py +56 -0
  983. vllm/model_executor/layers/quantization/utils/marlin_utils.py +575 -0
  984. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +397 -0
  985. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +351 -0
  986. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +161 -0
  987. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +467 -0
  988. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +181 -0
  989. vllm/model_executor/layers/quantization/utils/mxfp6_utils.py +142 -0
  990. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +24 -0
  991. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +142 -0
  992. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +63 -0
  993. vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py +51 -0
  994. vllm/model_executor/layers/quantization/utils/petit_utils.py +124 -0
  995. vllm/model_executor/layers/quantization/utils/quant_utils.py +687 -0
  996. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +516 -0
  997. vllm/model_executor/layers/resampler.py +283 -0
  998. vllm/model_executor/layers/rotary_embedding/__init__.py +278 -0
  999. vllm/model_executor/layers/rotary_embedding/base.py +235 -0
  1000. vllm/model_executor/layers/rotary_embedding/common.py +188 -0
  1001. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +165 -0
  1002. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +215 -0
  1003. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +43 -0
  1004. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +68 -0
  1005. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +75 -0
  1006. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  1007. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  1008. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +80 -0
  1009. vllm/model_executor/layers/rotary_embedding/mrope.py +397 -0
  1010. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +47 -0
  1011. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +159 -0
  1012. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +81 -0
  1013. vllm/model_executor/layers/utils.py +251 -0
  1014. vllm/model_executor/layers/vocab_parallel_embedding.py +558 -0
  1015. vllm/model_executor/model_loader/__init__.py +148 -0
  1016. vllm/model_executor/model_loader/base_loader.py +57 -0
  1017. vllm/model_executor/model_loader/bitsandbytes_loader.py +822 -0
  1018. vllm/model_executor/model_loader/default_loader.py +327 -0
  1019. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  1020. vllm/model_executor/model_loader/gguf_loader.py +176 -0
  1021. vllm/model_executor/model_loader/online_quantization.py +224 -0
  1022. vllm/model_executor/model_loader/runai_streamer_loader.py +116 -0
  1023. vllm/model_executor/model_loader/sharded_state_loader.py +206 -0
  1024. vllm/model_executor/model_loader/tensorizer.py +790 -0
  1025. vllm/model_executor/model_loader/tensorizer_loader.py +151 -0
  1026. vllm/model_executor/model_loader/tpu.py +118 -0
  1027. vllm/model_executor/model_loader/utils.py +288 -0
  1028. vllm/model_executor/model_loader/weight_utils.py +1084 -0
  1029. vllm/model_executor/models/__init__.py +44 -0
  1030. vllm/model_executor/models/adapters.py +543 -0
  1031. vllm/model_executor/models/afmoe.py +711 -0
  1032. vllm/model_executor/models/aimv2.py +247 -0
  1033. vllm/model_executor/models/apertus.py +587 -0
  1034. vllm/model_executor/models/arcee.py +439 -0
  1035. vllm/model_executor/models/arctic.py +635 -0
  1036. vllm/model_executor/models/aria.py +655 -0
  1037. vllm/model_executor/models/aya_vision.py +450 -0
  1038. vllm/model_executor/models/baichuan.py +496 -0
  1039. vllm/model_executor/models/bailing_moe.py +646 -0
  1040. vllm/model_executor/models/bamba.py +522 -0
  1041. vllm/model_executor/models/bee.py +157 -0
  1042. vllm/model_executor/models/bert.py +925 -0
  1043. vllm/model_executor/models/bert_with_rope.py +732 -0
  1044. vllm/model_executor/models/blip.py +349 -0
  1045. vllm/model_executor/models/blip2.py +695 -0
  1046. vllm/model_executor/models/bloom.py +390 -0
  1047. vllm/model_executor/models/chameleon.py +1120 -0
  1048. vllm/model_executor/models/chatglm.py +498 -0
  1049. vllm/model_executor/models/clip.py +965 -0
  1050. vllm/model_executor/models/cohere2_vision.py +472 -0
  1051. vllm/model_executor/models/commandr.py +473 -0
  1052. vllm/model_executor/models/config.py +503 -0
  1053. vllm/model_executor/models/dbrx.py +482 -0
  1054. vllm/model_executor/models/deepencoder.py +673 -0
  1055. vllm/model_executor/models/deepseek_eagle.py +260 -0
  1056. vllm/model_executor/models/deepseek_mtp.py +360 -0
  1057. vllm/model_executor/models/deepseek_ocr.py +593 -0
  1058. vllm/model_executor/models/deepseek_v2.py +1649 -0
  1059. vllm/model_executor/models/deepseek_vl2.py +655 -0
  1060. vllm/model_executor/models/dots1.py +574 -0
  1061. vllm/model_executor/models/dots_ocr.py +900 -0
  1062. vllm/model_executor/models/ernie45.py +53 -0
  1063. vllm/model_executor/models/ernie45_moe.py +759 -0
  1064. vllm/model_executor/models/ernie45_vl.py +1742 -0
  1065. vllm/model_executor/models/ernie45_vl_moe.py +803 -0
  1066. vllm/model_executor/models/ernie_mtp.py +279 -0
  1067. vllm/model_executor/models/exaone.py +545 -0
  1068. vllm/model_executor/models/exaone4.py +531 -0
  1069. vllm/model_executor/models/fairseq2_llama.py +154 -0
  1070. vllm/model_executor/models/falcon.py +545 -0
  1071. vllm/model_executor/models/falcon_h1.py +685 -0
  1072. vllm/model_executor/models/flex_olmo.py +155 -0
  1073. vllm/model_executor/models/fuyu.py +373 -0
  1074. vllm/model_executor/models/gemma.py +426 -0
  1075. vllm/model_executor/models/gemma2.py +439 -0
  1076. vllm/model_executor/models/gemma3.py +571 -0
  1077. vllm/model_executor/models/gemma3_mm.py +741 -0
  1078. vllm/model_executor/models/gemma3n.py +1165 -0
  1079. vllm/model_executor/models/gemma3n_mm.py +811 -0
  1080. vllm/model_executor/models/glm.py +23 -0
  1081. vllm/model_executor/models/glm4.py +305 -0
  1082. vllm/model_executor/models/glm4_1v.py +1821 -0
  1083. vllm/model_executor/models/glm4_moe.py +747 -0
  1084. vllm/model_executor/models/glm4_moe_mtp.py +359 -0
  1085. vllm/model_executor/models/glm4v.py +784 -0
  1086. vllm/model_executor/models/gpt2.py +397 -0
  1087. vllm/model_executor/models/gpt_bigcode.py +339 -0
  1088. vllm/model_executor/models/gpt_j.py +346 -0
  1089. vllm/model_executor/models/gpt_neox.py +344 -0
  1090. vllm/model_executor/models/gpt_oss.py +738 -0
  1091. vllm/model_executor/models/granite.py +516 -0
  1092. vllm/model_executor/models/granite_speech.py +913 -0
  1093. vllm/model_executor/models/granitemoe.py +569 -0
  1094. vllm/model_executor/models/granitemoehybrid.py +709 -0
  1095. vllm/model_executor/models/granitemoeshared.py +333 -0
  1096. vllm/model_executor/models/gritlm.py +245 -0
  1097. vllm/model_executor/models/grok1.py +558 -0
  1098. vllm/model_executor/models/h2ovl.py +554 -0
  1099. vllm/model_executor/models/hunyuan_v1.py +1053 -0
  1100. vllm/model_executor/models/hyperclovax_vision.py +1166 -0
  1101. vllm/model_executor/models/idefics2_vision_model.py +426 -0
  1102. vllm/model_executor/models/idefics3.py +717 -0
  1103. vllm/model_executor/models/interfaces.py +1092 -0
  1104. vllm/model_executor/models/interfaces_base.py +214 -0
  1105. vllm/model_executor/models/intern_vit.py +453 -0
  1106. vllm/model_executor/models/internlm2.py +460 -0
  1107. vllm/model_executor/models/internlm2_ve.py +142 -0
  1108. vllm/model_executor/models/interns1.py +830 -0
  1109. vllm/model_executor/models/interns1_vit.py +432 -0
  1110. vllm/model_executor/models/internvl.py +1452 -0
  1111. vllm/model_executor/models/jais.py +397 -0
  1112. vllm/model_executor/models/jamba.py +610 -0
  1113. vllm/model_executor/models/jina_vl.py +147 -0
  1114. vllm/model_executor/models/keye.py +1761 -0
  1115. vllm/model_executor/models/keye_vl1_5.py +726 -0
  1116. vllm/model_executor/models/kimi_linear.py +663 -0
  1117. vllm/model_executor/models/kimi_vl.py +578 -0
  1118. vllm/model_executor/models/lfm2.py +532 -0
  1119. vllm/model_executor/models/lfm2_moe.py +762 -0
  1120. vllm/model_executor/models/lightonocr.py +195 -0
  1121. vllm/model_executor/models/llama.py +732 -0
  1122. vllm/model_executor/models/llama4.py +859 -0
  1123. vllm/model_executor/models/llama4_eagle.py +223 -0
  1124. vllm/model_executor/models/llama_eagle.py +218 -0
  1125. vllm/model_executor/models/llama_eagle3.py +367 -0
  1126. vllm/model_executor/models/llava.py +842 -0
  1127. vllm/model_executor/models/llava_next.py +583 -0
  1128. vllm/model_executor/models/llava_next_video.py +467 -0
  1129. vllm/model_executor/models/llava_onevision.py +923 -0
  1130. vllm/model_executor/models/longcat_flash.py +749 -0
  1131. vllm/model_executor/models/longcat_flash_mtp.py +349 -0
  1132. vllm/model_executor/models/mamba.py +276 -0
  1133. vllm/model_executor/models/mamba2.py +289 -0
  1134. vllm/model_executor/models/medusa.py +179 -0
  1135. vllm/model_executor/models/midashenglm.py +827 -0
  1136. vllm/model_executor/models/mimo.py +188 -0
  1137. vllm/model_executor/models/mimo_mtp.py +294 -0
  1138. vllm/model_executor/models/minicpm.py +664 -0
  1139. vllm/model_executor/models/minicpm3.py +242 -0
  1140. vllm/model_executor/models/minicpm_eagle.py +389 -0
  1141. vllm/model_executor/models/minicpmo.py +768 -0
  1142. vllm/model_executor/models/minicpmv.py +1745 -0
  1143. vllm/model_executor/models/minimax_m2.py +552 -0
  1144. vllm/model_executor/models/minimax_text_01.py +1012 -0
  1145. vllm/model_executor/models/minimax_vl_01.py +396 -0
  1146. vllm/model_executor/models/mistral3.py +637 -0
  1147. vllm/model_executor/models/mixtral.py +621 -0
  1148. vllm/model_executor/models/mllama4.py +1147 -0
  1149. vllm/model_executor/models/mlp_speculator.py +235 -0
  1150. vllm/model_executor/models/modernbert.py +450 -0
  1151. vllm/model_executor/models/module_mapping.py +74 -0
  1152. vllm/model_executor/models/molmo.py +1555 -0
  1153. vllm/model_executor/models/moonvit.py +677 -0
  1154. vllm/model_executor/models/mpt.py +335 -0
  1155. vllm/model_executor/models/nano_nemotron_vl.py +1740 -0
  1156. vllm/model_executor/models/nemotron.py +518 -0
  1157. vllm/model_executor/models/nemotron_h.py +852 -0
  1158. vllm/model_executor/models/nemotron_nas.py +491 -0
  1159. vllm/model_executor/models/nemotron_vl.py +653 -0
  1160. vllm/model_executor/models/nvlm_d.py +216 -0
  1161. vllm/model_executor/models/olmo.py +414 -0
  1162. vllm/model_executor/models/olmo2.py +454 -0
  1163. vllm/model_executor/models/olmoe.py +498 -0
  1164. vllm/model_executor/models/openpangu.py +1062 -0
  1165. vllm/model_executor/models/openpangu_mtp.py +265 -0
  1166. vllm/model_executor/models/opt.py +426 -0
  1167. vllm/model_executor/models/orion.py +372 -0
  1168. vllm/model_executor/models/ouro.py +516 -0
  1169. vllm/model_executor/models/ovis.py +559 -0
  1170. vllm/model_executor/models/ovis2_5.py +673 -0
  1171. vllm/model_executor/models/paddleocr_vl.py +1407 -0
  1172. vllm/model_executor/models/paligemma.py +412 -0
  1173. vllm/model_executor/models/persimmon.py +377 -0
  1174. vllm/model_executor/models/phi.py +374 -0
  1175. vllm/model_executor/models/phi3.py +18 -0
  1176. vllm/model_executor/models/phi3v.py +737 -0
  1177. vllm/model_executor/models/phi4_multimodal.py +1447 -0
  1178. vllm/model_executor/models/phi4mm.py +1253 -0
  1179. vllm/model_executor/models/phi4mm_audio.py +1296 -0
  1180. vllm/model_executor/models/phi4mm_utils.py +1907 -0
  1181. vllm/model_executor/models/phimoe.py +675 -0
  1182. vllm/model_executor/models/pixtral.py +1352 -0
  1183. vllm/model_executor/models/plamo2.py +981 -0
  1184. vllm/model_executor/models/qwen.py +368 -0
  1185. vllm/model_executor/models/qwen2.py +541 -0
  1186. vllm/model_executor/models/qwen2_5_omni_thinker.py +1246 -0
  1187. vllm/model_executor/models/qwen2_5_vl.py +1613 -0
  1188. vllm/model_executor/models/qwen2_audio.py +473 -0
  1189. vllm/model_executor/models/qwen2_moe.py +596 -0
  1190. vllm/model_executor/models/qwen2_rm.py +123 -0
  1191. vllm/model_executor/models/qwen2_vl.py +1670 -0
  1192. vllm/model_executor/models/qwen3.py +336 -0
  1193. vllm/model_executor/models/qwen3_moe.py +744 -0
  1194. vllm/model_executor/models/qwen3_next.py +1395 -0
  1195. vllm/model_executor/models/qwen3_next_mtp.py +296 -0
  1196. vllm/model_executor/models/qwen3_omni_moe_thinker.py +1721 -0
  1197. vllm/model_executor/models/qwen3_vl.py +1673 -0
  1198. vllm/model_executor/models/qwen3_vl_moe.py +415 -0
  1199. vllm/model_executor/models/qwen_vl.py +802 -0
  1200. vllm/model_executor/models/radio.py +555 -0
  1201. vllm/model_executor/models/registry.py +1155 -0
  1202. vllm/model_executor/models/roberta.py +259 -0
  1203. vllm/model_executor/models/rvl.py +107 -0
  1204. vllm/model_executor/models/seed_oss.py +497 -0
  1205. vllm/model_executor/models/siglip.py +1174 -0
  1206. vllm/model_executor/models/siglip2navit.py +724 -0
  1207. vllm/model_executor/models/skyworkr1v.py +953 -0
  1208. vllm/model_executor/models/smolvlm.py +38 -0
  1209. vllm/model_executor/models/solar.py +502 -0
  1210. vllm/model_executor/models/stablelm.py +359 -0
  1211. vllm/model_executor/models/starcoder2.py +367 -0
  1212. vllm/model_executor/models/step3_text.py +559 -0
  1213. vllm/model_executor/models/step3_vl.py +1148 -0
  1214. vllm/model_executor/models/swin.py +514 -0
  1215. vllm/model_executor/models/tarsier.py +619 -0
  1216. vllm/model_executor/models/telechat2.py +153 -0
  1217. vllm/model_executor/models/teleflm.py +78 -0
  1218. vllm/model_executor/models/terratorch.py +319 -0
  1219. vllm/model_executor/models/transformers/__init__.py +127 -0
  1220. vllm/model_executor/models/transformers/base.py +464 -0
  1221. vllm/model_executor/models/transformers/causal.py +65 -0
  1222. vllm/model_executor/models/transformers/legacy.py +90 -0
  1223. vllm/model_executor/models/transformers/moe.py +318 -0
  1224. vllm/model_executor/models/transformers/multimodal.py +411 -0
  1225. vllm/model_executor/models/transformers/pooling.py +119 -0
  1226. vllm/model_executor/models/transformers/utils.py +207 -0
  1227. vllm/model_executor/models/ultravox.py +681 -0
  1228. vllm/model_executor/models/utils.py +877 -0
  1229. vllm/model_executor/models/vision.py +552 -0
  1230. vllm/model_executor/models/voxtral.py +845 -0
  1231. vllm/model_executor/models/whisper.py +959 -0
  1232. vllm/model_executor/models/zamba2.py +986 -0
  1233. vllm/model_executor/parameter.py +642 -0
  1234. vllm/model_executor/utils.py +94 -0
  1235. vllm/model_executor/warmup/__init__.py +0 -0
  1236. vllm/model_executor/warmup/deep_gemm_warmup.py +314 -0
  1237. vllm/model_executor/warmup/kernel_warmup.py +98 -0
  1238. vllm/multimodal/__init__.py +40 -0
  1239. vllm/multimodal/audio.py +118 -0
  1240. vllm/multimodal/base.py +26 -0
  1241. vllm/multimodal/cache.py +755 -0
  1242. vllm/multimodal/evs.py +294 -0
  1243. vllm/multimodal/hasher.py +106 -0
  1244. vllm/multimodal/image.py +130 -0
  1245. vllm/multimodal/inputs.py +1036 -0
  1246. vllm/multimodal/parse.py +544 -0
  1247. vllm/multimodal/processing.py +2186 -0
  1248. vllm/multimodal/profiling.py +369 -0
  1249. vllm/multimodal/registry.py +360 -0
  1250. vllm/multimodal/utils.py +512 -0
  1251. vllm/multimodal/video.py +306 -0
  1252. vllm/outputs.py +345 -0
  1253. vllm/platforms/__init__.py +277 -0
  1254. vllm/platforms/cpu.py +414 -0
  1255. vllm/platforms/cuda.py +657 -0
  1256. vllm/platforms/interface.py +639 -0
  1257. vllm/platforms/rocm.py +466 -0
  1258. vllm/platforms/tpu.py +276 -0
  1259. vllm/platforms/xpu.py +274 -0
  1260. vllm/plugins/__init__.py +78 -0
  1261. vllm/plugins/io_processors/__init__.py +68 -0
  1262. vllm/plugins/io_processors/interface.py +77 -0
  1263. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1264. vllm/plugins/lora_resolvers/filesystem_resolver.py +52 -0
  1265. vllm/pooling_params.py +228 -0
  1266. vllm/profiler/__init__.py +0 -0
  1267. vllm/profiler/gpu_profiler.py +37 -0
  1268. vllm/profiler/layerwise_profile.py +392 -0
  1269. vllm/profiler/utils.py +151 -0
  1270. vllm/py.typed +2 -0
  1271. vllm/ray/__init__.py +0 -0
  1272. vllm/ray/lazy_utils.py +26 -0
  1273. vllm/ray/ray_env.py +79 -0
  1274. vllm/reasoning/__init__.py +92 -0
  1275. vllm/reasoning/abs_reasoning_parsers.py +290 -0
  1276. vllm/reasoning/basic_parsers.py +162 -0
  1277. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1278. vllm/reasoning/deepseek_v3_reasoning_parser.py +62 -0
  1279. vllm/reasoning/ernie45_reasoning_parser.py +165 -0
  1280. vllm/reasoning/glm4_moe_reasoning_parser.py +171 -0
  1281. vllm/reasoning/gptoss_reasoning_parser.py +173 -0
  1282. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1283. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +237 -0
  1284. vllm/reasoning/identity_reasoning_parser.py +58 -0
  1285. vllm/reasoning/minimax_m2_reasoning_parser.py +67 -0
  1286. vllm/reasoning/mistral_reasoning_parser.py +55 -0
  1287. vllm/reasoning/olmo3_reasoning_parser.py +302 -0
  1288. vllm/reasoning/qwen3_reasoning_parser.py +67 -0
  1289. vllm/reasoning/seedoss_reasoning_parser.py +27 -0
  1290. vllm/reasoning/step3_reasoning_parser.py +107 -0
  1291. vllm/sampling_params.py +669 -0
  1292. vllm/scalar_type.py +355 -0
  1293. vllm/scripts.py +17 -0
  1294. vllm/sequence.py +98 -0
  1295. vllm/tasks.py +13 -0
  1296. vllm/third_party/__init__.py +0 -0
  1297. vllm/third_party/pynvml.py +6140 -0
  1298. vllm/tracing.py +135 -0
  1299. vllm/transformers_utils/__init__.py +26 -0
  1300. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1301. vllm/transformers_utils/chat_templates/registry.py +73 -0
  1302. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1303. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1304. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1305. vllm/transformers_utils/chat_templates/template_deepseek_ocr.jinja +14 -0
  1306. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1307. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1308. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1309. vllm/transformers_utils/config.py +1203 -0
  1310. vllm/transformers_utils/config_parser_base.py +20 -0
  1311. vllm/transformers_utils/configs/__init__.py +70 -0
  1312. vllm/transformers_utils/configs/afmoe.py +84 -0
  1313. vllm/transformers_utils/configs/arctic.py +206 -0
  1314. vllm/transformers_utils/configs/chatglm.py +75 -0
  1315. vllm/transformers_utils/configs/deepseek_vl2.py +126 -0
  1316. vllm/transformers_utils/configs/dotsocr.py +71 -0
  1317. vllm/transformers_utils/configs/eagle.py +84 -0
  1318. vllm/transformers_utils/configs/falcon.py +89 -0
  1319. vllm/transformers_utils/configs/flex_olmo.py +77 -0
  1320. vllm/transformers_utils/configs/jais.py +243 -0
  1321. vllm/transformers_utils/configs/kimi_linear.py +144 -0
  1322. vllm/transformers_utils/configs/kimi_vl.py +38 -0
  1323. vllm/transformers_utils/configs/lfm2_moe.py +159 -0
  1324. vllm/transformers_utils/configs/medusa.py +65 -0
  1325. vllm/transformers_utils/configs/midashenglm.py +103 -0
  1326. vllm/transformers_utils/configs/mistral.py +174 -0
  1327. vllm/transformers_utils/configs/mlp_speculator.py +69 -0
  1328. vllm/transformers_utils/configs/moonvit.py +33 -0
  1329. vllm/transformers_utils/configs/nemotron.py +212 -0
  1330. vllm/transformers_utils/configs/nemotron_h.py +282 -0
  1331. vllm/transformers_utils/configs/olmo3.py +79 -0
  1332. vllm/transformers_utils/configs/ovis.py +182 -0
  1333. vllm/transformers_utils/configs/qwen3_next.py +274 -0
  1334. vllm/transformers_utils/configs/radio.py +89 -0
  1335. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1336. vllm/transformers_utils/configs/speculators/algos.py +38 -0
  1337. vllm/transformers_utils/configs/speculators/base.py +114 -0
  1338. vllm/transformers_utils/configs/step3_vl.py +174 -0
  1339. vllm/transformers_utils/configs/ultravox.py +118 -0
  1340. vllm/transformers_utils/detokenizer_utils.py +198 -0
  1341. vllm/transformers_utils/dynamic_module.py +59 -0
  1342. vllm/transformers_utils/processor.py +402 -0
  1343. vllm/transformers_utils/processors/__init__.py +15 -0
  1344. vllm/transformers_utils/processors/deepseek_ocr.py +438 -0
  1345. vllm/transformers_utils/processors/deepseek_vl2.py +406 -0
  1346. vllm/transformers_utils/processors/ovis.py +453 -0
  1347. vllm/transformers_utils/processors/ovis2_5.py +468 -0
  1348. vllm/transformers_utils/runai_utils.py +104 -0
  1349. vllm/transformers_utils/s3_utils.py +95 -0
  1350. vllm/transformers_utils/tokenizer.py +293 -0
  1351. vllm/transformers_utils/tokenizer_base.py +155 -0
  1352. vllm/transformers_utils/tokenizers/__init__.py +16 -0
  1353. vllm/transformers_utils/tokenizers/mistral.py +502 -0
  1354. vllm/transformers_utils/utils.py +130 -0
  1355. vllm/triton_utils/__init__.py +19 -0
  1356. vllm/triton_utils/importing.py +103 -0
  1357. vllm/usage/__init__.py +0 -0
  1358. vllm/usage/usage_lib.py +294 -0
  1359. vllm/utils/__init__.py +82 -0
  1360. vllm/utils/argparse_utils.py +487 -0
  1361. vllm/utils/async_utils.py +303 -0
  1362. vllm/utils/cache.py +214 -0
  1363. vllm/utils/collection_utils.py +139 -0
  1364. vllm/utils/counter.py +45 -0
  1365. vllm/utils/deep_gemm.py +391 -0
  1366. vllm/utils/flashinfer.py +490 -0
  1367. vllm/utils/func_utils.py +236 -0
  1368. vllm/utils/gc_utils.py +147 -0
  1369. vllm/utils/hashing.py +63 -0
  1370. vllm/utils/import_utils.py +411 -0
  1371. vllm/utils/jsontree.py +165 -0
  1372. vllm/utils/math_utils.py +32 -0
  1373. vllm/utils/mem_constants.py +13 -0
  1374. vllm/utils/mem_utils.py +232 -0
  1375. vllm/utils/nccl.py +64 -0
  1376. vllm/utils/network_utils.py +331 -0
  1377. vllm/utils/platform_utils.py +59 -0
  1378. vllm/utils/profiling.py +56 -0
  1379. vllm/utils/registry.py +49 -0
  1380. vllm/utils/serial_utils.py +169 -0
  1381. vllm/utils/system_utils.py +229 -0
  1382. vllm/utils/tensor_schema.py +255 -0
  1383. vllm/utils/torch_utils.py +657 -0
  1384. vllm/v1/__init__.py +0 -0
  1385. vllm/v1/attention/__init__.py +0 -0
  1386. vllm/v1/attention/backends/__init__.py +0 -0
  1387. vllm/v1/attention/backends/cpu_attn.py +496 -0
  1388. vllm/v1/attention/backends/flash_attn.py +1028 -0
  1389. vllm/v1/attention/backends/flashinfer.py +1572 -0
  1390. vllm/v1/attention/backends/flex_attention.py +926 -0
  1391. vllm/v1/attention/backends/gdn_attn.py +387 -0
  1392. vllm/v1/attention/backends/linear_attn.py +74 -0
  1393. vllm/v1/attention/backends/mamba1_attn.py +165 -0
  1394. vllm/v1/attention/backends/mamba2_attn.py +354 -0
  1395. vllm/v1/attention/backends/mamba_attn.py +115 -0
  1396. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1397. vllm/v1/attention/backends/mla/common.py +2031 -0
  1398. vllm/v1/attention/backends/mla/cutlass_mla.py +275 -0
  1399. vllm/v1/attention/backends/mla/flashattn_mla.py +337 -0
  1400. vllm/v1/attention/backends/mla/flashinfer_mla.py +171 -0
  1401. vllm/v1/attention/backends/mla/flashmla.py +314 -0
  1402. vllm/v1/attention/backends/mla/flashmla_sparse.py +548 -0
  1403. vllm/v1/attention/backends/mla/indexer.py +362 -0
  1404. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +294 -0
  1405. vllm/v1/attention/backends/mla/triton_mla.py +171 -0
  1406. vllm/v1/attention/backends/pallas.py +436 -0
  1407. vllm/v1/attention/backends/rocm_aiter_fa.py +816 -0
  1408. vllm/v1/attention/backends/rocm_aiter_unified_attn.py +196 -0
  1409. vllm/v1/attention/backends/rocm_attn.py +362 -0
  1410. vllm/v1/attention/backends/short_conv_attn.py +105 -0
  1411. vllm/v1/attention/backends/tree_attn.py +425 -0
  1412. vllm/v1/attention/backends/triton_attn.py +373 -0
  1413. vllm/v1/attention/backends/utils.py +1116 -0
  1414. vllm/v1/attention/backends/xformers.py +417 -0
  1415. vllm/v1/core/__init__.py +0 -0
  1416. vllm/v1/core/block_pool.py +428 -0
  1417. vllm/v1/core/encoder_cache_manager.py +343 -0
  1418. vllm/v1/core/kv_cache_coordinator.py +480 -0
  1419. vllm/v1/core/kv_cache_manager.py +420 -0
  1420. vllm/v1/core/kv_cache_utils.py +1340 -0
  1421. vllm/v1/core/sched/__init__.py +0 -0
  1422. vllm/v1/core/sched/async_scheduler.py +62 -0
  1423. vllm/v1/core/sched/interface.py +181 -0
  1424. vllm/v1/core/sched/output.py +202 -0
  1425. vllm/v1/core/sched/request_queue.py +221 -0
  1426. vllm/v1/core/sched/scheduler.py +1617 -0
  1427. vllm/v1/core/sched/utils.py +72 -0
  1428. vllm/v1/core/single_type_kv_cache_manager.py +736 -0
  1429. vllm/v1/cudagraph_dispatcher.py +148 -0
  1430. vllm/v1/engine/__init__.py +206 -0
  1431. vllm/v1/engine/async_llm.py +797 -0
  1432. vllm/v1/engine/coordinator.py +377 -0
  1433. vllm/v1/engine/core.py +1420 -0
  1434. vllm/v1/engine/core_client.py +1400 -0
  1435. vllm/v1/engine/detokenizer.py +351 -0
  1436. vllm/v1/engine/exceptions.py +18 -0
  1437. vllm/v1/engine/llm_engine.py +408 -0
  1438. vllm/v1/engine/logprobs.py +182 -0
  1439. vllm/v1/engine/output_processor.py +642 -0
  1440. vllm/v1/engine/parallel_sampling.py +145 -0
  1441. vllm/v1/engine/processor.py +621 -0
  1442. vllm/v1/engine/utils.py +1072 -0
  1443. vllm/v1/executor/__init__.py +6 -0
  1444. vllm/v1/executor/abstract.py +352 -0
  1445. vllm/v1/executor/multiproc_executor.py +877 -0
  1446. vllm/v1/executor/ray_distributed_executor.py +8 -0
  1447. vllm/v1/executor/ray_executor.py +626 -0
  1448. vllm/v1/executor/ray_utils.py +465 -0
  1449. vllm/v1/executor/uniproc_executor.py +183 -0
  1450. vllm/v1/kv_cache_interface.py +403 -0
  1451. vllm/v1/kv_offload/__init__.py +0 -0
  1452. vllm/v1/kv_offload/abstract.py +161 -0
  1453. vllm/v1/kv_offload/arc_manager.py +237 -0
  1454. vllm/v1/kv_offload/backend.py +97 -0
  1455. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1456. vllm/v1/kv_offload/backends/cpu.py +62 -0
  1457. vllm/v1/kv_offload/cpu.py +93 -0
  1458. vllm/v1/kv_offload/factory.py +56 -0
  1459. vllm/v1/kv_offload/lru_manager.py +139 -0
  1460. vllm/v1/kv_offload/mediums.py +39 -0
  1461. vllm/v1/kv_offload/spec.py +62 -0
  1462. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1463. vllm/v1/kv_offload/worker/cpu_gpu.py +185 -0
  1464. vllm/v1/kv_offload/worker/worker.py +144 -0
  1465. vllm/v1/metrics/__init__.py +0 -0
  1466. vllm/v1/metrics/loggers.py +1238 -0
  1467. vllm/v1/metrics/prometheus.py +82 -0
  1468. vllm/v1/metrics/ray_wrappers.py +169 -0
  1469. vllm/v1/metrics/reader.py +257 -0
  1470. vllm/v1/metrics/stats.py +420 -0
  1471. vllm/v1/outputs.py +249 -0
  1472. vllm/v1/pool/__init__.py +0 -0
  1473. vllm/v1/pool/metadata.py +82 -0
  1474. vllm/v1/request.py +259 -0
  1475. vllm/v1/sample/__init__.py +0 -0
  1476. vllm/v1/sample/logits_processor/__init__.py +352 -0
  1477. vllm/v1/sample/logits_processor/builtin.py +274 -0
  1478. vllm/v1/sample/logits_processor/interface.py +106 -0
  1479. vllm/v1/sample/logits_processor/state.py +165 -0
  1480. vllm/v1/sample/metadata.py +44 -0
  1481. vllm/v1/sample/ops/__init__.py +0 -0
  1482. vllm/v1/sample/ops/bad_words.py +52 -0
  1483. vllm/v1/sample/ops/logprobs.py +25 -0
  1484. vllm/v1/sample/ops/penalties.py +57 -0
  1485. vllm/v1/sample/ops/topk_topp_sampler.py +290 -0
  1486. vllm/v1/sample/rejection_sampler.py +793 -0
  1487. vllm/v1/sample/sampler.py +316 -0
  1488. vllm/v1/sample/tpu/__init__.py +0 -0
  1489. vllm/v1/sample/tpu/metadata.py +120 -0
  1490. vllm/v1/sample/tpu/sampler.py +215 -0
  1491. vllm/v1/serial_utils.py +532 -0
  1492. vllm/v1/spec_decode/__init__.py +0 -0
  1493. vllm/v1/spec_decode/eagle.py +1225 -0
  1494. vllm/v1/spec_decode/medusa.py +73 -0
  1495. vllm/v1/spec_decode/metadata.py +66 -0
  1496. vllm/v1/spec_decode/metrics.py +224 -0
  1497. vllm/v1/spec_decode/ngram_proposer.py +291 -0
  1498. vllm/v1/spec_decode/suffix_decoding.py +103 -0
  1499. vllm/v1/spec_decode/utils.py +16 -0
  1500. vllm/v1/structured_output/__init__.py +338 -0
  1501. vllm/v1/structured_output/backend_guidance.py +265 -0
  1502. vllm/v1/structured_output/backend_lm_format_enforcer.py +177 -0
  1503. vllm/v1/structured_output/backend_outlines.py +324 -0
  1504. vllm/v1/structured_output/backend_types.py +136 -0
  1505. vllm/v1/structured_output/backend_xgrammar.py +362 -0
  1506. vllm/v1/structured_output/request.py +94 -0
  1507. vllm/v1/structured_output/utils.py +469 -0
  1508. vllm/v1/utils.py +414 -0
  1509. vllm/v1/worker/__init__.py +0 -0
  1510. vllm/v1/worker/block_table.py +327 -0
  1511. vllm/v1/worker/cpu_model_runner.py +122 -0
  1512. vllm/v1/worker/cpu_worker.py +206 -0
  1513. vllm/v1/worker/dp_utils.py +230 -0
  1514. vllm/v1/worker/ec_connector_model_runner_mixin.py +87 -0
  1515. vllm/v1/worker/gpu_input_batch.py +975 -0
  1516. vllm/v1/worker/gpu_model_runner.py +5102 -0
  1517. vllm/v1/worker/gpu_ubatch_wrapper.py +466 -0
  1518. vllm/v1/worker/gpu_worker.py +894 -0
  1519. vllm/v1/worker/kv_connector_model_runner_mixin.py +144 -0
  1520. vllm/v1/worker/lora_model_runner_mixin.py +213 -0
  1521. vllm/v1/worker/tpu_input_batch.py +593 -0
  1522. vllm/v1/worker/tpu_model_runner.py +2173 -0
  1523. vllm/v1/worker/tpu_worker.py +355 -0
  1524. vllm/v1/worker/ubatch_utils.py +73 -0
  1525. vllm/v1/worker/ubatching.py +231 -0
  1526. vllm/v1/worker/utils.py +366 -0
  1527. vllm/v1/worker/worker_base.py +375 -0
  1528. vllm/v1/worker/xpu_model_runner.py +55 -0
  1529. vllm/v1/worker/xpu_worker.py +189 -0
  1530. vllm/version.py +39 -0
  1531. vllm/vllm_flash_attn/.gitkeep +0 -0
  1532. vllm_cpu_amxbf16-0.11.2.post2.dist-info/METADATA +345 -0
  1533. vllm_cpu_amxbf16-0.11.2.post2.dist-info/RECORD +1536 -0
  1534. vllm_cpu_amxbf16-0.11.2.post2.dist-info/WHEEL +5 -0
  1535. vllm_cpu_amxbf16-0.11.2.post2.dist-info/entry_points.txt +5 -0
  1536. vllm_cpu_amxbf16-0.11.2.post2.dist-info/top_level.txt +1 -0
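The hunk below adds an OpenAI-compatible protocol module (request/response models for the server entrypoints). For orientation only, here is a minimal pydantic sketch of the pattern used by the `OpenAIBaseModel` class in that hunk: unknown request fields are accepted (`extra="allow"`) rather than rejected, but a wrap-mode validator reports them. The snippet is illustrative and not part of the wheel; `PermissiveRequest` is a hypothetical name.

# Minimal sketch (hypothetical, not part of this wheel) of the
# accept-but-report pattern used by OpenAIBaseModel in the hunk below.
from pydantic import BaseModel, ConfigDict, model_validator


class PermissiveRequest(BaseModel):
    # Unknown fields are kept on the model instead of raising an error.
    model_config = ConfigDict(extra="allow")
    prompt: str

    @model_validator(mode="wrap")
    @classmethod
    def _log_extra_fields(cls, data, handler):
        result = handler(data)
        if isinstance(data, dict):
            # Report any keys that do not match a declared field.
            unknown = data.keys() - cls.model_fields.keys()
            if unknown:
                print(f"ignored fields: {unknown}")
        return result


# A typo'd field ("tempature") is preserved on the instance but reported.
req = PermissiveRequest(prompt="hi", tempature=0.5)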
@@ -0,0 +1,3299 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+
4
+ # Adapted from
5
+ # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
6
+ import json
7
+ import time
8
+ from http import HTTPStatus
9
+ from typing import Annotated, Any, ClassVar, Generic, Literal, TypeAlias, TypeVar
10
+
11
+ import regex as re
12
+ import torch
13
+ from fastapi import HTTPException, UploadFile
14
+ from openai.types.chat.chat_completion_audio import (
15
+ ChatCompletionAudio as OpenAIChatCompletionAudio,
16
+ )
17
+ from openai.types.chat.chat_completion_message import Annotation as OpenAIAnnotation
18
+ from openai.types.responses import (
19
+ ResponseCodeInterpreterCallCodeDeltaEvent,
20
+ ResponseCodeInterpreterCallCodeDoneEvent,
21
+ ResponseCodeInterpreterCallCompletedEvent,
22
+ ResponseCodeInterpreterCallInProgressEvent,
23
+ ResponseCodeInterpreterCallInterpretingEvent,
24
+ ResponseContentPartAddedEvent,
25
+ ResponseContentPartDoneEvent,
26
+ ResponseFunctionToolCall,
27
+ ResponseInputItemParam,
28
+ ResponseOutputItem,
29
+ ResponseOutputItemAddedEvent,
30
+ ResponseOutputItemDoneEvent,
31
+ ResponsePrompt,
32
+ ResponseReasoningItem,
33
+ ResponseReasoningTextDeltaEvent,
34
+ ResponseReasoningTextDoneEvent,
35
+ ResponseStatus,
36
+ ResponseWebSearchCallCompletedEvent,
37
+ ResponseWebSearchCallInProgressEvent,
38
+ ResponseWebSearchCallSearchingEvent,
39
+ )
40
+ from openai.types.responses import (
41
+ ResponseCompletedEvent as OpenAIResponseCompletedEvent,
42
+ )
43
+ from openai.types.responses import ResponseCreatedEvent as OpenAIResponseCreatedEvent
44
+ from openai.types.responses import (
45
+ ResponseInProgressEvent as OpenAIResponseInProgressEvent,
46
+ )
47
+ from openai.types.responses.response_reasoning_item import (
48
+ Content as ResponseReasoningTextContent,
49
+ )
50
+ from openai_harmony import Message as OpenAIHarmonyMessage
51
+
52
+ from vllm.config.pooler import get_use_activation
53
+ from vllm.tasks import PoolingTask
54
+ from vllm.utils.serial_utils import (
55
+ EmbedDType,
56
+ EncodingFormat,
57
+ Endianness,
58
+ )
59
+
60
+ # Backward compatibility for OpenAI client versions
61
+ try: # For older openai versions (< 1.100.0)
62
+ from openai.types.responses import ResponseTextConfig
63
+ except ImportError: # For newer openai versions (>= 1.100.0)
64
+ from openai.types.responses import ResponseFormatTextConfig as ResponseTextConfig
65
+
66
+
67
+ from openai.types.responses.response import IncompleteDetails, ToolChoice
68
+ from openai.types.responses.tool import Tool
69
+ from openai.types.shared import Metadata, Reasoning
70
+ from pydantic import (
71
+ BaseModel,
72
+ ConfigDict,
73
+ Field,
74
+ TypeAdapter,
75
+ ValidationError,
76
+ ValidationInfo,
77
+ field_serializer,
78
+ field_validator,
79
+ model_validator,
80
+ )
81
+
82
+ from vllm.entrypoints.chat_utils import ChatCompletionMessageParam, make_tool_call_id
83
+ from vllm.entrypoints.score_utils import ScoreContentPartParam, ScoreMultiModalParam
84
+ from vllm.logger import init_logger
85
+ from vllm.logprobs import Logprob
86
+ from vllm.pooling_params import PoolingParams
87
+ from vllm.sampling_params import (
88
+ BeamSearchParams,
89
+ RequestOutputKind,
90
+ SamplingParams,
91
+ StructuredOutputsParams,
92
+ )
93
+ from vllm.utils import random_uuid
94
+ from vllm.utils.import_utils import resolve_obj_by_qualname
95
+
96
+ logger = init_logger(__name__)
97
+
98
+ _LONG_INFO = torch.iinfo(torch.long)
99
+
100
+
101
+ class OpenAIBaseModel(BaseModel):
102
+ # OpenAI API does allow extra fields
103
+ model_config = ConfigDict(extra="allow")
104
+
105
+ # Cache class field names
106
+ field_names: ClassVar[set[str] | None] = None
107
+
108
+ @model_validator(mode="wrap")
109
+ @classmethod
110
+ def __log_extra_fields__(cls, data, handler):
111
+ result = handler(data)
112
+ if not isinstance(data, dict):
113
+ return result
114
+ field_names = cls.field_names
115
+ if field_names is None:
116
+ # Get all class field names and their potential aliases
117
+ field_names = set()
118
+ for field_name, field in cls.model_fields.items():
119
+ field_names.add(field_name)
120
+ if alias := getattr(field, "alias", None):
121
+ field_names.add(alias)
122
+ cls.field_names = field_names
123
+
124
+ # Compare against both field names and aliases
125
+ if any(k not in field_names for k in data):
126
+ logger.warning(
127
+ "The following fields were present in the request but ignored: %s",
128
+ data.keys() - field_names,
129
+ )
130
+ return result
131
+
132
+
133
+ class ErrorInfo(OpenAIBaseModel):
134
+ message: str
135
+ type: str
136
+ param: str | None = None
137
+ code: int
138
+
139
+
140
+ class ErrorResponse(OpenAIBaseModel):
141
+ error: ErrorInfo
142
+
143
+
144
+ class ModelPermission(OpenAIBaseModel):
145
+ id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
146
+ object: str = "model_permission"
147
+ created: int = Field(default_factory=lambda: int(time.time()))
148
+ allow_create_engine: bool = False
149
+ allow_sampling: bool = True
150
+ allow_logprobs: bool = True
151
+ allow_search_indices: bool = False
152
+ allow_view: bool = True
153
+ allow_fine_tuning: bool = False
154
+ organization: str = "*"
155
+ group: str | None = None
156
+ is_blocking: bool = False
157
+
158
+
159
+ class ModelCard(OpenAIBaseModel):
160
+ id: str
161
+ object: str = "model"
162
+ created: int = Field(default_factory=lambda: int(time.time()))
163
+ owned_by: str = "vllm"
164
+ root: str | None = None
165
+ parent: str | None = None
166
+ max_model_len: int | None = None
167
+ permission: list[ModelPermission] = Field(default_factory=list)
168
+
169
+
170
+ class ModelList(OpenAIBaseModel):
171
+ object: str = "list"
172
+ data: list[ModelCard] = Field(default_factory=list)
173
+
174
+
175
+ class PromptTokenUsageInfo(OpenAIBaseModel):
176
+ cached_tokens: int | None = None
177
+
178
+
179
+ class UsageInfo(OpenAIBaseModel):
180
+ prompt_tokens: int = 0
181
+ total_tokens: int = 0
182
+ completion_tokens: int | None = 0
183
+ prompt_tokens_details: PromptTokenUsageInfo | None = None
184
+
185
+
186
+ class RequestResponseMetadata(BaseModel):
187
+ request_id: str
188
+ final_usage_info: UsageInfo | None = None
189
+
190
+
191
+ class JsonSchemaResponseFormat(OpenAIBaseModel):
192
+ name: str
193
+ description: str | None = None
194
+ # schema is the field in openai but that causes conflicts with pydantic so
195
+ # instead use json_schema with an alias
196
+ json_schema: dict[str, Any] | None = Field(default=None, alias="schema")
197
+ strict: bool | None = None
198
+
199
+
200
+ class LegacyStructuralTag(OpenAIBaseModel):
201
+ begin: str
202
+ # schema is the field, but that causes conflicts with pydantic so
203
+ # instead use structural_tag_schema with an alias
204
+ structural_tag_schema: dict[str, Any] | None = Field(default=None, alias="schema")
205
+ end: str
206
+
207
+
208
+ class LegacyStructuralTagResponseFormat(OpenAIBaseModel):
209
+ type: Literal["structural_tag"]
210
+ structures: list[LegacyStructuralTag]
211
+ triggers: list[str]
212
+
213
+
214
+ class StructuralTagResponseFormat(OpenAIBaseModel):
215
+ type: Literal["structural_tag"]
216
+ format: Any
217
+
218
+
219
+ AnyStructuralTagResponseFormat: TypeAlias = (
220
+ LegacyStructuralTagResponseFormat | StructuralTagResponseFormat
221
+ )
222
+
223
+
224
+ class ResponseFormat(OpenAIBaseModel):
225
+ # type must be "json_schema", "json_object", or "text"
226
+ type: Literal["text", "json_object", "json_schema"]
227
+ json_schema: JsonSchemaResponseFormat | None = None
228
+
229
+
230
+ AnyResponseFormat: TypeAlias = (
231
+ ResponseFormat | StructuralTagResponseFormat | LegacyStructuralTagResponseFormat
232
+ )
233
+
234
+
235
+ class StreamOptions(OpenAIBaseModel):
236
+ include_usage: bool | None = True
237
+ continuous_usage_stats: bool | None = False
238
+
239
+
240
+ class FunctionDefinition(OpenAIBaseModel):
241
+ name: str
242
+ description: str | None = None
243
+ parameters: dict[str, Any] | None = None
244
+
245
+
246
+ class ChatCompletionToolsParam(OpenAIBaseModel):
247
+ type: Literal["function"] = "function"
248
+ function: FunctionDefinition
249
+
250
+
251
+ class ChatCompletionNamedFunction(OpenAIBaseModel):
252
+ name: str
253
+
254
+
255
+ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
256
+ function: ChatCompletionNamedFunction
257
+ type: Literal["function"] = "function"
258
+
259
+
260
+ # extra="forbid" is a workaround to have kwargs as a field,
261
+ # see https://github.com/pydantic/pydantic/issues/3125
262
+ class LogitsProcessorConstructor(BaseModel):
263
+ qualname: str
264
+ args: list[Any] | None = None
265
+ kwargs: dict[str, Any] | None = None
266
+
267
+ model_config = ConfigDict(extra="forbid")
268
+
269
+
270
+ LogitsProcessors = list[str | LogitsProcessorConstructor]
271
+
272
+
273
+ def get_logits_processors(
274
+ processors: LogitsProcessors | None, pattern: str | None
275
+ ) -> list[Any] | None:
276
+ if processors and pattern:
277
+ logits_processors = []
278
+ for processor in processors:
279
+ qualname = processor if isinstance(processor, str) else processor.qualname
280
+ if not re.match(pattern, qualname):
281
+ raise ValueError(
282
+ f"Logits processor '{qualname}' is not allowed by this "
283
+ "server. See --logits-processor-pattern engine argument "
284
+ "for more information."
285
+ )
286
+ try:
287
+ logits_processor = resolve_obj_by_qualname(qualname)
288
+ except Exception as e:
289
+ raise ValueError(
290
+ f"Logits processor '{qualname}' could not be resolved: {e}"
291
+ ) from e
292
+ if isinstance(processor, LogitsProcessorConstructor):
293
+ logits_processor = logits_processor(
294
+ *processor.args or [], **processor.kwargs or {}
295
+ )
296
+ logits_processors.append(logits_processor)
297
+ return logits_processors
298
+ elif processors:
299
+ raise ValueError(
300
+ "The `logits_processors` argument is not supported by this "
301
+ "server. See --logits-processor-pattern engine argument "
302
+ "for more information."
303
+ )
304
+ return None
305
+
306
+
307
+ ResponseInputOutputItem: TypeAlias = (
308
+ ResponseInputItemParam | ResponseReasoningItem | ResponseFunctionToolCall
309
+ )
310
+
311
+
312
+ class ResponsesRequest(OpenAIBaseModel):
313
+ # Ordered by official OpenAI API documentation
314
+ # https://platform.openai.com/docs/api-reference/responses/create
315
+ background: bool | None = False
316
+ include: (
317
+ list[
318
+ Literal[
319
+ "code_interpreter_call.outputs",
320
+ "computer_call_output.output.image_url",
321
+ "file_search_call.results",
322
+ "message.input_image.image_url",
323
+ "message.output_text.logprobs",
324
+ "reasoning.encrypted_content",
325
+ ],
326
+ ]
327
+ | None
328
+ ) = None
329
+ input: str | list[ResponseInputOutputItem]
330
+ instructions: str | None = None
331
+ max_output_tokens: int | None = None
332
+ max_tool_calls: int | None = None
333
+ metadata: Metadata | None = None
334
+ model: str | None = None
335
+ parallel_tool_calls: bool | None = True
336
+ previous_response_id: str | None = None
337
+ prompt: ResponsePrompt | None = None
338
+ reasoning: Reasoning | None = None
339
+ service_tier: Literal["auto", "default", "flex", "scale", "priority"] = "auto"
340
+ store: bool | None = True
341
+ stream: bool | None = False
342
+ temperature: float | None = None
343
+ text: ResponseTextConfig | None = None
344
+ tool_choice: ToolChoice = "auto"
345
+ tools: list[Tool] = Field(default_factory=list)
346
+ top_logprobs: int | None = 0
347
+ top_p: float | None = None
348
+ truncation: Literal["auto", "disabled"] | None = "disabled"
349
+ user: str | None = None
350
+
351
+ # --8<-- [start:responses-extra-params]
352
+ request_id: str = Field(
353
+ default_factory=lambda: f"resp_{random_uuid()}",
354
+ description=(
355
+ "The request_id related to this request. If the caller does "
356
+ "not set it, a random_uuid will be generated. This id is used "
357
+ "through out the inference process and return in response."
358
+ ),
359
+ )
360
+ mm_processor_kwargs: dict[str, Any] | None = Field(
361
+ default=None,
362
+ description=("Additional kwargs to pass to the HF processor."),
363
+ )
364
+ priority: int = Field(
365
+ default=0,
366
+ description=(
367
+ "The priority of the request (lower means earlier handling; "
368
+ "default: 0). Any priority other than 0 will raise an error "
369
+ "if the served model does not use priority scheduling."
370
+ ),
371
+ )
372
+ cache_salt: str | None = Field(
373
+ default=None,
374
+ description=(
375
+ "If specified, the prefix cache will be salted with the provided "
376
+ "string to prevent an attacker to guess prompts in multi-user "
377
+ "environments. The salt should be random, protected from "
378
+ "access by 3rd parties, and long enough to be "
379
+ "unpredictable (e.g., 43 characters base64-encoded, corresponding "
380
+ "to 256 bit). Not supported by vLLM engine V0."
381
+ ),
382
+ )
383
+
384
+ enable_response_messages: bool = Field(
385
+ default=False,
386
+ description=(
387
+ "Dictates whether or not to return messages as part of the "
388
+ "response object. Currently only supported for"
389
+ "non-background and gpt-oss only. "
390
+ ),
391
+ )
392
+ # similar to input_messages / output_messages in ResponsesResponse
393
+ # we take in previous_input_messages (ie in harmony format)
394
+ # this cannot be used in conjunction with previous_response_id
395
+ # TODO: consider supporting non harmony messages as well
396
+ previous_input_messages: list[OpenAIHarmonyMessage | dict] | None = None
397
+ # --8<-- [end:responses-extra-params]
398
+
399
+ _DEFAULT_SAMPLING_PARAMS = {
400
+ "temperature": 1.0,
401
+ "top_p": 1.0,
402
+ }
403
+
404
+ def to_sampling_params(
405
+ self,
406
+ default_max_tokens: int,
407
+ default_sampling_params: dict | None = None,
408
+ ) -> SamplingParams:
409
+ if self.max_output_tokens is None:
410
+ max_tokens = default_max_tokens
411
+ else:
412
+ max_tokens = min(self.max_output_tokens, default_max_tokens)
413
+
414
+ default_sampling_params = default_sampling_params or {}
415
+ if (temperature := self.temperature) is None:
416
+ temperature = default_sampling_params.get(
417
+ "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
418
+ )
419
+ if (top_p := self.top_p) is None:
420
+ top_p = default_sampling_params.get(
421
+ "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
422
+ )
423
+ stop_token_ids = default_sampling_params.get("stop_token_ids")
424
+
425
+ # Structured output
426
+ structured_outputs = None
427
+ if self.text is not None and self.text.format is not None:
428
+ response_format = self.text.format
429
+ if (
430
+ response_format.type == "json_schema"
431
+ and response_format.schema_ is not None
432
+ ):
433
+ structured_outputs = StructuredOutputsParams(
434
+ json=response_format.schema_
435
+ )
436
+ elif response_format.type == "json_object":
437
+ raise NotImplementedError("json_object is not supported")
438
+
439
+ # TODO: add more parameters
440
+ return SamplingParams.from_optional(
441
+ temperature=temperature,
442
+ top_p=top_p,
443
+ max_tokens=max_tokens,
444
+ logprobs=self.top_logprobs if self.is_include_output_logprobs() else None,
445
+ stop_token_ids=stop_token_ids,
446
+ output_kind=(
447
+ RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY
448
+ ),
449
+ structured_outputs=structured_outputs,
450
+ )
451
+
452
+ def is_include_output_logprobs(self) -> bool:
453
+ """Check if the request includes output logprobs."""
454
+ if self.include is None:
455
+ return False
456
+ return (
457
+ isinstance(self.include, list)
458
+ and "message.output_text.logprobs" in self.include
459
+ )
460
+
461
+ @model_validator(mode="before")
462
+ def validate_background(cls, data):
463
+ if not data.get("background"):
464
+ return data
465
+ if not data.get("store", True):
466
+ raise ValueError("background can only be used when `store` is true")
467
+ return data
468
+
469
+ @model_validator(mode="before")
470
+ def validate_prompt(cls, data):
471
+ if data.get("prompt") is not None:
472
+ raise ValueError("prompt template is not supported")
473
+ return data
474
+
475
+ @model_validator(mode="before")
476
+ def check_cache_salt_support(cls, data):
477
+ if data.get("cache_salt") is not None and (
478
+ not isinstance(data["cache_salt"], str) or not data["cache_salt"]
479
+ ):
480
+ raise ValueError(
481
+ "Parameter 'cache_salt' must be a non-empty string if provided."
482
+ )
483
+ return data
484
+
485
+ @model_validator(mode="before")
486
+ def function_call_parsing(cls, data):
487
+ """Parse function_call dictionaries into ResponseFunctionToolCall objects.
488
+ This ensures Pydantic can properly resolve union types in the input field.
489
+ Function calls provided as dicts are converted to ResponseFunctionToolCall
490
+ objects before validation, while invalid structures are left for Pydantic
491
+ to reject with appropriate error messages.
492
+ """
493
+
494
+ input_data = data.get("input")
495
+
496
+ # Early return for None, strings, or bytes
497
+ # (strings are iterable but shouldn't be processed)
498
+ if input_data is None or isinstance(input_data, (str, bytes)):
499
+ return data
500
+
501
+ # Convert iterators (like ValidatorIterator) to list
502
+ if not isinstance(input_data, list):
503
+ try:
504
+ input_data = list(input_data)
505
+ except TypeError:
506
+ # Not iterable, leave as-is for Pydantic to handle
507
+ return data
508
+
509
+ processed_input = []
510
+ for item in input_data:
511
+ if isinstance(item, dict) and item.get("type") == "function_call":
512
+ try:
513
+ processed_input.append(ResponseFunctionToolCall(**item))
514
+ except ValidationError:
515
+ # Let Pydantic handle validation for malformed function calls
516
+ logger.debug(
517
+ "Failed to parse function_call to ResponseFunctionToolCall, "
518
+ "leaving for Pydantic validation"
519
+ )
520
+ processed_input.append(item)
521
+ else:
522
+ processed_input.append(item)
523
+
524
+ data["input"] = processed_input
525
+ return data
526
+
527
+
528
+ class ChatCompletionRequest(OpenAIBaseModel):
529
+ # Ordered by official OpenAI API documentation
530
+ # https://platform.openai.com/docs/api-reference/chat/create
531
+ messages: list[ChatCompletionMessageParam]
532
+ model: str | None = None
533
+ frequency_penalty: float | None = 0.0
534
+ logit_bias: dict[str, float] | None = None
535
+ logprobs: bool | None = False
536
+ top_logprobs: int | None = 0
537
+ max_tokens: int | None = Field(
538
+ default=None,
539
+ deprecated="max_tokens is deprecated in favor of "
540
+ "the max_completion_tokens field",
541
+ )
542
+ max_completion_tokens: int | None = None
543
+ n: int | None = 1
544
+ presence_penalty: float | None = 0.0
545
+ response_format: AnyResponseFormat | None = None
546
+ seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
547
+ stop: str | list[str] | None = []
548
+ stream: bool | None = False
549
+ stream_options: StreamOptions | None = None
550
+ temperature: float | None = None
551
+ top_p: float | None = None
552
+ tools: list[ChatCompletionToolsParam] | None = None
553
+ tool_choice: (
554
+ Literal["none"]
555
+ | Literal["auto"]
556
+ | Literal["required"]
557
+ | ChatCompletionNamedToolChoiceParam
558
+ | None
559
+ ) = "none"
560
+ reasoning_effort: Literal["low", "medium", "high"] | None = None
561
+ include_reasoning: bool = True
562
+
563
+ # NOTE this will be ignored by vLLM -- the model determines the behavior
564
+ parallel_tool_calls: bool | None = False
565
+ user: str | None = None
566
+
567
+ # --8<-- [start:chat-completion-sampling-params]
568
+ best_of: int | None = None
569
+ use_beam_search: bool = False
570
+ top_k: int | None = None
571
+ min_p: float | None = None
572
+ repetition_penalty: float | None = None
573
+ length_penalty: float = 1.0
574
+ stop_token_ids: list[int] | None = []
575
+ include_stop_str_in_output: bool = False
576
+ ignore_eos: bool = False
577
+ min_tokens: int = 0
578
+ skip_special_tokens: bool = True
579
+ spaces_between_special_tokens: bool = True
580
+ truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
581
+ prompt_logprobs: int | None = None
582
+ allowed_token_ids: list[int] | None = None
583
+ bad_words: list[str] = Field(default_factory=list)
584
+ # --8<-- [end:chat-completion-sampling-params]
585
+
586
+ # --8<-- [start:chat-completion-extra-params]
587
+ echo: bool = Field(
588
+ default=False,
589
+ description=(
590
+ "If true, the new message will be prepended with the last message "
591
+ "if they belong to the same role."
592
+ ),
593
+ )
594
+ add_generation_prompt: bool = Field(
595
+ default=True,
596
+ description=(
597
+ "If true, the generation prompt will be added to the chat template. "
598
+ "This is a parameter used by chat template in tokenizer config of the "
599
+ "model."
600
+ ),
601
+ )
602
+ continue_final_message: bool = Field(
603
+ default=False,
604
+ description=(
605
+ "If this is set, the chat will be formatted so that the final "
606
+ "message in the chat is open-ended, without any EOS tokens. The "
607
+ "model will continue this message rather than starting a new one. "
608
+ 'This allows you to "prefill" part of the model\'s response for it. '
609
+ "Cannot be used at the same time as `add_generation_prompt`."
610
+ ),
611
+ )
612
+ add_special_tokens: bool = Field(
613
+ default=False,
614
+ description=(
615
+ "If true, special tokens (e.g. BOS) will be added to the prompt "
616
+ "on top of what is added by the chat template. "
617
+ "For most models, the chat template takes care of adding the "
618
+ "special tokens so this should be set to false (as is the "
619
+ "default)."
620
+ ),
621
+ )
622
+ documents: list[dict[str, str]] | None = Field(
623
+ default=None,
624
+ description=(
625
+ "A list of dicts representing documents that will be accessible to "
626
+ "the model if it is performing RAG (retrieval-augmented generation)."
627
+ " If the template does not support RAG, this argument will have no "
628
+ "effect. We recommend that each document should be a dict containing "
629
+ '"title" and "text" keys.'
630
+ ),
631
+ )
632
+ chat_template: str | None = Field(
633
+ default=None,
634
+ description=(
635
+ "A Jinja template to use for this conversion. "
636
+ "As of transformers v4.44, default chat template is no longer "
637
+ "allowed, so you must provide a chat template if the tokenizer "
638
+ "does not define one."
639
+ ),
640
+ )
641
+ chat_template_kwargs: dict[str, Any] | None = Field(
642
+ default=None,
643
+ description=(
644
+ "Additional keyword args to pass to the template renderer. "
645
+ "Will be accessible by the chat template."
646
+ ),
647
+ )
648
+ mm_processor_kwargs: dict[str, Any] | None = Field(
649
+ default=None,
650
+ description=("Additional kwargs to pass to the HF processor."),
651
+ )
652
+ structured_outputs: StructuredOutputsParams | None = Field(
653
+ default=None,
654
+ description="Additional kwargs for structured outputs",
655
+ )
656
+ guided_json: str | dict | BaseModel | None = Field(
657
+ default=None,
658
+ description=(
659
+ "`guided_json` is deprecated. "
660
+ "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
661
+ "Please pass `json` to `structured_outputs` instead."
662
+ ),
663
+ )
664
+ guided_regex: str | None = Field(
665
+ default=None,
666
+ description=(
667
+ "`guided_regex` is deprecated. "
668
+ "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
669
+ "Please pass `regex` to `structured_outputs` instead."
670
+ ),
671
+ )
672
+ guided_choice: list[str] | None = Field(
673
+ default=None,
674
+ description=(
675
+ "`guided_choice` is deprecated. "
676
+ "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
677
+ "Please pass `choice` to `structured_outputs` instead."
678
+ ),
679
+ )
680
+ guided_grammar: str | None = Field(
681
+ default=None,
682
+ description=(
683
+ "`guided_grammar` is deprecated. "
684
+ "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
685
+ "Please pass `grammar` to `structured_outputs` instead."
686
+ ),
687
+ )
688
+ structural_tag: str | None = Field(
689
+ default=None,
690
+ description=(
691
+ "`structural_tag` is deprecated. "
692
+ "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
693
+ "Please pass `structural_tag` to `structured_outputs` instead."
694
+ ),
695
+ )
696
+ guided_decoding_backend: str | None = Field(
697
+ default=None,
698
+ description=(
699
+ "`guided_decoding_backend` is deprecated. "
700
+ "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
701
+ "Please remove it from your request."
702
+ ),
703
+ )
704
+ guided_whitespace_pattern: str | None = Field(
705
+ default=None,
706
+ description=(
707
+ "`guided_whitespace_pattern` is deprecated. "
708
+ "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
709
+ "Please pass `whitespace_pattern` to `structured_outputs` instead."
710
+ ),
711
+ )
712
+ priority: int = Field(
713
+ default=0,
714
+ description=(
715
+ "The priority of the request (lower means earlier handling; "
716
+ "default: 0). Any priority other than 0 will raise an error "
717
+ "if the served model does not use priority scheduling."
718
+ ),
719
+ )
720
+ request_id: str = Field(
721
+ default_factory=lambda: f"{random_uuid()}",
722
+ description=(
723
+ "The request_id related to this request. If the caller does "
724
+ "not set it, a random_uuid will be generated. This id is used "
725
+ "through out the inference process and return in response."
726
+ ),
727
+ )
728
+ logits_processors: LogitsProcessors | None = Field(
729
+ default=None,
730
+ description=(
731
+ "A list of either qualified names of logits processors, or "
732
+ "constructor objects, to apply when sampling. A constructor is "
733
+ "a JSON object with a required 'qualname' field specifying the "
734
+ "qualified name of the processor class/factory, and optional "
735
+ "'args' and 'kwargs' fields containing positional and keyword "
736
+ "arguments. For example: {'qualname': "
737
+ "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
738
+ "{'param': 'value'}}."
739
+ ),
740
+ )
741
+ return_tokens_as_token_ids: bool | None = Field(
742
+ default=None,
743
+ description=(
744
+ "If specified with 'logprobs', tokens are represented "
745
+ " as strings of the form 'token_id:{token_id}' so that tokens "
746
+ "that are not JSON-encodable can be identified."
747
+ ),
748
+ )
749
+ return_token_ids: bool | None = Field(
750
+ default=None,
751
+ description=(
752
+ "If specified, the result will include token IDs alongside the "
753
+ "generated text. In streaming mode, prompt_token_ids is included "
754
+ "only in the first chunk, and token_ids contains the delta tokens "
755
+ "for each chunk. This is useful for debugging or when you "
756
+ "need to map generated text back to input tokens."
757
+ ),
758
+ )
759
+ cache_salt: str | None = Field(
760
+ default=None,
761
+ description=(
762
+ "If specified, the prefix cache will be salted with the provided "
763
+ "string to prevent an attacker to guess prompts in multi-user "
764
+ "environments. The salt should be random, protected from "
765
+ "access by 3rd parties, and long enough to be "
766
+ "unpredictable (e.g., 43 characters base64-encoded, corresponding "
767
+ "to 256 bit). Not supported by vLLM engine V0."
768
+ ),
769
+ )
770
+ kv_transfer_params: dict[str, Any] | None = Field(
771
+ default=None,
772
+ description="KVTransfer parameters used for disaggregated serving.",
773
+ )
774
+
775
+ vllm_xargs: dict[str, str | int | float | list[str | int | float]] | None = Field(
776
+ default=None,
777
+ description=(
778
+ "Additional request parameters with (list of) string or "
779
+ "numeric values, used by custom extensions."
780
+ ),
781
+ )
782
+
783
+ # --8<-- [end:chat-completion-extra-params]
784
+
785
+ # Default sampling parameters for chat completion requests
786
+ _DEFAULT_SAMPLING_PARAMS: dict = {
787
+ "repetition_penalty": 1.0,
788
+ "temperature": 1.0,
789
+ "top_p": 1.0,
790
+ "top_k": 0,
791
+ "min_p": 0.0,
792
+ }
793
+
794
+ def to_beam_search_params(
795
+ self, max_tokens: int, default_sampling_params: dict
796
+ ) -> BeamSearchParams:
797
+ n = self.n if self.n is not None else 1
798
+ if (temperature := self.temperature) is None:
799
+ temperature = default_sampling_params.get(
800
+ "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
801
+ )
802
+
803
+ return BeamSearchParams(
804
+ beam_width=n,
805
+ max_tokens=max_tokens,
806
+ ignore_eos=self.ignore_eos,
807
+ temperature=temperature,
808
+ length_penalty=self.length_penalty,
809
+ include_stop_str_in_output=self.include_stop_str_in_output,
810
+ )
811
+
812
+ def to_sampling_params(
813
+ self,
814
+ max_tokens: int,
815
+ logits_processor_pattern: str | None,
816
+ default_sampling_params: dict,
817
+ ) -> SamplingParams:
818
+ # Default parameters
819
+ if (repetition_penalty := self.repetition_penalty) is None:
820
+ repetition_penalty = default_sampling_params.get(
821
+ "repetition_penalty",
822
+ self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
823
+ )
824
+ if (temperature := self.temperature) is None:
825
+ temperature = default_sampling_params.get(
826
+ "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
827
+ )
828
+ if (top_p := self.top_p) is None:
829
+ top_p = default_sampling_params.get(
830
+ "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
831
+ )
832
+ if (top_k := self.top_k) is None:
833
+ top_k = default_sampling_params.get(
834
+ "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
835
+ )
836
+ if (min_p := self.min_p) is None:
837
+ min_p = default_sampling_params.get(
838
+ "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
839
+ )
840
+
841
+ prompt_logprobs = self.prompt_logprobs
842
+ if prompt_logprobs is None and self.echo:
843
+ prompt_logprobs = self.top_logprobs
844
+
845
+ # Forward deprecated guided_* parameters to structured_outputs
846
+ if self.structured_outputs is None:
847
+ kwargs = dict[str, Any](
848
+ json=self.guided_json,
849
+ regex=self.guided_regex,
850
+ choice=self.guided_choice,
851
+ grammar=self.guided_grammar,
852
+ whitespace_pattern=self.guided_whitespace_pattern,
853
+ structural_tag=self.structural_tag,
854
+ )
855
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
856
+ if len(kwargs) > 0:
857
+ self.structured_outputs = StructuredOutputsParams(**kwargs)
858
+
859
+ response_format = self.response_format
860
+ if response_format is not None:
861
+ # If structured outputs wasn't already enabled,
862
+ # we must enable it for these features to work
863
+ if self.structured_outputs is None:
864
+ self.structured_outputs = StructuredOutputsParams()
865
+
866
+ # Set structured output params for response format
867
+ if response_format is not None:
868
+ if response_format.type == "json_object":
869
+ self.structured_outputs.json_object = True
870
+ elif response_format.type == "json_schema":
871
+ json_schema = response_format.json_schema
872
+ assert json_schema is not None
873
+ self.structured_outputs.json = json_schema.json_schema
874
+ elif response_format.type == "structural_tag":
875
+ structural_tag = response_format
876
+ assert structural_tag is not None and isinstance(
877
+ structural_tag,
878
+ (
879
+ LegacyStructuralTagResponseFormat,
880
+ StructuralTagResponseFormat,
881
+ ),
882
+ )
883
+ s_tag_obj = structural_tag.model_dump(by_alias=True)
884
+ self.structured_outputs.structural_tag = json.dumps(s_tag_obj)
885
+
886
+ extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
887
+ if self.kv_transfer_params:
888
+ # Pass in kv_transfer_params via extra_args
889
+ extra_args["kv_transfer_params"] = self.kv_transfer_params
890
+ return SamplingParams.from_optional(
891
+ n=self.n,
892
+ best_of=self.best_of,
893
+ presence_penalty=self.presence_penalty,
894
+ frequency_penalty=self.frequency_penalty,
895
+ repetition_penalty=repetition_penalty,
896
+ temperature=temperature,
897
+ top_p=top_p,
898
+ top_k=top_k,
899
+ min_p=min_p,
900
+ seed=self.seed,
901
+ stop=self.stop,
902
+ stop_token_ids=self.stop_token_ids,
903
+ logprobs=self.top_logprobs if self.logprobs else None,
904
+ prompt_logprobs=prompt_logprobs,
905
+ ignore_eos=self.ignore_eos,
906
+ max_tokens=max_tokens,
907
+ min_tokens=self.min_tokens,
908
+ skip_special_tokens=self.skip_special_tokens,
909
+ spaces_between_special_tokens=self.spaces_between_special_tokens,
910
+ logits_processors=get_logits_processors(
911
+ self.logits_processors, logits_processor_pattern
912
+ ),
913
+ include_stop_str_in_output=self.include_stop_str_in_output,
914
+ truncate_prompt_tokens=self.truncate_prompt_tokens,
915
+ output_kind=RequestOutputKind.DELTA
916
+ if self.stream
917
+ else RequestOutputKind.FINAL_ONLY,
918
+ structured_outputs=self.structured_outputs,
919
+ logit_bias=self.logit_bias,
920
+ bad_words=self.bad_words,
921
+ allowed_token_ids=self.allowed_token_ids,
922
+ extra_args=extra_args or None,
923
+ )
924
+
925
+ @model_validator(mode="before")
926
+ @classmethod
927
+ def validate_stream_options(cls, data):
928
+ if data.get("stream_options") and not data.get("stream"):
929
+ raise ValueError("Stream options can only be defined when `stream=True`.")
930
+
931
+ return data
932
+
933
+ @model_validator(mode="before")
934
+ @classmethod
935
+ def check_logprobs(cls, data):
936
+ if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
937
+ if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
938
+ raise ValueError(
939
+ "`prompt_logprobs` are not available when `stream=True`."
940
+ )
941
+
942
+ if prompt_logprobs < 0 and prompt_logprobs != -1:
943
+ raise ValueError("`prompt_logprobs` must be a positive value or -1.")
944
+ if (top_logprobs := data.get("top_logprobs")) is not None:
945
+ if top_logprobs < 0 and top_logprobs != -1:
946
+ raise ValueError("`top_logprobs` must be a positive value or -1.")
947
+
948
+ if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"):
949
+ raise ValueError(
950
+ "when using `top_logprobs`, `logprobs` must be set to true."
951
+ )
952
+
953
+ return data
954
+
955
+ @model_validator(mode="before")
956
+ @classmethod
957
+ def check_structured_outputs_count(cls, data):
958
+ if isinstance(data, ValueError):
959
+ raise data
960
+
961
+ if data.get("structured_outputs", None) is None:
962
+ return data
963
+
964
+ structured_outputs_kwargs = data["structured_outputs"]
965
+ count = sum(
966
+ structured_outputs_kwargs.get(k) is not None
967
+ for k in ("json", "regex", "choice")
968
+ )
969
+ # you can only use one kind of constraints for structured outputs
970
+ if count > 1:
971
+ raise ValueError(
972
+ "You can only use one kind of constraints for structured "
973
+ "outputs ('json', 'regex' or 'choice')."
974
+ )
975
+ # you can only either use structured outputs or tools, not both
976
+ if count > 1 and data.get("tool_choice", "none") not in (
977
+ "none",
978
+ "auto",
979
+ "required",
980
+ ):
981
+ raise ValueError(
982
+ "You can only either use constraints for structured outputs "
983
+ "or tools, not both."
984
+ )
985
+ return data
986
+
987
+ @model_validator(mode="before")
988
+ @classmethod
989
+ def check_tool_usage(cls, data):
990
+ # if "tool_choice" is not specified but tools are provided,
991
+ # default to "auto" tool_choice
992
+ if "tool_choice" not in data and data.get("tools"):
993
+ data["tool_choice"] = "auto"
994
+
995
+ # if "tool_choice" is "none" -- no validation is needed for tools
996
+ if "tool_choice" in data and data["tool_choice"] == "none":
997
+ return data
998
+
999
+ # if "tool_choice" is specified -- validation
1000
+ if "tool_choice" in data and data["tool_choice"] is not None:
1001
+ # ensure that if "tool choice" is specified, tools are present
1002
+ if "tools" not in data or data["tools"] is None:
1003
+ raise ValueError("When using `tool_choice`, `tools` must be set.")
1004
+
1005
+ # make sure that tool choice is either a named tool
1006
+ # OR that it's set to "auto" or "required"
1007
+ if data["tool_choice"] not in ["auto", "required"] and not isinstance(
1008
+ data["tool_choice"], dict
1009
+ ):
1010
+ raise ValueError(
1011
+ f"Invalid value for `tool_choice`: {data['tool_choice']}! "
1012
+ 'Only named tools, "none", "auto" or "required" '
1013
+ "are supported."
1014
+ )
1015
+
1016
+ # if tool_choice is "required" but the "tools" list is empty,
1017
+ # override the data to behave like "none" to align with
1018
+ # OpenAI’s behavior.
1019
+ if (
1020
+ data["tool_choice"] == "required"
1021
+ and isinstance(data["tools"], list)
1022
+ and len(data["tools"]) == 0
1023
+ ):
1024
+ data["tool_choice"] = "none"
1025
+ del data["tools"]
1026
+ return data
1027
+
1028
+ # ensure that if "tool_choice" is specified as an object,
1029
+ # it matches a valid tool
1030
+ correct_usage_message = (
1031
+ 'Correct usage: `{"type": "function",'
1032
+ ' "function": {"name": "my_function"}}`'
1033
+ )
1034
+ if isinstance(data["tool_choice"], dict):
1035
+ valid_tool = False
1036
+ function = data["tool_choice"].get("function")
1037
+ if not isinstance(function, dict):
1038
+ raise ValueError(
1039
+ f"Invalid value for `function`: `{function}` in "
1040
+ f"`tool_choice`! {correct_usage_message}"
1041
+ )
1042
+ if "name" not in function:
1043
+ raise ValueError(
1044
+ f"Expected field `name` in `function` in "
1045
+ f"`tool_choice`! {correct_usage_message}"
1046
+ )
1047
+ function_name = function["name"]
1048
+ if not isinstance(function_name, str) or len(function_name) == 0:
1049
+ raise ValueError(
1050
+ f"Invalid `name` in `function`: `{function_name}`"
1051
+ f" in `tool_choice`! {correct_usage_message}"
1052
+ )
1053
+ for tool in data["tools"]:
1054
+ if tool["function"]["name"] == function_name:
1055
+ valid_tool = True
1056
+ break
1057
+ if not valid_tool:
1058
+ raise ValueError(
1059
+ "The tool specified in `tool_choice` does not match any"
1060
+ " of the specified `tools`"
1061
+ )
1062
+ return data
1063
+
1064
+ @model_validator(mode="before")
1065
+ @classmethod
1066
+ def check_generation_prompt(cls, data):
1067
+ if data.get("continue_final_message") and data.get("add_generation_prompt"):
1068
+ raise ValueError(
1069
+ "Cannot set both `continue_final_message` and "
1070
+ "`add_generation_prompt` to True."
1071
+ )
1072
+ return data
1073
+
1074
+ @model_validator(mode="before")
1075
+ @classmethod
1076
+ def check_cache_salt_support(cls, data):
1077
+ if data.get("cache_salt") is not None and (
1078
+ not isinstance(data["cache_salt"], str) or not data["cache_salt"]
1079
+ ):
1080
+ raise ValueError(
1081
+ "Parameter 'cache_salt' must be a non-empty string if provided."
1082
+ )
1083
+ return data
1084
+
1085
+
1086
+ class CompletionRequest(OpenAIBaseModel):
1087
+ # Ordered by official OpenAI API documentation
1088
+ # https://platform.openai.com/docs/api-reference/completions/create
1089
+ model: str | None = None
1090
+ prompt: list[int] | list[list[int]] | str | list[str] | None = None
1091
+ best_of: int | None = None
1092
+ echo: bool | None = False
1093
+ frequency_penalty: float | None = 0.0
1094
+ logit_bias: dict[str, float] | None = None
1095
+ logprobs: int | None = None
1096
+ max_tokens: int | None = 16
1097
+ n: int = 1
1098
+ presence_penalty: float | None = 0.0
1099
+ seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
1100
+ stop: str | list[str] | None = []
1101
+ stream: bool | None = False
1102
+ stream_options: StreamOptions | None = None
1103
+ suffix: str | None = None
1104
+ temperature: float | None = None
1105
+ top_p: float | None = None
1106
+ user: str | None = None
1107
+
1108
+ # --8<-- [start:completion-sampling-params]
1109
+ use_beam_search: bool = False
1110
+ top_k: int | None = None
1111
+ min_p: float | None = None
1112
+ repetition_penalty: float | None = None
1113
+ length_penalty: float = 1.0
1114
+ stop_token_ids: list[int] | None = []
1115
+ include_stop_str_in_output: bool = False
1116
+ ignore_eos: bool = False
1117
+ min_tokens: int = 0
1118
+ skip_special_tokens: bool = True
1119
+ spaces_between_special_tokens: bool = True
1120
+ truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
1121
+ allowed_token_ids: list[int] | None = None
1122
+ prompt_logprobs: int | None = None
1123
+ # --8<-- [end:completion-sampling-params]
1124
+
1125
+ # --8<-- [start:completion-extra-params]
1126
+ prompt_embeds: bytes | list[bytes] | None = None
1127
+ add_special_tokens: bool = Field(
1128
+ default=True,
1129
+ description=(
1130
+ "If true (the default), special tokens (e.g. BOS) will be added to "
1131
+ "the prompt."
1132
+ ),
1133
+ )
1134
+ response_format: AnyResponseFormat | None = Field(
1135
+ default=None,
1136
+ description=(
1137
+ "Similar to chat completion, this parameter specifies the format "
1138
+ "of output. Only {'type': 'json_object'}, {'type': 'json_schema'}"
1139
+ ", {'type': 'structural_tag'}, or {'type': 'text' } is supported."
1140
+ ),
1141
+ )
1142
+ structured_outputs: StructuredOutputsParams | None = Field(
1143
+ default=None,
1144
+ description="Additional kwargs for structured outputs",
1145
+ )
1146
+ guided_json: str | dict | BaseModel | None = Field(
1147
+ default=None,
1148
+ description=(
1149
+ "`guided_json` is deprecated. "
1150
+ "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
1151
+ "Please pass `json` to `structured_outputs` instead."
1152
+ ),
1153
+ )
1154
+ guided_regex: str | None = Field(
1155
+ default=None,
1156
+ description=(
1157
+ "`guided_regex` is deprecated. "
1158
+ "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
1159
+ "Please pass `regex` to `structured_outputs` instead."
1160
+ ),
1161
+ )
1162
+ guided_choice: list[str] | None = Field(
1163
+ default=None,
1164
+ description=(
1165
+ "`guided_choice` is deprecated. "
1166
+ "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
1167
+ "Please pass `choice` to `structured_outputs` instead."
1168
+ ),
1169
+ )
1170
+ guided_grammar: str | None = Field(
1171
+ default=None,
1172
+ description=(
1173
+ "`guided_grammar` is deprecated. "
1174
+ "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
1175
+ "Please pass `grammar` to `structured_outputs` instead."
1176
+ ),
1177
+ )
1178
+ structural_tag: str | None = Field(
1179
+ default=None,
1180
+ description=("If specified, the output will follow the structural tag schema."),
1181
+ )
1182
+ guided_decoding_backend: str | None = Field(
1183
+ default=None,
1184
+ description=(
1185
+ "`guided_decoding_backend` is deprecated. "
1186
+ "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
1187
+ "Please remove it from your request."
1188
+ ),
1189
+ )
1190
+ guided_whitespace_pattern: str | None = Field(
1191
+ default=None,
1192
+ description=(
1193
+ "`guided_whitespace_pattern` is deprecated. "
1194
+ "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. "
1195
+ "Please pass `whitespace_pattern` to `structured_outputs` instead."
1196
+ ),
1197
+ )
1198
+ priority: int = Field(
1199
+ default=0,
1200
+ description=(
1201
+ "The priority of the request (lower means earlier handling; "
1202
+ "default: 0). Any priority other than 0 will raise an error "
1203
+ "if the served model does not use priority scheduling."
1204
+ ),
1205
+ )
1206
+ request_id: str = Field(
1207
+ default_factory=lambda: f"{random_uuid()}",
1208
+ description=(
1209
+ "The request_id related to this request. If the caller does "
1210
+ "not set it, a random_uuid will be generated. This id is used "
1211
+ "through out the inference process and return in response."
1212
+ ),
1213
+ )
1214
+ logits_processors: LogitsProcessors | None = Field(
1215
+ default=None,
1216
+ description=(
1217
+ "A list of either qualified names of logits processors, or "
1218
+ "constructor objects, to apply when sampling. A constructor is "
1219
+ "a JSON object with a required 'qualname' field specifying the "
1220
+ "qualified name of the processor class/factory, and optional "
1221
+ "'args' and 'kwargs' fields containing positional and keyword "
1222
+ "arguments. For example: {'qualname': "
1223
+ "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
1224
+ "{'param': 'value'}}."
1225
+ ),
1226
+ )
1227
+
1228
+ return_tokens_as_token_ids: bool | None = Field(
1229
+ default=None,
1230
+ description=(
1231
+ "If specified with 'logprobs', tokens are represented "
1232
+ " as strings of the form 'token_id:{token_id}' so that tokens "
1233
+ "that are not JSON-encodable can be identified."
1234
+ ),
1235
+ )
1236
+ return_token_ids: bool | None = Field(
1237
+ default=None,
1238
+ description=(
1239
+ "If specified, the result will include token IDs alongside the "
1240
+ "generated text. In streaming mode, prompt_token_ids is included "
1241
+ "only in the first chunk, and token_ids contains the delta tokens "
1242
+ "for each chunk. This is useful for debugging or when you "
1243
+ "need to map generated text back to input tokens."
1244
+ ),
1245
+ )
1246
+
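# A minimal sketch of consuming `return_token_ids` output, assuming the
# streamed chunk dicts below (hypothetical values): prompt_token_ids arrives
# only in the first chunk, while token_ids holds each chunk's delta tokens.
chunks = [
    {"prompt_token_ids": [1, 15043], "token_ids": [306]},
    {"token_ids": [626, 263]},
    {"token_ids": [4086, 2]},
]
prompt_ids = chunks[0].get("prompt_token_ids", [])
generated_ids: list[int] = []
for chunk in chunks:
    generated_ids.extend(chunk.get("token_ids", []))
# prompt_ids + generated_ids maps the generated text back to input tokens.
assert generated_ids == [306, 626, 263, 4086, 2]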
1247
+ cache_salt: str | None = Field(
1248
+ default=None,
1249
+ description=(
1250
+ "If specified, the prefix cache will be salted with the provided "
1251
+ "string to prevent an attacker to guess prompts in multi-user "
1252
+ "environments. The salt should be random, protected from "
1253
+ "access by 3rd parties, and long enough to be "
1254
+ "unpredictable (e.g., 43 characters base64-encoded, corresponding "
1255
+ "to 256 bit). Not supported by vLLM engine V0."
1256
+ ),
1257
+ )
1258
+
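# A minimal sketch of producing a cache_salt with the properties described
# above (random, unpredictable, ~256 bits): secrets.token_urlsafe(32) yields
# a 43-character URL-safe base64 string from 32 random bytes.
import secrets

cache_salt = secrets.token_urlsafe(32)
assert len(cache_salt) == 43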
1259
+ kv_transfer_params: dict[str, Any] | None = Field(
1260
+ default=None,
1261
+ description="KVTransfer parameters used for disaggregated serving.",
1262
+ )
1263
+
1264
+ vllm_xargs: dict[str, str | int | float] | None = Field(
1265
+ default=None,
1266
+ description=(
1267
+ "Additional request parameters with string or "
1268
+ "numeric values, used by custom extensions."
1269
+ ),
1270
+ )
1271
+
1272
+ # --8<-- [end:completion-extra-params]
1273
+
1274
+ # Default sampling parameters for completion requests
1275
+ _DEFAULT_SAMPLING_PARAMS: dict = {
1276
+ "repetition_penalty": 1.0,
1277
+ "temperature": 1.0,
1278
+ "top_p": 1.0,
1279
+ "top_k": 0,
1280
+ "min_p": 0.0,
1281
+ }
1282
+
1283
+ def to_beam_search_params(
1284
+ self,
1285
+ max_tokens: int,
1286
+ default_sampling_params: dict | None = None,
1287
+ ) -> BeamSearchParams:
1288
+ if default_sampling_params is None:
1289
+ default_sampling_params = {}
1290
+ n = self.n if self.n is not None else 1
1291
+
1292
+ if (temperature := self.temperature) is None:
1293
+ temperature = default_sampling_params.get("temperature", 1.0)
1294
+
1295
+ return BeamSearchParams(
1296
+ beam_width=n,
1297
+ max_tokens=max_tokens,
1298
+ ignore_eos=self.ignore_eos,
1299
+ temperature=temperature,
1300
+ length_penalty=self.length_penalty,
1301
+ include_stop_str_in_output=self.include_stop_str_in_output,
1302
+ )
1303
+
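# Illustrative sketch (names are hypothetical) of the fallback order the
# conversion methods in this class use: explicit request value, then
# server-side default_sampling_params, then the built-in default.
def resolve_temperature(request_value: float | None, server_defaults: dict) -> float:
    if request_value is not None:
        return request_value
    return server_defaults.get("temperature", 1.0)

assert resolve_temperature(0.2, {}) == 0.2
assert resolve_temperature(None, {"temperature": 0.7}) == 0.7
assert resolve_temperature(None, {}) == 1.0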
1304
+ def to_sampling_params(
1305
+ self,
1306
+ max_tokens: int,
1307
+ logits_processor_pattern: str | None,
1308
+ default_sampling_params: dict | None = None,
1309
+ ) -> SamplingParams:
1310
+ if default_sampling_params is None:
1311
+ default_sampling_params = {}
1312
+
1313
+ # Default parameters
1314
+ if (repetition_penalty := self.repetition_penalty) is None:
1315
+ repetition_penalty = default_sampling_params.get(
1316
+ "repetition_penalty",
1317
+ self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
1318
+ )
1319
+ if (temperature := self.temperature) is None:
1320
+ temperature = default_sampling_params.get(
1321
+ "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
1322
+ )
1323
+ if (top_p := self.top_p) is None:
1324
+ top_p = default_sampling_params.get(
1325
+ "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
1326
+ )
1327
+ if (top_k := self.top_k) is None:
1328
+ top_k = default_sampling_params.get(
1329
+ "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
1330
+ )
1331
+ if (min_p := self.min_p) is None:
1332
+ min_p = default_sampling_params.get(
1333
+ "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
1334
+ )
1335
+
1336
+ prompt_logprobs = self.prompt_logprobs
1337
+ if prompt_logprobs is None and self.echo:
1338
+ prompt_logprobs = self.logprobs
1339
+
1340
+ echo_without_generation = self.echo and self.max_tokens == 0
1341
+
1342
+ guided_json_object = None
1343
+ if self.response_format is not None:
1344
+ if self.response_format.type == "json_object":
1345
+ guided_json_object = True
1346
+ elif self.response_format.type == "json_schema":
1347
+ json_schema = self.response_format.json_schema
1348
+ assert json_schema is not None
1349
+ self.guided_json = json_schema.json_schema
1350
+ elif self.response_format.type == "structural_tag":
1351
+ structural_tag = self.response_format
1352
+ assert structural_tag is not None and isinstance(
1353
+ structural_tag, StructuralTagResponseFormat
1354
+ )
1355
+ s_tag_obj = structural_tag.model_dump(by_alias=True)
1356
+ self.structural_tag = json.dumps(s_tag_obj)
1357
+
1358
+ # Forward deprecated guided_* parameters to structured_outputs
1359
+ if self.structured_outputs is None:
1360
+ kwargs = dict[str, Any](
1361
+ json=self.guided_json,
1362
+ json_object=guided_json_object,
1363
+ regex=self.guided_regex,
1364
+ choice=self.guided_choice,
1365
+ grammar=self.guided_grammar,
1366
+ whitespace_pattern=self.guided_whitespace_pattern,
1367
+ )
1368
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
1369
+ if len(kwargs) > 0:
1370
+ self.structured_outputs = StructuredOutputsParams(**kwargs)
1371
+
1372
+ extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
1373
+ if self.kv_transfer_params:
1374
+ # Pass in kv_transfer_params via extra_args
1375
+ extra_args["kv_transfer_params"] = self.kv_transfer_params
1376
+ return SamplingParams.from_optional(
1377
+ n=self.n,
1378
+ best_of=self.best_of,
1379
+ presence_penalty=self.presence_penalty,
1380
+ frequency_penalty=self.frequency_penalty,
1381
+ repetition_penalty=repetition_penalty,
1382
+ temperature=temperature,
1383
+ top_p=top_p,
1384
+ top_k=top_k,
1385
+ min_p=min_p,
1386
+ seed=self.seed,
1387
+ stop=self.stop,
1388
+ stop_token_ids=self.stop_token_ids,
1389
+ logprobs=self.logprobs,
1390
+ ignore_eos=self.ignore_eos,
1391
+ max_tokens=max_tokens if not echo_without_generation else 1,
1392
+ min_tokens=self.min_tokens,
1393
+ prompt_logprobs=prompt_logprobs,
1394
+ skip_special_tokens=self.skip_special_tokens,
1395
+ spaces_between_special_tokens=self.spaces_between_special_tokens,
1396
+ include_stop_str_in_output=self.include_stop_str_in_output,
1397
+ logits_processors=get_logits_processors(
1398
+ self.logits_processors, logits_processor_pattern
1399
+ ),
1400
+ truncate_prompt_tokens=self.truncate_prompt_tokens,
1401
+ output_kind=RequestOutputKind.DELTA
1402
+ if self.stream
1403
+ else RequestOutputKind.FINAL_ONLY,
1404
+ structured_outputs=self.structured_outputs,
1405
+ logit_bias=self.logit_bias,
1406
+ allowed_token_ids=self.allowed_token_ids,
1407
+ extra_args=extra_args or None,
1408
+ )
1409
+
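# Illustrative request payloads (hypothetical values) showing the migration
# implemented above: a deprecated guided_json field is forwarded into
# structured_outputs, so the two payloads below constrain output identically.
schema = {"type": "object", "properties": {"name": {"type": "string"}}}
deprecated_payload = {"prompt": "Reply in JSON:", "guided_json": schema}
replacement_payload = {
    "prompt": "Reply in JSON:",
    "structured_outputs": {"json": schema},
}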
1410
+ @model_validator(mode="before")
1411
+ @classmethod
1412
+ def check_structured_outputs_count(cls, data):
1413
+ if data.get("structured_outputs", None) is None:
1414
+ return data
1415
+
1416
+ structured_outputs_kwargs = data["structured_outputs"]
1417
+ count = sum(
1418
+ structured_outputs_kwargs.get(k) is not None
1419
+ for k in ("json", "regex", "choice")
1420
+ )
1421
+ if count > 1:
1422
+ raise ValueError(
1423
+ "You can only use one kind of constraints for structured "
1424
+ "outputs ('json', 'regex' or 'choice')."
1425
+ )
1426
+ return data
1427
+
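# Sketch of the mutual-exclusion rule enforced above, with a hypothetical
# payload combining two constraint kinds; the validator rejects it.
bad = {"structured_outputs": {"json": {"type": "object"}, "regex": r"\d+"}}
count = sum(
    bad["structured_outputs"].get(k) is not None for k in ("json", "regex", "choice")
)
assert count > 1  # check_structured_outputs_count raises ValueError here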
1428
+ @model_validator(mode="before")
1429
+ @classmethod
1430
+ def check_logprobs(cls, data):
1431
+ if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
1432
+ if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
1433
+ raise ValueError(
1434
+ "`prompt_logprobs` are not available when `stream=True`."
1435
+ )
1436
+
1437
+ if prompt_logprobs < 0 and prompt_logprobs != -1:
1438
+ raise ValueError("`prompt_logprobs` must be a non-negative value or -1.")
1439
+ if (logprobs := data.get("logprobs")) is not None and logprobs < 0:
1440
+ raise ValueError("`logprobs` must be a non-negative value.")
1441
+
1442
+ return data
1443
+
1444
+ @model_validator(mode="before")
1445
+ @classmethod
1446
+ def validate_stream_options(cls, data):
1447
+ if data.get("stream_options") and not data.get("stream"):
1448
+ raise ValueError("Stream options can only be defined when `stream=True`.")
1449
+
1450
+ return data
1451
+
1452
+ @model_validator(mode="before")
1453
+ @classmethod
1454
+ def validate_prompt_and_prompt_embeds(cls, data):
1455
+ prompt = data.get("prompt")
1456
+ prompt_embeds = data.get("prompt_embeds")
1457
+
1458
+ prompt_is_empty = prompt is None or (isinstance(prompt, str) and prompt == "")
1459
+ embeds_is_empty = prompt_embeds is None or (
1460
+ isinstance(prompt_embeds, list) and len(prompt_embeds) == 0
1461
+ )
1462
+
1463
+ if prompt_is_empty and embeds_is_empty:
1464
+ raise ValueError(
1465
+ "Either prompt or prompt_embeds must be provided and non-empty."
1466
+ )
1467
+
1468
+ return data
1469
+
1470
+ @model_validator(mode="before")
1471
+ @classmethod
1472
+ def check_cache_salt_support(cls, data):
1473
+ if data.get("cache_salt") is not None and (
1474
+ not isinstance(data["cache_salt"], str) or not data["cache_salt"]
1475
+ ):
1476
+ raise ValueError(
1477
+ "Parameter 'cache_salt' must be a non-empty string if provided."
1478
+ )
1479
+ return data
1480
+
1481
+
1482
+ class EmbeddingCompletionRequest(OpenAIBaseModel):
1483
+ # Ordered by official OpenAI API documentation
1484
+ # https://platform.openai.com/docs/api-reference/embeddings
1485
+ model: str | None = None
1486
+ input: list[int] | list[list[int]] | str | list[str]
1487
+ encoding_format: EncodingFormat = "float"
1488
+ dimensions: int | None = None
1489
+ user: str | None = None
1490
+ truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
1491
+
1492
+ # --8<-- [start:embedding-extra-params]
1493
+ add_special_tokens: bool = Field(
1494
+ default=True,
1495
+ description=(
1496
+ "If true (the default), special tokens (e.g. BOS) will be added to "
1497
+ "the prompt."
1498
+ ),
1499
+ )
1500
+ priority: int = Field(
1501
+ default=0,
1502
+ description=(
1503
+ "The priority of the request (lower means earlier handling; "
1504
+ "default: 0). Any priority other than 0 will raise an error "
1505
+ "if the served model does not use priority scheduling."
1506
+ ),
1507
+ )
1508
+ request_id: str = Field(
1509
+ default_factory=lambda: f"{random_uuid()}",
1510
+ description=(
1511
+ "The request_id related to this request. If the caller does "
1512
+ "not set it, a random_uuid will be generated. This id is used "
1513
+ "through out the inference process and return in response."
1514
+ ),
1515
+ )
1516
+ normalize: bool | None = Field(
1517
+ default=None,
1518
+ description="Whether to normalize the embeddings outputs. Default is True.",
1519
+ )
1520
+ embed_dtype: EmbedDType = Field(
1521
+ default="float32",
1522
+ description=(
1523
+ "What dtype to use for encoding. Default to using float32 for base64 "
1524
+ "encoding to match the OpenAI python client behavior. "
1525
+ "This parameter will affect base64 and binary_response."
1526
+ ),
1527
+ )
1528
+ endianness: Endianness = Field(
1529
+ default="native",
1530
+ description=(
1531
+ "What endianness to use for encoding. Default to using native for "
1532
+ "base64 encoding to match the OpenAI python client behavior."
1533
+ "This parameter will affect base64 and binary_response."
1534
+ ),
1535
+ )
1536
+ # --8<-- [end:embedding-extra-params]
1537
+
1538
+ def to_pooling_params(self):
1539
+ return PoolingParams(
1540
+ truncate_prompt_tokens=self.truncate_prompt_tokens,
1541
+ dimensions=self.dimensions,
1542
+ normalize=self.normalize,
1543
+ )
1544
+
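# A minimal /v1/embeddings payload matching the fields above (model name and
# values are hypothetical); to_pooling_params maps truncate_prompt_tokens,
# dimensions, and normalize into PoolingParams on the server side.
embedding_payload = {
    "model": "my-embedding-model",
    "input": ["first text", "second text"],
    "encoding_format": "float",
    "dimensions": 256,
    "truncate_prompt_tokens": 512,
}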
1545
+
1546
+ class EmbeddingChatRequest(OpenAIBaseModel):
1547
+ model: str | None = None
1548
+ messages: list[ChatCompletionMessageParam]
1549
+
1550
+ encoding_format: EncodingFormat = "float"
1551
+ dimensions: int | None = None
1552
+ user: str | None = None
1553
+ truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
1554
+
1555
+ # --8<-- [start:chat-embedding-extra-params]
1556
+ add_generation_prompt: bool = Field(
1557
+ default=False,
1558
+ description=(
1559
+ "If true, the generation prompt will be added to the chat template. "
1560
+ "This is a parameter used by chat template in tokenizer config of the "
1561
+ "model."
1562
+ ),
1563
+ )
1564
+
1565
+ add_special_tokens: bool = Field(
1566
+ default=False,
1567
+ description=(
1568
+ "If true, special tokens (e.g. BOS) will be added to the prompt "
1569
+ "on top of what is added by the chat template. "
1570
+ "For most models, the chat template takes care of adding the "
1571
+ "special tokens so this should be set to false (as is the "
1572
+ "default)."
1573
+ ),
1574
+ )
1575
+ chat_template: str | None = Field(
1576
+ default=None,
1577
+ description=(
1578
+ "A Jinja template to use for this conversion. "
1579
+ "As of transformers v4.44, default chat template is no longer "
1580
+ "allowed, so you must provide a chat template if the tokenizer "
1581
+ "does not define one."
1582
+ ),
1583
+ )
1584
+ chat_template_kwargs: dict[str, Any] | None = Field(
1585
+ default=None,
1586
+ description=(
1587
+ "Additional keyword args to pass to the template renderer. "
1588
+ "Will be accessible by the chat template."
1589
+ ),
1590
+ )
1591
+ mm_processor_kwargs: dict[str, Any] | None = Field(
1592
+ default=None,
1593
+ description=("Additional kwargs to pass to the HF processor."),
1594
+ )
1595
+ priority: int = Field(
1596
+ default=0,
1597
+ description=(
1598
+ "The priority of the request (lower means earlier handling; "
1599
+ "default: 0). Any priority other than 0 will raise an error "
1600
+ "if the served model does not use priority scheduling."
1601
+ ),
1602
+ )
1603
+ request_id: str = Field(
1604
+ default_factory=lambda: f"{random_uuid()}",
1605
+ description=(
1606
+ "The request_id related to this request. If the caller does "
1607
+ "not set it, a random_uuid will be generated. This id is used "
1608
+ "through out the inference process and return in response."
1609
+ ),
1610
+ )
1611
+ normalize: bool | None = Field(
1612
+ default=None,
1613
+ description="Whether to normalize the embeddings outputs. Default is True.",
1614
+ )
1615
+ embed_dtype: EmbedDType = Field(
1616
+ default="float32",
1617
+ description=(
1618
+ "What dtype to use for encoding. Default to using float32 for base64 "
1619
+ "encoding to match the OpenAI python client behavior. "
1620
+ "This parameter will affect base64 and binary_response."
1621
+ ),
1622
+ )
1623
+ endianness: Endianness = Field(
1624
+ default="native",
1625
+ description=(
1626
+ "What endianness to use for encoding. Default to using native for "
1627
+ "base64 encoding to match the OpenAI python client behavior."
1628
+ "This parameter will affect base64 and binary_response."
1629
+ ),
1630
+ )
1631
+ # --8<-- [end:chat-embedding-extra-params]
1632
+
1633
+ @model_validator(mode="before")
1634
+ @classmethod
1635
+ def check_generation_prompt(cls, data):
1636
+ if data.get("continue_final_message") and data.get("add_generation_prompt"):
1637
+ raise ValueError(
1638
+ "Cannot set both `continue_final_message` and "
1639
+ "`add_generation_prompt` to True."
1640
+ )
1641
+ return data
1642
+
1643
+ def to_pooling_params(self):
1644
+ return PoolingParams(
1645
+ truncate_prompt_tokens=self.truncate_prompt_tokens,
1646
+ dimensions=self.dimensions,
1647
+ normalize=self.normalize,
1648
+ )
1649
+
1650
+
1651
+ EmbeddingRequest: TypeAlias = EmbeddingCompletionRequest | EmbeddingChatRequest
1652
+
1653
+
1654
+ class PoolingCompletionRequest(EmbeddingCompletionRequest):
1655
+ task: PoolingTask | None = None
1656
+ softmax: bool | None = Field(
1657
+ default=None,
1658
+ description="softmax will be deprecated, please use use_activation instead.",
1659
+ )
1660
+ activation: bool | None = Field(
1661
+ default=None,
1662
+ description="activation will be deprecated, please use use_activation instead.",
1663
+ )
1664
+ use_activation: bool | None = Field(
1665
+ default=None,
1666
+ description="Whether to use activation for classification outputs. "
1667
+ "If it is a classify or token_classify task, the default is True; "
1668
+ "for other tasks, this value should be None.",
1669
+ )
1670
+
1671
+ def to_pooling_params(self):
1672
+ return PoolingParams(
1673
+ truncate_prompt_tokens=self.truncate_prompt_tokens,
1674
+ dimensions=self.dimensions,
1675
+ normalize=self.normalize,
1676
+ use_activation=get_use_activation(self),
1677
+ )
1678
+
1679
+
1680
+ class PoolingChatRequest(EmbeddingChatRequest):
1681
+ task: PoolingTask | None = None
1682
+ softmax: bool | None = Field(
1683
+ default=None,
1684
+ description="softmax will be deprecated, please use use_activation instead.",
1685
+ )
1686
+ activation: bool | None = Field(
1687
+ default=None,
1688
+ description="activation will be deprecated, please use use_activation instead.",
1689
+ )
1690
+ use_activation: bool | None = Field(
1691
+ default=None,
1692
+ description="Whether to use activation for classification outputs. "
1693
+ "If it is a classify or token_classify task, the default is True; "
1694
+ "for other tasks, this value should be None.",
1695
+ )
1696
+
1697
+ def to_pooling_params(self):
1698
+ return PoolingParams(
1699
+ truncate_prompt_tokens=self.truncate_prompt_tokens,
1700
+ dimensions=self.dimensions,
1701
+ normalize=self.normalize,
1702
+ use_activation=get_use_activation(self),
1703
+ )
1704
+
1705
+
1706
+ T = TypeVar("T")
1707
+
1708
+
1709
+ class IOProcessorRequest(OpenAIBaseModel, Generic[T]):
1710
+ model: str | None = None
1711
+
1712
+ priority: int = Field(default=0)
1713
+ """
1714
+ The priority of the request (lower means earlier handling;
1715
+ default: 0). Any priority other than 0 will raise an error
1716
+ if the served model does not use priority scheduling.
1717
+ """
1718
+ data: T
1719
+
1720
+ task: PoolingTask = "plugin"
1721
+ encoding_format: EncodingFormat = "float"
1722
+ embed_dtype: EmbedDType = Field(
1723
+ default="float32",
1724
+ description=(
1725
+ "What dtype to use for encoding. Default to using float32 for base64 "
1726
+ "encoding to match the OpenAI python client behavior. "
1727
+ "This parameter will affect base64 and binary_response."
1728
+ ),
1729
+ )
1730
+ endianness: Endianness = Field(
1731
+ default="native",
1732
+ description=(
1733
+ "What endianness to use for encoding. Default to using native for "
1734
+ "base64 encoding to match the OpenAI python client behavior."
1735
+ "This parameter will affect base64 and binary_response."
1736
+ ),
1737
+ )
1738
+
1739
+ def to_pooling_params(self):
1740
+ return PoolingParams()
1741
+
1742
+
1743
+ class IOProcessorResponse(OpenAIBaseModel, Generic[T]):
1744
+ request_id: str | None = None
1745
+ """
1746
+ The request_id associated with this response
1747
+ """
1748
+ created_at: int = Field(default_factory=lambda: int(time.time()))
1749
+
1750
+ data: T
1751
+ """
1752
+ When using IOProcessor plugins, the actual output is generated
1753
+ by the plugin itself. Hence, we use a generic type for the response data.
1754
+ """
1755
+
1756
+
1757
+ PoolingRequest: TypeAlias = (
1758
+ PoolingCompletionRequest | PoolingChatRequest | IOProcessorRequest
1759
+ )
1760
+
1761
+
1762
+ class ScoreRequest(OpenAIBaseModel):
1763
+ model: str | None = None
1764
+ text_1: list[str] | str | ScoreMultiModalParam
1765
+ text_2: list[str] | str | ScoreMultiModalParam
1766
+ truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
1767
+
1768
+ # --8<-- [start:score-extra-params]
1769
+
1770
+ mm_processor_kwargs: dict[str, Any] | None = Field(
1771
+ default=None,
1772
+ description=("Additional kwargs to pass to the HF processor."),
1773
+ )
1774
+
1775
+ priority: int = Field(
1776
+ default=0,
1777
+ description=(
1778
+ "The priority of the request (lower means earlier handling; "
1779
+ "default: 0). Any priority other than 0 will raise an error "
1780
+ "if the served model does not use priority scheduling."
1781
+ ),
1782
+ )
1783
+
1784
+ softmax: bool | None = Field(
1785
+ default=None,
1786
+ description="softmax will be deprecated, please use use_activation instead.",
1787
+ )
1788
+
1789
+ activation: bool | None = Field(
1790
+ default=None,
1791
+ description="activation will be deprecated, please use use_activation instead.",
1792
+ )
1793
+
1794
+ use_activation: bool | None = Field(
1795
+ default=None,
1796
+ description="Whether to use activation for classification outputs. "
1797
+ "Default is True.",
1798
+ )
1799
+ # --8<-- [end:score-extra-params]
1800
+
1801
+ def to_pooling_params(self):
1802
+ return PoolingParams(
1803
+ truncate_prompt_tokens=self.truncate_prompt_tokens,
1804
+ use_activation=get_use_activation(self),
1805
+ )
1806
+
1807
+
1808
+ class RerankRequest(OpenAIBaseModel):
1809
+ model: str | None = None
1810
+ query: str | ScoreMultiModalParam
1811
+ documents: list[str] | ScoreMultiModalParam
1812
+ top_n: int = Field(default_factory=lambda: 0)
1813
+ truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
1814
+
1815
+ # --8<-- [start:rerank-extra-params]
1816
+
1817
+ mm_processor_kwargs: dict[str, Any] | None = Field(
1818
+ default=None,
1819
+ description=("Additional kwargs to pass to the HF processor."),
1820
+ )
1821
+
1822
+ priority: int = Field(
1823
+ default=0,
1824
+ description=(
1825
+ "The priority of the request (lower means earlier handling; "
1826
+ "default: 0). Any priority other than 0 will raise an error "
1827
+ "if the served model does not use priority scheduling."
1828
+ ),
1829
+ )
1830
+
1831
+ softmax: bool | None = Field(
1832
+ default=None,
1833
+ description="softmax will be deprecated, please use use_activation instead.",
1834
+ )
1835
+
1836
+ activation: bool | None = Field(
1837
+ default=None,
1838
+ description="activation will be deprecated, please use use_activation instead.",
1839
+ )
1840
+
1841
+ use_activation: bool | None = Field(
1842
+ default=None,
1843
+ description="Whether to use activation for classification outputs. "
1844
+ "Default is True.",
1845
+ )
1846
+ # --8<-- [end:rerank-extra-params]
1847
+
1848
+ def to_pooling_params(self):
1849
+ return PoolingParams(
1850
+ truncate_prompt_tokens=self.truncate_prompt_tokens,
1851
+ use_activation=get_use_activation(self),
1852
+ )
1853
+
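# A minimal /v1/rerank payload matching RerankRequest above (model name and
# values are hypothetical).
rerank_payload = {
    "model": "my-reranker",
    "query": "What is the capital of France?",
    "documents": [
        "Paris is the capital of France.",
        "Berlin is the capital of Germany.",
    ],
    "top_n": 1,
}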
1854
+
1855
+ class RerankDocument(BaseModel):
1856
+ text: str | None = None
1857
+ multi_modal: ScoreContentPartParam | None = None
1858
+
1859
+
1860
+ class RerankResult(BaseModel):
1861
+ index: int
1862
+ document: RerankDocument
1863
+ relevance_score: float
1864
+
1865
+
1866
+ class RerankUsage(BaseModel):
1867
+ total_tokens: int
1868
+
1869
+
1870
+ class RerankResponse(OpenAIBaseModel):
1871
+ id: str
1872
+ model: str
1873
+ usage: RerankUsage
1874
+ results: list[RerankResult]
1875
+
1876
+
1877
+ class CompletionLogProbs(OpenAIBaseModel):
1878
+ text_offset: list[int] = Field(default_factory=list)
1879
+ token_logprobs: list[float | None] = Field(default_factory=list)
1880
+ tokens: list[str] = Field(default_factory=list)
1881
+ top_logprobs: list[dict[str, float] | None] = Field(default_factory=list)
1882
+
1883
+
1884
+ class CompletionResponseChoice(OpenAIBaseModel):
1885
+ index: int
1886
+ text: str
1887
+ logprobs: CompletionLogProbs | None = None
1888
+ finish_reason: str | None = None
1889
+ stop_reason: int | str | None = Field(
1890
+ default=None,
1891
+ description=(
1892
+ "The stop string or token id that caused the completion "
1893
+ "to stop, None if the completion finished for some other reason "
1894
+ "including encountering the EOS token"
1895
+ ),
1896
+ )
1897
+ token_ids: list[int] | None = None # For response
1898
+ prompt_logprobs: list[dict[int, Logprob] | None] | None = None
1899
+ prompt_token_ids: list[int] | None = None # For prompt
1900
+
1901
+
1902
+ class CompletionResponse(OpenAIBaseModel):
1903
+ id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
1904
+ object: Literal["text_completion"] = "text_completion"
1905
+ created: int = Field(default_factory=lambda: int(time.time()))
1906
+ model: str
1907
+ choices: list[CompletionResponseChoice]
1908
+ service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
1909
+ system_fingerprint: str | None = None
1910
+ usage: UsageInfo
1911
+
1912
+ # vLLM-specific fields that are not in OpenAI spec
1913
+ kv_transfer_params: dict[str, Any] | None = Field(
1914
+ default=None, description="KVTransfer parameters."
1915
+ )
1916
+
1917
+
1918
+ class CompletionResponseStreamChoice(OpenAIBaseModel):
1919
+ index: int
1920
+ text: str
1921
+ logprobs: CompletionLogProbs | None = None
1922
+ finish_reason: str | None = None
1923
+ stop_reason: int | str | None = Field(
1924
+ default=None,
1925
+ description=(
1926
+ "The stop string or token id that caused the completion "
1927
+ "to stop, None if the completion finished for some other reason "
1928
+ "including encountering the EOS token"
1929
+ ),
1930
+ )
1931
+ # not part of the OpenAI spec but for tracing the tokens
1932
+ # prompt tokens are put into the choice to align with CompletionResponseChoice
1933
+ prompt_token_ids: list[int] | None = None
1934
+ token_ids: list[int] | None = None
1935
+
1936
+
1937
+ class CompletionStreamResponse(OpenAIBaseModel):
1938
+ id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
1939
+ object: str = "text_completion"
1940
+ created: int = Field(default_factory=lambda: int(time.time()))
1941
+ model: str
1942
+ choices: list[CompletionResponseStreamChoice]
1943
+ usage: UsageInfo | None = Field(default=None)
1944
+
1945
+
1946
+ class EmbeddingResponseData(OpenAIBaseModel):
1947
+ index: int
1948
+ object: str = "embedding"
1949
+ embedding: list[float] | str
1950
+
1951
+
1952
+ class EmbeddingResponse(OpenAIBaseModel):
1953
+ id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
1954
+ object: str = "list"
1955
+ created: int = Field(default_factory=lambda: int(time.time()))
1956
+ model: str
1957
+ data: list[EmbeddingResponseData]
1958
+ usage: UsageInfo
1959
+
1960
+
1961
+ class EmbeddingBytesResponse(OpenAIBaseModel):
1962
+ body: list[bytes]
1963
+ metadata: str
1964
+ media_type: str = "application/octet-stream"
1965
+
1966
+
1967
+ class PoolingResponseData(OpenAIBaseModel):
1968
+ index: int
1969
+ object: str = "pooling"
1970
+ data: list[list[float]] | list[float] | str
1971
+
1972
+
1973
+ class PoolingResponse(OpenAIBaseModel):
1974
+ id: str = Field(default_factory=lambda: f"pool-{random_uuid()}")
1975
+ object: str = "list"
1976
+ created: int = Field(default_factory=lambda: int(time.time()))
1977
+ model: str
1978
+ data: list[PoolingResponseData]
1979
+ usage: UsageInfo
1980
+
1981
+
1982
+ class PoolingBytesResponse(OpenAIBaseModel):
1983
+ body: list[bytes]
1984
+ metadata: str
1985
+ media_type: str = "application/octet-stream"
1986
+
1987
+
1988
+ class ScoreResponseData(OpenAIBaseModel):
1989
+ index: int
1990
+ object: str = "score"
1991
+ score: float
1992
+
1993
+
1994
+ class ScoreResponse(OpenAIBaseModel):
1995
+ id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
1996
+ object: str = "list"
1997
+ created: int = Field(default_factory=lambda: int(time.time()))
1998
+ model: str
1999
+ data: list[ScoreResponseData]
2000
+ usage: UsageInfo
2001
+
2002
+
2003
+ class ClassificationCompletionRequest(OpenAIBaseModel):
2004
+ model: str | None = None
2005
+ input: list[str] | str
2006
+ truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
2007
+ user: str | None = None
2008
+
2009
+ # --8<-- [start:classification-extra-params]
2010
+ priority: int = Field(
2011
+ default=0,
2012
+ description=(
2013
+ "The priority of the request (lower means earlier handling; "
2014
+ "default: 0). Any priority other than 0 will raise an error "
2015
+ "if the served model does not use priority scheduling."
2016
+ ),
2017
+ )
2018
+ add_special_tokens: bool = Field(
2019
+ default=True,
2020
+ description=(
2021
+ "If true (the default), special tokens (e.g. BOS) will be added to "
2022
+ "the prompt."
2023
+ ),
2024
+ )
2025
+ request_id: str = Field(
2026
+ default_factory=lambda: f"{random_uuid()}",
2027
+ description=(
2028
+ "The request_id related to this request. If the caller does "
2029
+ "not set it, a random_uuid will be generated. This id is used "
2030
+ "through out the inference process and return in response."
2031
+ ),
2032
+ )
2033
+ softmax: bool | None = Field(
2034
+ default=None,
2035
+ description="softmax will be deprecated, please use use_activation instead.",
2036
+ )
2037
+
2038
+ activation: bool | None = Field(
2039
+ default=None,
2040
+ description="activation will be deprecated, please use use_activation instead.",
2041
+ )
2042
+
2043
+ use_activation: bool | None = Field(
2044
+ default=None,
2045
+ description="Whether to use activation for classification outputs. "
2046
+ "Default is True.",
2047
+ )
2048
+ # --8<-- [end:classification-extra-params]
2049
+
2050
+ def to_pooling_params(self):
2051
+ return PoolingParams(
2052
+ truncate_prompt_tokens=self.truncate_prompt_tokens,
2053
+ use_activation=get_use_activation(self),
2054
+ )
2055
+
2056
+
2057
+ class ClassificationChatRequest(OpenAIBaseModel):
2058
+ model: str | None = None
2059
+ messages: list[ChatCompletionMessageParam]
2060
+ truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
2061
+ user: str | None = None
2062
+
2063
+ # --8<-- [start:chat-classification-extra-params]
2064
+ add_generation_prompt: bool = Field(
2065
+ default=False,
2066
+ description=(
2067
+ "If true, the generation prompt will be added to the chat template. "
2068
+ "This is a parameter used by chat template in tokenizer config of the "
2069
+ "model."
2070
+ ),
2071
+ )
2072
+
2073
+ add_special_tokens: bool = Field(
2074
+ default=False,
2075
+ description=(
2076
+ "If true, special tokens (e.g. BOS) will be added to the prompt "
2077
+ "on top of what is added by the chat template. "
2078
+ "For most models, the chat template takes care of adding the "
2079
+ "special tokens so this should be set to false (as is the "
2080
+ "default)."
2081
+ ),
2082
+ )
2083
+
2084
+ chat_template: str | None = Field(
2085
+ default=None,
2086
+ description=(
2087
+ "A Jinja template to use for this conversion. "
2088
+ "As of transformers v4.44, default chat template is no longer "
2089
+ "allowed, so you must provide a chat template if the tokenizer "
2090
+ "does not define one."
2091
+ ),
2092
+ )
2093
+
2094
+ chat_template_kwargs: dict[str, Any] | None = Field(
2095
+ default=None,
2096
+ description=(
2097
+ "Additional keyword args to pass to the template renderer. "
2098
+ "Will be accessible by the chat template."
2099
+ ),
2100
+ )
2101
+
2102
+ mm_processor_kwargs: dict[str, Any] | None = Field(
2103
+ default=None,
2104
+ description=("Additional kwargs to pass to the HF processor."),
2105
+ )
2106
+
2107
+ priority: int = Field(
2108
+ default=0,
2109
+ description=(
2110
+ "The priority of the request (lower means earlier handling; "
2111
+ "default: 0). Any priority other than 0 will raise an error "
2112
+ "if the served model does not use priority scheduling."
2113
+ ),
2114
+ )
2115
+
2116
+ request_id: str = Field(
2117
+ default_factory=lambda: f"{random_uuid()}",
2118
+ description=(
2119
+ "The request_id related to this request. If the caller does "
2120
+ "not set it, a random_uuid will be generated. This id is used "
2121
+ "through out the inference process and return in response."
2122
+ ),
2123
+ )
2124
+ softmax: bool | None = Field(
2125
+ default=None,
2126
+ description="softmax will be deprecated, please use use_activation instead.",
2127
+ )
2128
+
2129
+ activation: bool | None = Field(
2130
+ default=None,
2131
+ description="activation will be deprecated, please use use_activation instead.",
2132
+ )
2133
+
2134
+ use_activation: bool | None = Field(
2135
+ default=None,
2136
+ description="Whether to use activation for classification outputs. "
2137
+ "Default is True.",
2138
+ )
2139
+ # --8<-- [end:chat-classification-extra-params]
2140
+
2141
+ def to_pooling_params(self):
2142
+ return PoolingParams(
2143
+ truncate_prompt_tokens=self.truncate_prompt_tokens,
2144
+ use_activation=get_use_activation(self),
2145
+ )
2146
+
2147
+
2148
+ ClassificationRequest: TypeAlias = (
2149
+ ClassificationCompletionRequest | ClassificationChatRequest
2150
+ )
2151
+
2152
+
2153
+ class ClassificationData(OpenAIBaseModel):
2154
+ index: int
2155
+ label: str | None
2156
+ probs: list[float]
2157
+ num_classes: int
2158
+
2159
+
2160
+ class ClassificationResponse(OpenAIBaseModel):
2161
+ id: str = Field(default_factory=lambda: f"classify-{random_uuid()}")
2162
+ object: str = "list"
2163
+ created: int = Field(default_factory=lambda: int(time.time()))
2164
+ model: str
2165
+ data: list[ClassificationData]
2166
+ usage: UsageInfo
2167
+
2168
+
2169
+ class FunctionCall(OpenAIBaseModel):
2170
+ name: str
2171
+ arguments: str
2172
+
2173
+
2174
+ class ToolCall(OpenAIBaseModel):
2175
+ id: str = Field(default_factory=make_tool_call_id)
2176
+ type: Literal["function"] = "function"
2177
+ function: FunctionCall
2178
+
2179
+
2180
+ class DeltaFunctionCall(BaseModel):
2181
+ name: str | None = None
2182
+ arguments: str | None = None
2183
+
2184
+
2185
+ # a tool call delta where everything is optional
2186
+ class DeltaToolCall(OpenAIBaseModel):
2187
+ id: str | None = None
2188
+ type: Literal["function"] | None = None
2189
+ index: int
2190
+ function: DeltaFunctionCall | None = None
2191
+
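# Sketch of reassembling streamed DeltaToolCall fragments into complete
# calls, keyed by index. The fragment dicts are hypothetical examples of the
# optional-field deltas defined above.
fragments = [
    {"index": 0, "id": "call_1", "function": {"name": "get_weather", "arguments": ""}},
    {"index": 0, "function": {"arguments": '{"city": '}},
    {"index": 0, "function": {"arguments": '"Paris"}'}},
]
calls: dict[int, dict] = {}
for frag in fragments:
    call = calls.setdefault(frag["index"], {"id": None, "name": None, "arguments": ""})
    if frag.get("id"):
        call["id"] = frag["id"]
    fn = frag.get("function") or {}
    if fn.get("name"):
        call["name"] = fn["name"]
    call["arguments"] += fn.get("arguments", "")
assert calls[0]["arguments"] == '{"city": "Paris"}'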
2192
+
2193
+ class ExtractedToolCallInformation(BaseModel):
2194
+ # indicate if tools were called
2195
+ tools_called: bool
2196
+
2197
+ # extracted tool calls
2198
+ tool_calls: list[ToolCall]
2199
+
2200
+ # content - per the OpenAI spec, content AND tool calls are only rarely
2201
+ # returned together, but some models will do this intentionally
2202
+ content: str | None = None
2203
+
2204
+
2205
+ class ChatMessage(OpenAIBaseModel):
2206
+ role: str
2207
+ content: str | None = None
2208
+ refusal: str | None = None
2209
+ annotations: OpenAIAnnotation | None = None
2210
+ audio: OpenAIChatCompletionAudio | None = None
2211
+ function_call: FunctionCall | None = None
2212
+ tool_calls: list[ToolCall] = Field(default_factory=list)
2213
+
2214
+ # vLLM-specific fields that are not in OpenAI spec
2215
+ reasoning: str | None = None
2216
+ reasoning_content: str | None = None
2217
+ """Deprecated: use `reasoning` instead."""
2218
+
2219
+ @model_validator(mode="after")
2220
+ def handle_deprecated_reasoning_content(self):
2221
+ """Copy reasoning to reasoning_content for backward compatibility."""
2222
+ self.reasoning_content = self.reasoning
2223
+ return self
2224
+
2225
+
2226
+ class ChatCompletionLogProb(OpenAIBaseModel):
2227
+ token: str
2228
+ logprob: float = -9999.0
2229
+ bytes: list[int] | None = None
2230
+
2231
+
2232
+ class ChatCompletionLogProbsContent(ChatCompletionLogProb):
2233
+ # Workaround: redefine the field names cache so that it's not
2234
+ # shared with the superclass.
2235
+ field_names: ClassVar[set[str] | None] = None
2236
+ top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)
2237
+
2238
+
2239
+ class ChatCompletionLogProbs(OpenAIBaseModel):
2240
+ content: list[ChatCompletionLogProbsContent] | None = None
2241
+
2242
+
2243
+ class ChatCompletionResponseChoice(OpenAIBaseModel):
2244
+ index: int
2245
+ message: ChatMessage
2246
+ logprobs: ChatCompletionLogProbs | None = None
2247
+ # per OpenAI spec this is the default
2248
+ finish_reason: str | None = "stop"
2249
+ # not part of the OpenAI spec but included in vLLM for legacy reasons
2250
+ stop_reason: int | str | None = None
2251
+ # not part of the OpenAI spec but is useful for tracing the tokens
2252
+ # in agent scenarios
2253
+ token_ids: list[int] | None = None
2254
+
2255
+
2256
+ class ChatCompletionResponse(OpenAIBaseModel):
2257
+ id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
2258
+ object: Literal["chat.completion"] = "chat.completion"
2259
+ created: int = Field(default_factory=lambda: int(time.time()))
2260
+ model: str
2261
+ choices: list[ChatCompletionResponseChoice]
2262
+ service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
2263
+ system_fingerprint: str | None = None
2264
+ usage: UsageInfo
2265
+
2266
+ # vLLM-specific fields that are not in OpenAI spec
2267
+ prompt_logprobs: list[dict[int, Logprob] | None] | None = None
2268
+ prompt_token_ids: list[int] | None = None
2269
+ kv_transfer_params: dict[str, Any] | None = Field(
2270
+ default=None, description="KVTransfer parameters."
2271
+ )
2272
+
2273
+
2274
+ class DeltaMessage(OpenAIBaseModel):
2275
+ role: str | None = None
2276
+ content: str | None = None
2277
+ reasoning: str | None = None
2278
+ reasoning_content: str | None = None
2279
+ """Deprecated: use `reasoning` instead."""
2280
+ tool_calls: list[DeltaToolCall] = Field(default_factory=list)
2281
+
2282
+ @model_validator(mode="after")
2283
+ def handle_deprecated_reasoning_content(self):
2284
+ """Copy reasoning to reasoning_content for backward compatibility."""
2285
+ self.reasoning_content = self.reasoning
2286
+ return self
2287
+
2288
+
2289
+ class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
2290
+ index: int
2291
+ delta: DeltaMessage
2292
+ logprobs: ChatCompletionLogProbs | None = None
2293
+ finish_reason: str | None = None
2294
+ stop_reason: int | str | None = None
2295
+ # not part of the OpenAI spec but for tracing the tokens
2296
+ token_ids: list[int] | None = None
2297
+
2298
+
2299
+ class ChatCompletionStreamResponse(OpenAIBaseModel):
2300
+ id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
2301
+ object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
2302
+ created: int = Field(default_factory=lambda: int(time.time()))
2303
+ model: str
2304
+ choices: list[ChatCompletionResponseStreamChoice]
2305
+ usage: UsageInfo | None = Field(default=None)
2306
+ # not part of the OpenAI spec but for tracing the tokens
2307
+ prompt_token_ids: list[int] | None = None
2308
+
2309
+
2310
+ class TranscriptionResponseStreamChoice(OpenAIBaseModel):
2311
+ delta: DeltaMessage
2312
+ finish_reason: str | None = None
2313
+ stop_reason: int | str | None = None
2314
+
2315
+
2316
+ class TranscriptionStreamResponse(OpenAIBaseModel):
2317
+ id: str = Field(default_factory=lambda: f"trsc-{random_uuid()}")
2318
+ object: Literal["transcription.chunk"] = "transcription.chunk"
2319
+ created: int = Field(default_factory=lambda: int(time.time()))
2320
+ model: str
2321
+ choices: list[TranscriptionResponseStreamChoice]
2322
+ usage: UsageInfo | None = Field(default=None)
2323
+
2324
+
2325
+ class InputTokensDetails(OpenAIBaseModel):
2326
+ cached_tokens: int
2327
+ input_tokens_per_turn: list[int] = Field(default_factory=list)
2328
+ cached_tokens_per_turn: list[int] = Field(default_factory=list)
2329
+
2330
+
2331
+ class OutputTokensDetails(OpenAIBaseModel):
2332
+ reasoning_tokens: int = 0
2333
+ tool_output_tokens: int = 0
2334
+ output_tokens_per_turn: list[int] = Field(default_factory=list)
2335
+ tool_output_tokens_per_turn: list[int] = Field(default_factory=list)
2336
+
2337
+
2338
+ class ResponseUsage(OpenAIBaseModel):
2339
+ input_tokens: int
2340
+ input_tokens_details: InputTokensDetails
2341
+ output_tokens: int
2342
+ output_tokens_details: OutputTokensDetails
2343
+ total_tokens: int
2344
+
2345
+
2346
+ def serialize_message(msg):
2347
+ """
2348
+ Serializes a single message
2349
+ """
2350
+ if isinstance(msg, dict):
2351
+ return msg
2352
+ elif hasattr(msg, "to_dict"):
2353
+ return msg.to_dict()
2354
+ else:
2355
+ # fall back to pydantic dump
2356
+ return msg.model_dump_json()
2357
+
2358
+
2359
+ def serialize_messages(msgs):
2360
+ """
2361
+ Serializes multiple messages
2362
+ """
2363
+ return [serialize_message(msg) for msg in msgs] if msgs else None
2364
+
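# Sketch of the three serialize_message branches above (illustrative types):
# dicts pass through, to_dict() objects are converted, and anything else
# falls back to the pydantic JSON dump (note: that branch returns a str).
class _WithToDict:
    def to_dict(self):
        return {"role": "user", "content": "hi"}

assert serialize_message({"role": "user"}) == {"role": "user"}
assert serialize_message(_WithToDict()) == {"role": "user", "content": "hi"}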
2365
+
2366
+ class ResponsesResponse(OpenAIBaseModel):
2367
+ id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
2368
+ created_at: int = Field(default_factory=lambda: int(time.time()))
2369
+ # error: Optional[ResponseError] = None
2370
+ incomplete_details: IncompleteDetails | None = None
2371
+ instructions: str | None = None
2372
+ metadata: Metadata | None = None
2373
+ model: str
2374
+ object: Literal["response"] = "response"
2375
+ output: list[ResponseOutputItem]
2376
+ parallel_tool_calls: bool
2377
+ temperature: float
2378
+ tool_choice: ToolChoice
2379
+ tools: list[Tool]
2380
+ top_p: float
2381
+ background: bool
2382
+ max_output_tokens: int
2383
+ max_tool_calls: int | None = None
2384
+ previous_response_id: str | None = None
2385
+ prompt: ResponsePrompt | None = None
2386
+ reasoning: Reasoning | None = None
2387
+ service_tier: Literal["auto", "default", "flex", "scale", "priority"]
2388
+ status: ResponseStatus
2389
+ text: ResponseTextConfig | None = None
2390
+ top_logprobs: int | None = None
2391
+ truncation: Literal["auto", "disabled"]
2392
+ usage: ResponseUsage | None = None
2393
+ user: str | None = None
2394
+
2395
+ # --8<-- [start:responses-extra-params]
2396
+ # These are populated when enable_response_messages is set to True
2397
+ # NOTE: custom serialization is needed
2398
+ # see serialize_input_messages and serialize_output_messages
2399
+ input_messages: list[ChatCompletionMessageParam] | None = None
2400
+ output_messages: list[ChatCompletionMessageParam] | None = None
2401
+ # --8<-- [end:responses-extra-params]
2402
+
2403
+ # NOTE: OpenAI harmony doesn't serialize TextContent properly,
2404
+ # TODO: this fixes TextContent, but needs verification for tools etc.
2405
+ # https://github.com/openai/harmony/issues/78
2406
+ @field_serializer("output_messages", when_used="json")
2407
+ def serialize_output_messages(self, msgs, _info):
2408
+ return serialize_messages(msgs)
2409
+
2410
+ # NOTE: OpenAI harmony doesn't serialize TextContent properly, this fixes it
2411
+ # https://github.com/openai/harmony/issues/78
2412
+ @field_serializer("input_messages", when_used="json")
2413
+ def serialize_input_messages(self, msgs, _info):
2414
+ return serialize_messages(msgs)
2415
+
2416
+ @classmethod
2417
+ def from_request(
2418
+ cls,
2419
+ request: ResponsesRequest,
2420
+ sampling_params: SamplingParams,
2421
+ model_name: str,
2422
+ created_time: int,
2423
+ output: list[ResponseOutputItem],
2424
+ status: ResponseStatus,
2425
+ usage: ResponseUsage | None = None,
2426
+ input_messages: list[ChatCompletionMessageParam] | None = None,
2427
+ output_messages: list[ChatCompletionMessageParam] | None = None,
2428
+ ) -> "ResponsesResponse":
2429
+ incomplete_details: IncompleteDetails | None = None
2430
+ if status == "incomplete":
2431
+ incomplete_details = IncompleteDetails(reason="max_output_tokens")
2432
+ # TODO: implement the other reason for incomplete_details,
2433
+ # which is content_filter
2434
+ # incomplete_details = IncompleteDetails(reason='content_filter')
2435
+ return cls(
2436
+ id=request.request_id,
2437
+ created_at=created_time,
2438
+ incomplete_details=incomplete_details,
2439
+ instructions=request.instructions,
2440
+ metadata=request.metadata,
2441
+ model=model_name,
2442
+ output=output,
2443
+ input_messages=input_messages,
2444
+ output_messages=output_messages,
2445
+ parallel_tool_calls=request.parallel_tool_calls,
2446
+ temperature=sampling_params.temperature,
2447
+ tool_choice=request.tool_choice,
2448
+ tools=request.tools,
2449
+ top_p=sampling_params.top_p,
2450
+ background=request.background,
2451
+ max_output_tokens=sampling_params.max_tokens,
2452
+ max_tool_calls=request.max_tool_calls,
2453
+ previous_response_id=request.previous_response_id,
2454
+ prompt=request.prompt,
2455
+ reasoning=request.reasoning,
2456
+ service_tier=request.service_tier,
2457
+ status=status,
2458
+ text=request.text,
2459
+ top_logprobs=sampling_params.logprobs,
2460
+ truncation=request.truncation,
2461
+ user=request.user,
2462
+ usage=usage,
2463
+ )
2464
+
2465
+
2466
+ # TODO: this code can be removed once
2467
+ # https://github.com/openai/openai-python/issues/2634 has been resolved
2468
+ class ResponseReasoningPartDoneEvent(OpenAIBaseModel):
2469
+ content_index: int
2470
+ """The index of the content part that is done."""
2471
+
2472
+ item_id: str
2473
+ """The ID of the output item that the content part was added to."""
2474
+
2475
+ output_index: int
2476
+ """The index of the output item that the content part was added to."""
2477
+
2478
+ part: ResponseReasoningTextContent
2479
+ """The content part that is done."""
2480
+
2481
+ sequence_number: int
2482
+ """The sequence number of this event."""
2483
+
2484
+ type: Literal["response.reasoning_part.done"]
2485
+ """The type of the event. Always `response.reasoning_part.done`."""
2486
+
2487
+
2488
+ # TODO: this code can be removed once
2489
+ # https://github.com/openai/openai-python/issues/2634 has been resolved
2490
+ class ResponseReasoningPartAddedEvent(OpenAIBaseModel):
2491
+ content_index: int
2492
+ """The index of the content part that is done."""
2493
+
2494
+ item_id: str
2495
+ """The ID of the output item that the content part was added to."""
2496
+
2497
+ output_index: int
2498
+ """The index of the output item that the content part was added to."""
2499
+
2500
+ part: ResponseReasoningTextContent
2501
+ """The content part that is done."""
2502
+
2503
+ sequence_number: int
2504
+ """The sequence number of this event."""
2505
+
2506
+ type: Literal["response.reasoning_part.added"]
2507
+ """The type of the event. Always `response.reasoning_part.added`."""
2508
+
2509
+
2510
+ # vLLM Streaming Events
2511
+ # Note: we override the response type with the vLLM ResponsesResponse type
2512
+ class ResponseCompletedEvent(OpenAIResponseCompletedEvent):
2513
+ response: ResponsesResponse # type: ignore[override]
2514
+
2515
+
2516
+ class ResponseCreatedEvent(OpenAIResponseCreatedEvent):
2517
+ response: ResponsesResponse # type: ignore[override]
2518
+
2519
+
2520
+ class ResponseInProgressEvent(OpenAIResponseInProgressEvent):
2521
+ response: ResponsesResponse # type: ignore[override]
2522
+
2523
+
2524
+ StreamingResponsesResponse: TypeAlias = (
2525
+ ResponseCreatedEvent
2526
+ | ResponseInProgressEvent
2527
+ | ResponseCompletedEvent
2528
+ | ResponseOutputItemAddedEvent
2529
+ | ResponseOutputItemDoneEvent
2530
+ | ResponseContentPartAddedEvent
2531
+ | ResponseContentPartDoneEvent
2532
+ | ResponseReasoningTextDeltaEvent
2533
+ | ResponseReasoningTextDoneEvent
2534
+ | ResponseReasoningPartAddedEvent
2535
+ | ResponseReasoningPartDoneEvent
2536
+ | ResponseCodeInterpreterCallInProgressEvent
2537
+ | ResponseCodeInterpreterCallCodeDeltaEvent
2538
+ | ResponseWebSearchCallInProgressEvent
2539
+ | ResponseWebSearchCallSearchingEvent
2540
+ | ResponseWebSearchCallCompletedEvent
2541
+ | ResponseCodeInterpreterCallCodeDoneEvent
2542
+ | ResponseCodeInterpreterCallInterpretingEvent
2543
+ | ResponseCodeInterpreterCallCompletedEvent
2544
+ )
2545
+
2546
+ BatchRequestInputBody: TypeAlias = (
2547
+ ChatCompletionRequest | EmbeddingRequest | ScoreRequest | RerankRequest
2548
+ )
2549
+
2550
+
2551
+ class BatchRequestInput(OpenAIBaseModel):
2552
+ """
2553
+ The per-line object of the batch input file.
2554
+
2555
+ NOTE: Currently only the `/v1/chat/completions` endpoint is supported.
2556
+ """
2557
+
2558
+ # A developer-provided per-request id that will be used to match outputs to
2559
+ # inputs. Must be unique for each request in a batch.
2560
+ custom_id: str
2561
+
2562
+ # The HTTP method to be used for the request. Currently only POST is
2563
+ # supported.
2564
+ method: str
2565
+
2566
+ # The OpenAI API relative URL to be used for the request. Currently
2567
+ # /v1/chat/completions is supported.
2568
+ url: str
2569
+
2570
+ # The parameters of the request.
2571
+ body: BatchRequestInputBody
2572
+
2573
+ @field_validator("body", mode="plain")
2574
+ @classmethod
2575
+ def check_type_for_url(cls, value: Any, info: ValidationInfo):
2576
+ # Use url to disambiguate models
2577
+ url: str = info.data["url"]
2578
+ if url == "/v1/chat/completions":
2579
+ return ChatCompletionRequest.model_validate(value)
2580
+ if url == "/v1/embeddings":
2581
+ return TypeAdapter(EmbeddingRequest).validate_python(value)
2582
+ if url.endswith("/score"):
2583
+ return ScoreRequest.model_validate(value)
2584
+ if url.endswith("/rerank"):
2585
+ return RerankRequest.model_validate(value)
2586
+ return TypeAdapter(BatchRequestInputBody).validate_python(value)
2587
+
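# Sketch of one JSONL line from a batch input file; check_type_for_url above
# routes this body to ChatCompletionRequest because of its url. Values are
# hypothetical.
import json

line = json.dumps({
    "custom_id": "request-1",
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "my-model",
        "messages": [{"role": "user", "content": "Hello!"}],
    },
})
assert json.loads(line)["url"] == "/v1/chat/completions"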
2588
+
2589
+ class BatchResponseData(OpenAIBaseModel):
2590
+ # HTTP status code of the response.
2591
+ status_code: int = 200
2592
+
2593
+ # A unique identifier for the API request.
2594
+ request_id: str
2595
+
2596
+ # The body of the response.
2597
+ body: (
2598
+ ChatCompletionResponse
2599
+ | EmbeddingResponse
2600
+ | ScoreResponse
2601
+ | RerankResponse
2602
+ | None
2603
+ ) = None
2604
+
2605
+
2606
+ class BatchRequestOutput(OpenAIBaseModel):
2607
+ """
2608
+ The per-line object of the batch output and error files
2609
+ """
2610
+
2611
+ id: str
2612
+
2613
+ # A developer-provided per-request id that will be used to match outputs to
2614
+ # inputs.
2615
+ custom_id: str
2616
+
2617
+ response: BatchResponseData | None
2618
+
2619
+ # For requests that failed with a non-HTTP error, this will contain more
2620
+ # information on the cause of the failure.
2621
+ error: Any | None
2622
+
2623
+
2624
+ class TokenizeCompletionRequest(OpenAIBaseModel):
2625
+ model: str | None = None
2626
+ prompt: str
2627
+
2628
+ add_special_tokens: bool = Field(
2629
+ default=True,
2630
+ description=(
2631
+ "If true (the default), special tokens (e.g. BOS) will be added to "
2632
+ "the prompt."
2633
+ ),
2634
+ )
2635
+ return_token_strs: bool | None = Field(
2636
+ default=False,
2637
+ description=(
2638
+ "If true, also return the token strings corresponding to the token ids."
2639
+ ),
2640
+ )
2641
+
2642
+
2643
+ class TokenizeChatRequest(OpenAIBaseModel):
2644
+ model: str | None = None
2645
+ messages: list[ChatCompletionMessageParam]
2646
+
2647
+ add_generation_prompt: bool = Field(
2648
+ default=True,
2649
+ description=(
2650
+ "If true, the generation prompt will be added to the chat template. "
2651
+ "This is a parameter used by chat template in tokenizer config of the "
2652
+ "model."
2653
+ ),
2654
+ )
2655
+ return_token_strs: bool | None = Field(
2656
+ default=False,
2657
+ description=(
2658
+ "If true, also return the token strings corresponding to the token ids."
2659
+ ),
2660
+ )
2661
+ continue_final_message: bool = Field(
2662
+ default=False,
2663
+ description=(
2664
+ "If this is set, the chat will be formatted so that the final "
2665
+ "message in the chat is open-ended, without any EOS tokens. The "
2666
+ "model will continue this message rather than starting a new one. "
2667
+ 'This allows you to "prefill" part of the model\'s response for it. '
2668
+ "Cannot be used at the same time as `add_generation_prompt`."
2669
+ ),
2670
+ )
2671
+ add_special_tokens: bool = Field(
2672
+ default=False,
2673
+ description=(
2674
+ "If true, special tokens (e.g. BOS) will be added to the prompt "
2675
+ "on top of what is added by the chat template. "
2676
+ "For most models, the chat template takes care of adding the "
2677
+ "special tokens so this should be set to false (as is the "
2678
+ "default)."
2679
+ ),
2680
+ )
2681
+ chat_template: str | None = Field(
2682
+ default=None,
2683
+ description=(
2684
+ "A Jinja template to use for this conversion. "
2685
+ "As of transformers v4.44, default chat template is no longer "
2686
+ "allowed, so you must provide a chat template if the tokenizer "
2687
+ "does not define one."
2688
+ ),
2689
+ )
2690
+ chat_template_kwargs: dict[str, Any] | None = Field(
2691
+ default=None,
2692
+ description=(
2693
+ "Additional keyword args to pass to the template renderer. "
2694
+ "Will be accessible by the chat template."
2695
+ ),
2696
+ )
2697
+ mm_processor_kwargs: dict[str, Any] | None = Field(
2698
+ default=None,
2699
+ description=("Additional kwargs to pass to the HF processor."),
2700
+ )
2701
+ tools: list[ChatCompletionToolsParam] | None = Field(
2702
+ default=None,
2703
+ description=("A list of tools the model may call."),
2704
+ )
2705
+
2706
+ @model_validator(mode="before")
2707
+ @classmethod
2708
+ def check_generation_prompt(cls, data):
2709
+ if data.get("continue_final_message") and data.get("add_generation_prompt"):
2710
+ raise ValueError(
2711
+ "Cannot set both `continue_final_message` and "
2712
+ "`add_generation_prompt` to True."
2713
+ )
2714
+ return data
2715
+
2716
+
2717
+ TokenizeRequest: TypeAlias = TokenizeCompletionRequest | TokenizeChatRequest
2718
+
2719
+
2720
+ class TokenizeResponse(OpenAIBaseModel):
2721
+ count: int
2722
+ max_model_len: int
2723
+ tokens: list[int]
2724
+ token_strs: list[str] | None = None
2725
+
2726
+
2727
+ class DetokenizeRequest(OpenAIBaseModel):
2728
+ model: str | None = None
2729
+ tokens: list[int]
2730
+
2731
+
2732
+ class DetokenizeResponse(OpenAIBaseModel):
2733
+ prompt: str
2734
+
2735
+
2736
+ class TokenizerInfoResponse(OpenAIBaseModel):
2737
+ """
2738
+ Response containing tokenizer configuration
2739
+ equivalent to tokenizer_config.json
2740
+ """
2741
+
2742
+ model_config = ConfigDict(extra="allow")
2743
+ tokenizer_class: str
2744
+
2745
+
2746
+ class LoadLoRAAdapterRequest(BaseModel):
2747
+ lora_name: str
2748
+ lora_path: str
2749
+
2750
+
2751
+ class UnloadLoRAAdapterRequest(BaseModel):
2752
+ lora_name: str
2753
+ lora_int_id: int | None = Field(default=None)
2754
+
2755
+
2756
+ ## Protocols for Audio
2757
+ AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt"]
2758
+
2759
+
2760
+ class TranscriptionRequest(OpenAIBaseModel):
2761
+ # Ordered by official OpenAI API documentation
2762
+ # https://platform.openai.com/docs/api-reference/audio/createTranscription
2763
+
2764
+ file: UploadFile
2765
+ """
2766
+ The audio file object (not file name) to transcribe, in one of these
2767
+ formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
2768
+ """
2769
+
2770
+ model: str | None = None
2771
+ """ID of the model to use.
2772
+ """
2773
+
2774
+ language: str | None = None
2775
+ """The language of the input audio.
2776
+
2777
+ Supplying the input language in
2778
+ [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
2779
+ will improve accuracy and latency.
2780
+ """
2781
+
2782
+ prompt: str = Field(default="")
2783
+ """An optional text to guide the model's style or continue a previous audio
2784
+ segment.
2785
+
2786
+ The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
2787
+ should match the audio language.
2788
+ """
2789
+
2790
+ response_format: AudioResponseFormat = Field(default="json")
2791
+ """
2792
+ The format of the output, in one of these options: `json`, `text`, `srt`,
2793
+ `verbose_json`, or `vtt`.
2794
+ """
2795
+
2796
+ ## TODO (varun): support auto-increasing temperature when set to 0 until certain thresholds are met
2797
+
2798
+ timestamp_granularities: list[Literal["word", "segment"]] = Field(
2799
+ alias="timestamp_granularities[]", default=[]
2800
+ )
2801
+ """The timestamp granularities to populate for this transcription.
2802
+
2803
+ `response_format` must be set to `verbose_json` to use timestamp granularities.
2804
+ Either or both of these options are supported: `word`, or `segment`. Note:
2805
+ There is no additional latency for segment timestamps, but generating word
2806
+ timestamps incurs additional latency.
2807
+ """
2808
+
2809
+ stream: bool | None = False
2810
+ """When set, it will enable output to be streamed in a similar fashion
2811
+ to the Chat Completion endpoint.
2812
+ """
2813
+ # --8<-- [start:transcription-extra-params]
2814
+ # Flattened stream option to simplify form data.
2815
+ stream_include_usage: bool | None = False
2816
+ stream_continuous_usage_stats: bool | None = False
2817
+
2818
+ vllm_xargs: dict[str, str | int | float] | None = Field(
2819
+ default=None,
2820
+ description=(
2821
+ "Additional request parameters with string or "
2822
+ "numeric values, used by custom extensions."
2823
+ ),
2824
+ )
2825
+ # --8<-- [end:transcription-extra-params]
2826
+
2827
+ to_language: str | None = None
2828
+ """The language of the output audio we transcribe to.
2829
+
2830
+ Note that this is not currently used by supported models; it is a
2831
+ placeholder for future use, matching the translation API.
2832
+ """
2833
+
+    # --8<-- [start:transcription-sampling-params]
+    temperature: float = Field(default=0.0)
+    """The sampling temperature, between 0 and 1.
+
+    Higher values like 0.8 will make the output more random, while lower values
+    like 0.2 will make it more focused / deterministic. If set to 0, the model
+    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
+    to automatically increase the temperature until certain thresholds are hit.
+    """
+
+    top_p: float | None = None
+    """Enables nucleus (top-p) sampling, where tokens are selected from the
+    smallest possible set whose cumulative probability exceeds `p`.
+    """
+
+    top_k: int | None = None
+    """Limits sampling to the `k` most probable tokens at each step."""
+
+    min_p: float | None = None
+    """Filters out tokens with a probability lower than `min_p`, ensuring a
+    minimum likelihood threshold during sampling.
+    """
+
+    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    """The seed to use for sampling."""
+
+    frequency_penalty: float | None = 0.0
+    """The frequency penalty to use for sampling."""
+
+    repetition_penalty: float | None = None
+    """The repetition penalty to use for sampling."""
+
+    presence_penalty: float | None = 0.0
+    """The presence penalty to use for sampling."""
+    # --8<-- [end:transcription-sampling-params]
+
+    # Default sampling parameters for transcription requests.
+    _DEFAULT_SAMPLING_PARAMS: dict = {
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": 0,
+        "min_p": 0.0,
+    }
+
+    def to_sampling_params(
+        self, default_max_tokens: int, default_sampling_params: dict | None = None
+    ) -> SamplingParams:
+        max_tokens = default_max_tokens
+
+        if default_sampling_params is None:
+            default_sampling_params = {}
+
+        # Fall back to the server-level defaults, then to the class defaults.
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+        if (top_p := self.top_p) is None:
+            top_p = default_sampling_params.get(
+                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
+            )
+        if (top_k := self.top_k) is None:
+            top_k = default_sampling_params.get(
+                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
+            )
+        if (min_p := self.min_p) is None:
+            min_p = default_sampling_params.get(
+                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
+            )
+
+        if (repetition_penalty := self.repetition_penalty) is None:
+            repetition_penalty = default_sampling_params.get(
+                "repetition_penalty",
+                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+            )
+
+        return SamplingParams.from_optional(
+            temperature=temperature,
+            max_tokens=max_tokens,
+            seed=self.seed,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            frequency_penalty=self.frequency_penalty,
+            repetition_penalty=repetition_penalty,
+            presence_penalty=self.presence_penalty,
+            output_kind=RequestOutputKind.DELTA
+            if self.stream
+            else RequestOutputKind.FINAL_ONLY,
+            extra_args=self.vllm_xargs,
+        )
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_transcription_request(cls, data):
+        if isinstance(data.get("file"), str):
+            raise HTTPException(
+                status_code=HTTPStatus.UNPROCESSABLE_ENTITY,
+                detail="Expected 'file' to be a file-like object, not 'str'.",
+            )
+
+        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
+        stream = data.get("stream", False)
+        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
+            raise ValueError("Stream options can only be defined when `stream=True`.")
+
+        return data
+
+
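A short sketch of how a handler can turn one of these requests into SamplingParams (the model name and token budget are illustrative, not defaults; UploadFile is FastAPI's):

import io
from fastapi import UploadFile

req = TranscriptionRequest(
    file=UploadFile(file=io.BytesIO(b"<audio bytes>"), filename="audio.wav"),
    model="openai/whisper-large-v3",
    temperature=0.2,
    stream=True,
)
params = req.to_sampling_params(default_max_tokens=448)
# stream=True selects RequestOutputKind.DELTA; unset fields (top_p, top_k,
# min_p, repetition_penalty) fall back to _DEFAULT_SAMPLING_PARAMS.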
+# Transcription response objects
+class TranscriptionUsageAudio(OpenAIBaseModel):
+    type: Literal["duration"] = "duration"
+    seconds: int
+
+
+class TranscriptionResponse(OpenAIBaseModel):
+    text: str
+    """The transcribed text."""
+    usage: TranscriptionUsageAudio
+
+
+class TranscriptionWord(OpenAIBaseModel):
+    end: float
+    """End time of the word in seconds."""
+
+    start: float
+    """Start time of the word in seconds."""
+
+    word: str
+    """The text content of the word."""
+
+
+class TranscriptionSegment(OpenAIBaseModel):
+    id: int
+    """Unique identifier of the segment."""
+
+    avg_logprob: float
+    """Average logprob of the segment.
+
+    If the value is lower than -1, consider the logprobs failed.
+    """
+
+    compression_ratio: float
+    """Compression ratio of the segment.
+
+    If the value is greater than 2.4, consider the compression failed.
+    """
+
+    end: float
+    """End time of the segment in seconds."""
+
+    no_speech_prob: float
+    """Probability of no speech in the segment.
+
+    If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
+    this segment silent.
+    """
+
+    seek: int
+    """Seek offset of the segment."""
+
+    start: float
+    """Start time of the segment in seconds."""
+
+    temperature: float
+    """Temperature parameter used for generating the segment."""
+
+    text: str
+    """Text content of the segment."""
+
+    tokens: list[int]
+    """Array of token IDs for the text content."""
+
+
+class TranscriptionResponseVerbose(OpenAIBaseModel):
+    duration: str
+    """The duration of the input audio."""
+
+    language: str
+    """The language of the input audio."""
+
+    text: str
+    """The transcribed text."""
+
+    segments: list[TranscriptionSegment] | None = None
+    """Segments of the transcribed text and their corresponding details."""
+
+    words: list[TranscriptionWord] | None = None
+    """Extracted words and their corresponding timestamps."""
+
+
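For orientation, a verbose_json response assembled from these models might look like this (all values illustrative):

resp = TranscriptionResponseVerbose(
    duration="3.2",
    language="en",
    text="hello world",
    segments=[
        TranscriptionSegment(
            id=0, seek=0, start=0.0, end=3.2,
            text="hello world", tokens=[1012, 995],
            temperature=0.0, avg_logprob=-0.25,
            compression_ratio=1.1, no_speech_prob=0.02,
        )
    ],
)
print(resp.model_dump_json())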
+class TranslationResponseStreamChoice(OpenAIBaseModel):
+    delta: DeltaMessage
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
+
+
+class TranslationStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"trsl-{random_uuid()}")
+    object: Literal["translation.chunk"] = "translation.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[TranslationResponseStreamChoice]
+    usage: UsageInfo | None = Field(default=None)
+
+
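A sketch of a single streamed chunk built from these models (DeltaMessage is defined earlier in this module; values illustrative):

chunk = TranslationStreamResponse(
    model="openai/whisper-large-v3",
    choices=[TranslationResponseStreamChoice(delta=DeltaMessage(content="Hello "))],
)
# Each chunk would be emitted as a server-sent event: "data: {json}\n\n"
print(chunk.model_dump_json())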
+class TranslationRequest(OpenAIBaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/audio/createTranslation
+
+    file: UploadFile
+    """
+    The audio file object (not file name) to translate, in one of these
+    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+    """
+
+    model: str | None = None
+    """ID of the model to use."""
+
+    prompt: str = Field(default="")
+    """An optional text to guide the model's style or continue a previous audio
+    segment.
+
+    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+    should match the audio language.
+    """
+
+    response_format: AudioResponseFormat = Field(default="json")
+    """
+    The format of the output, in one of these options: `json`, `text`, `srt`,
+    `verbose_json`, or `vtt`.
+    """
+
+    # TODO: support additional sampling parameters
+    # --8<-- [start:translation-sampling-params]
+    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    """The seed to use for sampling."""
+
+    temperature: float = Field(default=0.0)
+    """The sampling temperature, between 0 and 1.
+
+    Higher values like 0.8 will make the output more random, while lower values
+    like 0.2 will make it more focused / deterministic. If set to 0, the model
+    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
+    to automatically increase the temperature until certain thresholds are hit.
+    """
+    # --8<-- [end:translation-sampling-params]
+
+    # --8<-- [start:translation-extra-params]
+    language: str | None = None
+    """The language of the input audio we translate from.
+
+    Supplying the input language in
+    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
+    will improve accuracy.
+    """
+
+    to_language: str | None = None
+    """The language we translate the input audio into.
+
+    Note that this is not supported by all models; refer to the specific
+    model documentation for details. For instance, Whisper only supports
+    `to_language=en`.
+    """
+
+    stream: bool | None = False
+    """Custom field not present in the original OpenAI definition. When set,
+    enables output to be streamed in a similar fashion to the Chat Completion
+    endpoint.
+    """
+    # Flattened stream options to simplify form data.
+    stream_include_usage: bool | None = False
+    stream_continuous_usage_stats: bool | None = False
+    # --8<-- [end:translation-extra-params]
+
+    # Default sampling parameters for translation requests.
+    _DEFAULT_SAMPLING_PARAMS: dict = {
+        "temperature": 0,
+    }
+
+    def to_sampling_params(
+        self, default_max_tokens: int, default_sampling_params: dict | None = None
+    ) -> SamplingParams:
+        max_tokens = default_max_tokens
+
+        if default_sampling_params is None:
+            default_sampling_params = {}
+        # Fall back to the server-level default, then to the class default.
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+
+        return SamplingParams.from_optional(
+            temperature=temperature,
+            max_tokens=max_tokens,
+            seed=self.seed,
+            output_kind=RequestOutputKind.DELTA
+            if self.stream
+            else RequestOutputKind.FINAL_ONLY,
+        )
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_stream_options(cls, data):
+        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
+        stream = data.get("stream", False)
+        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
+            raise ValueError("Stream options can only be defined when `stream=True`.")
+
+        return data
+
+
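One consequence of the validator above, sketched: stream defaults to False, so supplying a stream option on its own fails validation (UploadFile is FastAPI's).

import io
from fastapi import UploadFile
from pydantic import ValidationError

try:
    TranslationRequest(
        file=UploadFile(file=io.BytesIO(b""), filename="a.wav"),
        stream_include_usage=True,
    )
except ValidationError as e:
    # pydantic wraps the ValueError raised in validate_stream_options
    print(e.errors()[0]["msg"])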
+# Translation response objects
+class TranslationResponse(OpenAIBaseModel):
+    text: str
+    """The translated text."""
+
+
+class TranslationWord(OpenAIBaseModel):
+    end: float
+    """End time of the word in seconds."""
+
+    start: float
+    """Start time of the word in seconds."""
+
+    word: str
+    """The text content of the word."""
+
+
+class TranslationSegment(OpenAIBaseModel):
+    id: int
+    """Unique identifier of the segment."""
+
+    avg_logprob: float
+    """Average logprob of the segment.
+
+    If the value is lower than -1, consider the logprobs failed.
+    """
+
+    compression_ratio: float
+    """Compression ratio of the segment.
+
+    If the value is greater than 2.4, consider the compression failed.
+    """
+
+    end: float
+    """End time of the segment in seconds."""
+
+    no_speech_prob: float
+    """Probability of no speech in the segment.
+
+    If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
+    this segment silent.
+    """
+
+    seek: int
+    """Seek offset of the segment."""
+
+    start: float
+    """Start time of the segment in seconds."""
+
+    temperature: float
+    """Temperature parameter used for generating the segment."""
+
+    text: str
+    """Text content of the segment."""
+
+    tokens: list[int]
+    """Array of token IDs for the text content."""
+
+
+class TranslationResponseVerbose(OpenAIBaseModel):
+    duration: str
+    """The duration of the input audio."""
+
+    language: str
+    """The language of the input audio."""
+
+    text: str
+    """The translated text."""
+
+    segments: list[TranslationSegment] | None = None
+    """Segments of the translated text and their corresponding details."""
+
+    words: list[TranslationWord] | None = None
+    """Extracted words and their corresponding timestamps."""
+
+
+####### Tokens IN <> Tokens OUT #######
+class GenerateRequest(BaseModel):
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "throughout the inference process and returned in the response."
+        ),
+    )
+    token_ids: list[int]
+    """The token ids to generate text from."""
+
+    # features: MultiModalFeatureSpec
+    # TODO (NickLucche): implement once Renderer work is completed
+    features: str | None = None
+    """The processed multimodal inputs for the model."""
+
+    sampling_params: SamplingParams
+    """The sampling parameters for the model."""
+
+    model: str | None = None
+
+    stream: bool | None = False
+    stream_options: StreamOptions | None = None
+    cache_salt: str | None = Field(
+        default=None,
+        description=(
+            "If specified, the prefix cache will be salted with the provided "
+            "string to prevent an attacker from guessing prompts in multi-user "
+            "environments. The salt should be random, protected from "
+            "access by 3rd parties, and long enough to be "
+            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+            "to 256 bits)."
+        ),
+    )
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."
+        ),
+    )
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None,
+        description="KVTransfer parameters used for disaggregated serving.",
+    )
+
+
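A hypothetical tokens-in request built from this model (token ids and salt are made up):

req = GenerateRequest(
    token_ids=[1, 3923, 374, 279, 6864],
    sampling_params=SamplingParams(max_tokens=32, temperature=0.0),
    cache_salt="kDqGQwTAonCqVDbFNjzdhbJLGTuXPHpcxG91brjQfGs",
)
assert req.request_id  # auto-generated because the caller did not set one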
+class GenerateResponseChoice(BaseModel):
+    index: int
+    logprobs: ChatCompletionLogProbs | None = None
+    # per the OpenAI spec, this is the default
+    finish_reason: str | None = "stop"
+    token_ids: list[int] | None = None
+
+
+class GenerateResponse(BaseModel):
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "throughout the inference process and returned in the response."
+        ),
+    )
+    choices: list[GenerateResponseChoice]
+
+    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
+
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None,
+        description="KVTransfer parameters used for disaggregated serving.",
+    )