vllm_cpu-0.12.0-cp313-cp313-manylinux_2_17_aarch64.whl

This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
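
For orientation, the wheel filename in the title encodes the build target using the standard PEP 425/427 tag fields. Below is a minimal sketch of decoding it, assuming the third-party `packaging` library is installed; the comments give the standard tag semantics:

```python
# Decode the wheel filename's standard fields (PEP 427 naming, PEP 425 tags).
# Requires the third-party "packaging" library (pip install packaging).
from packaging.utils import parse_wheel_filename

name, version, build, tags = parse_wheel_filename(
    "vllm_cpu-0.12.0-cp313-cp313-manylinux_2_17_aarch64.whl"
)

print(name)     # vllm-cpu (normalized project name)
print(version)  # 0.12.0
for tag in sorted(tags, key=str):
    # cp313 / cp313 / manylinux_2_17_aarch64:
    # CPython 3.13 interpreter, CPython 3.13 ABI,
    # Linux with glibc >= 2.17 on 64-bit ARM (aarch64).
    print(tag.interpreter, tag.abi, tag.platform)
```

The numbered entries that follow are the diff viewer's per-file summary for this wheel, each with its `+added -removed` line counts.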
Files changed (1600)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +107 -0
  3. vllm/_aiter_ops.py +1018 -0
  4. vllm/_bc_linter.py +54 -0
  5. vllm/_custom_ops.py +2925 -0
  6. vllm/_ipex_ops.py +457 -0
  7. vllm/_version.py +34 -0
  8. vllm/assets/__init__.py +0 -0
  9. vllm/assets/audio.py +43 -0
  10. vllm/assets/base.py +40 -0
  11. vllm/assets/image.py +59 -0
  12. vllm/assets/video.py +149 -0
  13. vllm/attention/__init__.py +0 -0
  14. vllm/attention/backends/__init__.py +0 -0
  15. vllm/attention/backends/abstract.py +434 -0
  16. vllm/attention/backends/registry.py +286 -0
  17. vllm/attention/backends/utils.py +33 -0
  18. vllm/attention/layer.py +975 -0
  19. vllm/attention/layers/__init__.py +0 -0
  20. vllm/attention/layers/chunked_local_attention.py +120 -0
  21. vllm/attention/layers/cross_attention.py +178 -0
  22. vllm/attention/layers/encoder_only_attention.py +103 -0
  23. vllm/attention/ops/__init__.py +0 -0
  24. vllm/attention/ops/chunked_prefill_paged_decode.py +401 -0
  25. vllm/attention/ops/common.py +469 -0
  26. vllm/attention/ops/flashmla.py +251 -0
  27. vllm/attention/ops/merge_attn_states.py +47 -0
  28. vllm/attention/ops/paged_attn.py +51 -0
  29. vllm/attention/ops/pallas_kv_cache_update.py +130 -0
  30. vllm/attention/ops/prefix_prefill.py +814 -0
  31. vllm/attention/ops/rocm_aiter_mla_sparse.py +210 -0
  32. vllm/attention/ops/triton_decode_attention.py +712 -0
  33. vllm/attention/ops/triton_merge_attn_states.py +116 -0
  34. vllm/attention/ops/triton_reshape_and_cache_flash.py +184 -0
  35. vllm/attention/ops/triton_unified_attention.py +941 -0
  36. vllm/attention/ops/vit_attn_wrappers.py +136 -0
  37. vllm/attention/selector.py +268 -0
  38. vllm/attention/utils/__init__.py +0 -0
  39. vllm/attention/utils/fa_utils.py +117 -0
  40. vllm/attention/utils/kv_sharing_utils.py +33 -0
  41. vllm/attention/utils/kv_transfer_utils.py +60 -0
  42. vllm/beam_search.py +88 -0
  43. vllm/benchmarks/__init__.py +0 -0
  44. vllm/benchmarks/datasets.py +3222 -0
  45. vllm/benchmarks/latency.py +172 -0
  46. vllm/benchmarks/lib/__init__.py +3 -0
  47. vllm/benchmarks/lib/endpoint_request_func.py +777 -0
  48. vllm/benchmarks/lib/ready_checker.py +72 -0
  49. vllm/benchmarks/lib/utils.py +79 -0
  50. vllm/benchmarks/serve.py +1531 -0
  51. vllm/benchmarks/sweep/__init__.py +0 -0
  52. vllm/benchmarks/sweep/cli.py +41 -0
  53. vllm/benchmarks/sweep/param_sweep.py +91 -0
  54. vllm/benchmarks/sweep/plot.py +580 -0
  55. vllm/benchmarks/sweep/plot_pareto.py +393 -0
  56. vllm/benchmarks/sweep/serve.py +448 -0
  57. vllm/benchmarks/sweep/serve_sla.py +492 -0
  58. vllm/benchmarks/sweep/server.py +114 -0
  59. vllm/benchmarks/sweep/sla_sweep.py +132 -0
  60. vllm/benchmarks/sweep/utils.py +4 -0
  61. vllm/benchmarks/throughput.py +799 -0
  62. vllm/collect_env.py +857 -0
  63. vllm/compilation/__init__.py +0 -0
  64. vllm/compilation/activation_quant_fusion.py +209 -0
  65. vllm/compilation/backends.py +827 -0
  66. vllm/compilation/base_static_graph.py +57 -0
  67. vllm/compilation/caching.py +180 -0
  68. vllm/compilation/collective_fusion.py +1234 -0
  69. vllm/compilation/compiler_interface.py +639 -0
  70. vllm/compilation/counter.py +48 -0
  71. vllm/compilation/cuda_graph.py +208 -0
  72. vllm/compilation/decorators.py +614 -0
  73. vllm/compilation/fix_functionalization.py +253 -0
  74. vllm/compilation/fusion.py +374 -0
  75. vllm/compilation/fusion_attn.py +359 -0
  76. vllm/compilation/fx_utils.py +91 -0
  77. vllm/compilation/inductor_pass.py +133 -0
  78. vllm/compilation/matcher_utils.py +315 -0
  79. vllm/compilation/monitor.py +62 -0
  80. vllm/compilation/noop_elimination.py +134 -0
  81. vllm/compilation/partition_rules.py +72 -0
  82. vllm/compilation/pass_manager.py +136 -0
  83. vllm/compilation/piecewise_backend.py +121 -0
  84. vllm/compilation/post_cleanup.py +21 -0
  85. vllm/compilation/qk_norm_rope_fusion.py +238 -0
  86. vllm/compilation/sequence_parallelism.py +363 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +44 -0
  88. vllm/compilation/vllm_inductor_pass.py +173 -0
  89. vllm/compilation/wrapper.py +260 -0
  90. vllm/config/__init__.py +102 -0
  91. vllm/config/cache.py +220 -0
  92. vllm/config/compilation.py +1154 -0
  93. vllm/config/device.py +75 -0
  94. vllm/config/ec_transfer.py +110 -0
  95. vllm/config/kv_events.py +56 -0
  96. vllm/config/kv_transfer.py +114 -0
  97. vllm/config/load.py +124 -0
  98. vllm/config/lora.py +96 -0
  99. vllm/config/model.py +2274 -0
  100. vllm/config/multimodal.py +247 -0
  101. vllm/config/observability.py +131 -0
  102. vllm/config/parallel.py +653 -0
  103. vllm/config/pooler.py +124 -0
  104. vllm/config/scheduler.py +297 -0
  105. vllm/config/speculative.py +643 -0
  106. vllm/config/speech_to_text.py +38 -0
  107. vllm/config/structured_outputs.py +94 -0
  108. vllm/config/utils.py +324 -0
  109. vllm/config/vllm.py +1353 -0
  110. vllm/connections.py +189 -0
  111. vllm/device_allocator/__init__.py +0 -0
  112. vllm/device_allocator/cumem.py +327 -0
  113. vllm/distributed/__init__.py +6 -0
  114. vllm/distributed/communication_op.py +43 -0
  115. vllm/distributed/device_communicators/__init__.py +0 -0
  116. vllm/distributed/device_communicators/all2all.py +490 -0
  117. vllm/distributed/device_communicators/all_reduce_utils.py +344 -0
  118. vllm/distributed/device_communicators/base_device_communicator.py +297 -0
  119. vllm/distributed/device_communicators/cpu_communicator.py +209 -0
  120. vllm/distributed/device_communicators/cuda_communicator.py +340 -0
  121. vllm/distributed/device_communicators/cuda_wrapper.py +216 -0
  122. vllm/distributed/device_communicators/custom_all_reduce.py +326 -0
  123. vllm/distributed/device_communicators/mnnvl_compat.py +27 -0
  124. vllm/distributed/device_communicators/pynccl.py +386 -0
  125. vllm/distributed/device_communicators/pynccl_allocator.py +191 -0
  126. vllm/distributed/device_communicators/pynccl_wrapper.py +564 -0
  127. vllm/distributed/device_communicators/quick_all_reduce.py +290 -0
  128. vllm/distributed/device_communicators/ray_communicator.py +259 -0
  129. vllm/distributed/device_communicators/shm_broadcast.py +733 -0
  130. vllm/distributed/device_communicators/shm_object_storage.py +697 -0
  131. vllm/distributed/device_communicators/symm_mem.py +156 -0
  132. vllm/distributed/device_communicators/tpu_communicator.py +99 -0
  133. vllm/distributed/device_communicators/xpu_communicator.py +95 -0
  134. vllm/distributed/ec_transfer/__init__.py +14 -0
  135. vllm/distributed/ec_transfer/ec_connector/__init__.py +0 -0
  136. vllm/distributed/ec_transfer/ec_connector/base.py +247 -0
  137. vllm/distributed/ec_transfer/ec_connector/factory.py +85 -0
  138. vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py +201 -0
  139. vllm/distributed/ec_transfer/ec_transfer_state.py +42 -0
  140. vllm/distributed/eplb/__init__.py +8 -0
  141. vllm/distributed/eplb/async_worker.py +115 -0
  142. vllm/distributed/eplb/eplb_state.py +1154 -0
  143. vllm/distributed/eplb/rebalance_algo.py +260 -0
  144. vllm/distributed/eplb/rebalance_execute.py +532 -0
  145. vllm/distributed/kv_events.py +371 -0
  146. vllm/distributed/kv_transfer/README.md +29 -0
  147. vllm/distributed/kv_transfer/__init__.py +20 -0
  148. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  149. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  150. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  151. vllm/distributed/kv_transfer/kv_connector/factory.py +192 -0
  152. vllm/distributed/kv_transfer/kv_connector/utils.py +268 -0
  153. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +19 -0
  154. vllm/distributed/kv_transfer/kv_connector/v1/base.py +575 -0
  155. vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py +419 -0
  156. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +216 -0
  157. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py +18 -0
  158. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +378 -0
  159. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py +221 -0
  160. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +1411 -0
  161. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +895 -0
  162. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +189 -0
  163. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +454 -0
  164. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +2480 -0
  165. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +538 -0
  166. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  167. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +531 -0
  168. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +632 -0
  169. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +273 -0
  170. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +450 -0
  171. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  172. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +179 -0
  173. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +164 -0
  174. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +242 -0
  175. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  176. vllm/distributed/kv_transfer/kv_pipe/base.py +66 -0
  177. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +295 -0
  178. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +285 -0
  179. vllm/distributed/kv_transfer/kv_transfer_state.py +78 -0
  180. vllm/distributed/parallel_state.py +1790 -0
  181. vllm/distributed/tpu_distributed_utils.py +188 -0
  182. vllm/distributed/utils.py +545 -0
  183. vllm/engine/__init__.py +0 -0
  184. vllm/engine/arg_utils.py +2106 -0
  185. vllm/engine/async_llm_engine.py +6 -0
  186. vllm/engine/llm_engine.py +6 -0
  187. vllm/engine/protocol.py +188 -0
  188. vllm/entrypoints/__init__.py +0 -0
  189. vllm/entrypoints/anthropic/__init__.py +0 -0
  190. vllm/entrypoints/anthropic/protocol.py +162 -0
  191. vllm/entrypoints/anthropic/serving_messages.py +460 -0
  192. vllm/entrypoints/api_server.py +184 -0
  193. vllm/entrypoints/chat_utils.py +1837 -0
  194. vllm/entrypoints/cli/__init__.py +13 -0
  195. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  196. vllm/entrypoints/cli/benchmark/base.py +25 -0
  197. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  198. vllm/entrypoints/cli/benchmark/main.py +56 -0
  199. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  200. vllm/entrypoints/cli/benchmark/sweep.py +21 -0
  201. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  202. vllm/entrypoints/cli/collect_env.py +38 -0
  203. vllm/entrypoints/cli/main.py +79 -0
  204. vllm/entrypoints/cli/openai.py +256 -0
  205. vllm/entrypoints/cli/run_batch.py +68 -0
  206. vllm/entrypoints/cli/serve.py +249 -0
  207. vllm/entrypoints/cli/types.py +29 -0
  208. vllm/entrypoints/constants.py +10 -0
  209. vllm/entrypoints/context.py +572 -0
  210. vllm/entrypoints/dynamic_lora.py +57 -0
  211. vllm/entrypoints/harmony_utils.py +535 -0
  212. vllm/entrypoints/launcher.py +175 -0
  213. vllm/entrypoints/llm.py +1762 -0
  214. vllm/entrypoints/logger.py +84 -0
  215. vllm/entrypoints/openai/__init__.py +0 -0
  216. vllm/entrypoints/openai/api_server.py +1891 -0
  217. vllm/entrypoints/openai/cli_args.py +302 -0
  218. vllm/entrypoints/openai/orca_metrics.py +120 -0
  219. vllm/entrypoints/openai/protocol.py +2465 -0
  220. vllm/entrypoints/openai/run_batch.py +631 -0
  221. vllm/entrypoints/openai/serving_chat.py +1782 -0
  222. vllm/entrypoints/openai/serving_completion.py +716 -0
  223. vllm/entrypoints/openai/serving_engine.py +1478 -0
  224. vllm/entrypoints/openai/serving_models.py +304 -0
  225. vllm/entrypoints/openai/serving_responses.py +2032 -0
  226. vllm/entrypoints/openai/serving_tokenization.py +203 -0
  227. vllm/entrypoints/openai/serving_tokens.py +281 -0
  228. vllm/entrypoints/openai/serving_transcription.py +168 -0
  229. vllm/entrypoints/openai/speech_to_text.py +559 -0
  230. vllm/entrypoints/openai/tool_parsers/__init__.py +142 -0
  231. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +273 -0
  232. vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py +390 -0
  233. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +390 -0
  234. vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py +210 -0
  235. vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +200 -0
  236. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +273 -0
  237. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +253 -0
  238. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +494 -0
  239. vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py +420 -0
  240. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +227 -0
  241. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +322 -0
  242. vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +590 -0
  243. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +341 -0
  244. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +324 -0
  245. vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +37 -0
  246. vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py +643 -0
  247. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +849 -0
  248. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +390 -0
  249. vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py +366 -0
  250. vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +97 -0
  251. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +120 -0
  252. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +332 -0
  253. vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +781 -0
  254. vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py +1316 -0
  255. vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +744 -0
  256. vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py +303 -0
  257. vllm/entrypoints/openai/tool_parsers/utils.py +229 -0
  258. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +556 -0
  259. vllm/entrypoints/openai/utils.py +49 -0
  260. vllm/entrypoints/pooling/__init__.py +16 -0
  261. vllm/entrypoints/pooling/classify/__init__.py +0 -0
  262. vllm/entrypoints/pooling/classify/api_router.py +50 -0
  263. vllm/entrypoints/pooling/classify/protocol.py +181 -0
  264. vllm/entrypoints/pooling/classify/serving.py +237 -0
  265. vllm/entrypoints/pooling/embed/__init__.py +0 -0
  266. vllm/entrypoints/pooling/embed/api_router.py +67 -0
  267. vllm/entrypoints/pooling/embed/protocol.py +208 -0
  268. vllm/entrypoints/pooling/embed/serving.py +697 -0
  269. vllm/entrypoints/pooling/pooling/__init__.py +0 -0
  270. vllm/entrypoints/pooling/pooling/api_router.py +63 -0
  271. vllm/entrypoints/pooling/pooling/protocol.py +148 -0
  272. vllm/entrypoints/pooling/pooling/serving.py +348 -0
  273. vllm/entrypoints/pooling/score/__init__.py +0 -0
  274. vllm/entrypoints/pooling/score/api_router.py +149 -0
  275. vllm/entrypoints/pooling/score/protocol.py +145 -0
  276. vllm/entrypoints/pooling/score/serving.py +505 -0
  277. vllm/entrypoints/renderer.py +409 -0
  278. vllm/entrypoints/responses_utils.py +148 -0
  279. vllm/entrypoints/sagemaker/__init__.py +4 -0
  280. vllm/entrypoints/sagemaker/routes.py +118 -0
  281. vllm/entrypoints/score_utils.py +240 -0
  282. vllm/entrypoints/ssl.py +78 -0
  283. vllm/entrypoints/tool.py +143 -0
  284. vllm/entrypoints/tool_server.py +234 -0
  285. vllm/entrypoints/utils.py +319 -0
  286. vllm/env_override.py +378 -0
  287. vllm/envs.py +1710 -0
  288. vllm/forward_context.py +358 -0
  289. vllm/inputs/__init__.py +44 -0
  290. vllm/inputs/data.py +359 -0
  291. vllm/inputs/parse.py +137 -0
  292. vllm/inputs/preprocess.py +716 -0
  293. vllm/logger.py +298 -0
  294. vllm/logging_utils/__init__.py +13 -0
  295. vllm/logging_utils/dump_input.py +83 -0
  296. vllm/logging_utils/formatter.py +127 -0
  297. vllm/logging_utils/lazy.py +20 -0
  298. vllm/logging_utils/log_time.py +34 -0
  299. vllm/logits_process.py +121 -0
  300. vllm/logprobs.py +206 -0
  301. vllm/lora/__init__.py +0 -0
  302. vllm/lora/layers/__init__.py +42 -0
  303. vllm/lora/layers/base.py +66 -0
  304. vllm/lora/layers/base_linear.py +165 -0
  305. vllm/lora/layers/column_parallel_linear.py +577 -0
  306. vllm/lora/layers/fused_moe.py +747 -0
  307. vllm/lora/layers/logits_processor.py +203 -0
  308. vllm/lora/layers/replicated_linear.py +70 -0
  309. vllm/lora/layers/row_parallel_linear.py +176 -0
  310. vllm/lora/layers/utils.py +74 -0
  311. vllm/lora/layers/vocal_parallel_embedding.py +140 -0
  312. vllm/lora/lora_weights.py +227 -0
  313. vllm/lora/models.py +903 -0
  314. vllm/lora/ops/__init__.py +0 -0
  315. vllm/lora/ops/ipex_ops/__init__.py +6 -0
  316. vllm/lora/ops/ipex_ops/lora_ops.py +57 -0
  317. vllm/lora/ops/torch_ops/__init__.py +20 -0
  318. vllm/lora/ops/torch_ops/lora_ops.py +128 -0
  319. vllm/lora/ops/triton_ops/README_TUNING.md +60 -0
  320. vllm/lora/ops/triton_ops/__init__.py +21 -0
  321. vllm/lora/ops/triton_ops/fused_moe_lora_op.py +661 -0
  322. vllm/lora/ops/triton_ops/kernel_utils.py +340 -0
  323. vllm/lora/ops/triton_ops/lora_expand_op.py +310 -0
  324. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +154 -0
  325. vllm/lora/ops/triton_ops/lora_shrink_op.py +287 -0
  326. vllm/lora/ops/triton_ops/utils.py +295 -0
  327. vllm/lora/ops/xla_ops/__init__.py +6 -0
  328. vllm/lora/ops/xla_ops/lora_ops.py +141 -0
  329. vllm/lora/peft_helper.py +128 -0
  330. vllm/lora/punica_wrapper/__init__.py +10 -0
  331. vllm/lora/punica_wrapper/punica_base.py +493 -0
  332. vllm/lora/punica_wrapper/punica_cpu.py +351 -0
  333. vllm/lora/punica_wrapper/punica_gpu.py +412 -0
  334. vllm/lora/punica_wrapper/punica_selector.py +21 -0
  335. vllm/lora/punica_wrapper/punica_tpu.py +358 -0
  336. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  337. vllm/lora/punica_wrapper/utils.py +150 -0
  338. vllm/lora/request.py +100 -0
  339. vllm/lora/resolver.py +88 -0
  340. vllm/lora/utils.py +306 -0
  341. vllm/lora/worker_manager.py +268 -0
  342. vllm/model_executor/__init__.py +11 -0
  343. vllm/model_executor/custom_op.py +194 -0
  344. vllm/model_executor/layers/__init__.py +0 -0
  345. vllm/model_executor/layers/activation.py +595 -0
  346. vllm/model_executor/layers/attention_layer_base.py +32 -0
  347. vllm/model_executor/layers/batch_invariant.py +1058 -0
  348. vllm/model_executor/layers/conv.py +256 -0
  349. vllm/model_executor/layers/fla/__init__.py +8 -0
  350. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  351. vllm/model_executor/layers/fla/ops/chunk.py +240 -0
  352. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +344 -0
  353. vllm/model_executor/layers/fla/ops/chunk_o.py +183 -0
  354. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +154 -0
  355. vllm/model_executor/layers/fla/ops/cumsum.py +280 -0
  356. vllm/model_executor/layers/fla/ops/fused_recurrent.py +390 -0
  357. vllm/model_executor/layers/fla/ops/index.py +41 -0
  358. vllm/model_executor/layers/fla/ops/kda.py +1351 -0
  359. vllm/model_executor/layers/fla/ops/l2norm.py +146 -0
  360. vllm/model_executor/layers/fla/ops/layernorm_guard.py +396 -0
  361. vllm/model_executor/layers/fla/ops/op.py +60 -0
  362. vllm/model_executor/layers/fla/ops/solve_tril.py +556 -0
  363. vllm/model_executor/layers/fla/ops/utils.py +194 -0
  364. vllm/model_executor/layers/fla/ops/wy_fast.py +158 -0
  365. vllm/model_executor/layers/fused_moe/__init__.py +110 -0
  366. vllm/model_executor/layers/fused_moe/all2all_utils.py +171 -0
  367. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +406 -0
  368. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +180 -0
  369. vllm/model_executor/layers/fused_moe/config.py +938 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json +147 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json +213 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json +147 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json +201 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +147 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json +164 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json +147 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json +147 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json +147 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  559. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  560. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  561. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  562. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  563. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  564. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  565. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  566. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  567. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  568. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  569. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  570. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  571. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  572. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  573. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  574. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  575. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  576. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  577. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  578. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  579. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  580. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  581. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  582. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  583. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  584. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  585. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  586. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  587. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  588. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  589. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  590. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  591. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  592. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  593. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  594. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  595. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  596. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  597. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  598. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  599. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  600. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  601. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  602. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  603. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  604. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  605. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  606. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  607. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  608. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  609. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  610. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  611. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  612. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  613. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  614. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  615. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  616. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  617. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  618. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  619. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  620. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  621. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  622. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  623. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  624. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  625. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  626. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  627. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  628. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  629. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  630. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  631. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  632. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  633. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  634. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  635. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  636. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  637. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  638. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  639. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  640. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  641. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  642. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  643. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  644. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  645. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +292 -0
  646. vllm/model_executor/layers/fused_moe/cutlass_moe.py +1052 -0
  647. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +387 -0
  648. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +416 -0
  649. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +420 -0
  650. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +434 -0
  651. vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py +376 -0
  652. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +307 -0
  653. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +362 -0
  654. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +192 -0
  655. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1012 -0
  656. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +821 -0
  657. vllm/model_executor/layers/fused_moe/fused_moe.py +2172 -0
  658. vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +121 -0
  659. vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +136 -0
  660. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +524 -0
  661. vllm/model_executor/layers/fused_moe/layer.py +2152 -0
  662. vllm/model_executor/layers/fused_moe/modular_kernel.py +1332 -0
  663. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +174 -0
  664. vllm/model_executor/layers/fused_moe/moe_pallas.py +83 -0
  665. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +229 -0
  666. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  667. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +362 -0
  668. vllm/model_executor/layers/fused_moe/prepare_finalize.py +78 -0
  669. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +265 -0
  670. vllm/model_executor/layers/fused_moe/routing_simulator.py +310 -0
  671. vllm/model_executor/layers/fused_moe/shared_fused_moe.py +96 -0
  672. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +171 -0
  673. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +163 -0
  674. vllm/model_executor/layers/fused_moe/trtllm_moe.py +143 -0
  675. vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +559 -0
  676. vllm/model_executor/layers/fused_moe/utils.py +332 -0
  677. vllm/model_executor/layers/kda.py +442 -0
  678. vllm/model_executor/layers/layernorm.py +442 -0
  679. vllm/model_executor/layers/lightning_attn.py +735 -0
  680. vllm/model_executor/layers/linear.py +1424 -0
  681. vllm/model_executor/layers/logits_processor.py +106 -0
  682. vllm/model_executor/layers/mamba/__init__.py +0 -0
  683. vllm/model_executor/layers/mamba/abstract.py +68 -0
  684. vllm/model_executor/layers/mamba/linear_attn.py +388 -0
  685. vllm/model_executor/layers/mamba/mamba_mixer.py +527 -0
  686. vllm/model_executor/layers/mamba/mamba_mixer2.py +930 -0
  687. vllm/model_executor/layers/mamba/mamba_utils.py +225 -0
  688. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  689. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1240 -0
  690. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +172 -0
  691. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +478 -0
  692. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +211 -0
  693. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +456 -0
  694. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +700 -0
  695. vllm/model_executor/layers/mamba/ops/ssd_combined.py +230 -0
  696. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +157 -0
  697. vllm/model_executor/layers/mamba/short_conv.py +255 -0
  698. vllm/model_executor/layers/mla.py +176 -0
  699. vllm/model_executor/layers/pooler.py +817 -0
  700. vllm/model_executor/layers/quantization/__init__.py +179 -0
  701. vllm/model_executor/layers/quantization/auto_round.py +454 -0
  702. vllm/model_executor/layers/quantization/awq.py +277 -0
  703. vllm/model_executor/layers/quantization/awq_marlin.py +718 -0
  704. vllm/model_executor/layers/quantization/awq_triton.py +337 -0
  705. vllm/model_executor/layers/quantization/base_config.py +170 -0
  706. vllm/model_executor/layers/quantization/bitblas.py +502 -0
  707. vllm/model_executor/layers/quantization/bitsandbytes.py +644 -0
  708. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +3 -0
  709. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +963 -0
  710. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2387 -0
  711. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +35 -0
  712. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +392 -0
  713. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  714. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +176 -0
  715. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +124 -0
  716. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +218 -0
  717. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +183 -0
  718. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +153 -0
  719. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +138 -0
  720. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +200 -0
  721. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +125 -0
  722. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +230 -0
  723. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  724. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +260 -0
  725. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +173 -0
  726. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  727. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +64 -0
  728. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  729. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +224 -0
  730. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  731. vllm/model_executor/layers/quantization/cpu_wna16.py +625 -0
  732. vllm/model_executor/layers/quantization/deepspeedfp.py +218 -0
  733. vllm/model_executor/layers/quantization/experts_int8.py +225 -0
  734. vllm/model_executor/layers/quantization/fbgemm_fp8.py +195 -0
  735. vllm/model_executor/layers/quantization/fp8.py +1348 -0
  736. vllm/model_executor/layers/quantization/fp_quant.py +420 -0
  737. vllm/model_executor/layers/quantization/gguf.py +687 -0
  738. vllm/model_executor/layers/quantization/gptq.py +393 -0
  739. vllm/model_executor/layers/quantization/gptq_bitblas.py +482 -0
  740. vllm/model_executor/layers/quantization/gptq_marlin.py +842 -0
  741. vllm/model_executor/layers/quantization/gptq_marlin_24.py +320 -0
  742. vllm/model_executor/layers/quantization/hqq_marlin.py +372 -0
  743. vllm/model_executor/layers/quantization/inc.py +65 -0
  744. vllm/model_executor/layers/quantization/input_quant_fp8.py +171 -0
  745. vllm/model_executor/layers/quantization/ipex_quant.py +470 -0
  746. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  747. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +94 -0
  748. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +105 -0
  749. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  750. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +323 -0
  751. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +98 -0
  752. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +119 -0
  753. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +111 -0
  754. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +161 -0
  755. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +159 -0
  756. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +200 -0
  757. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +73 -0
  758. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +97 -0
  759. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +120 -0
  760. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +219 -0
  761. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +140 -0
  762. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +42 -0
  763. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +105 -0
  764. vllm/model_executor/layers/quantization/kv_cache.py +146 -0
  765. vllm/model_executor/layers/quantization/modelopt.py +1637 -0
  766. vllm/model_executor/layers/quantization/moe_wna16.py +528 -0
  767. vllm/model_executor/layers/quantization/mxfp4.py +1175 -0
  768. vllm/model_executor/layers/quantization/petit.py +319 -0
  769. vllm/model_executor/layers/quantization/ptpc_fp8.py +136 -0
  770. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  771. vllm/model_executor/layers/quantization/quark/quark.py +527 -0
  772. vllm/model_executor/layers/quantization/quark/quark_moe.py +653 -0
  773. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  774. vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py +343 -0
  775. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  776. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +179 -0
  777. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +139 -0
  778. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  779. vllm/model_executor/layers/quantization/qutlass_utils.py +185 -0
  780. vllm/model_executor/layers/quantization/rtn.py +639 -0
  781. vllm/model_executor/layers/quantization/schema.py +90 -0
  782. vllm/model_executor/layers/quantization/torchao.py +380 -0
  783. vllm/model_executor/layers/quantization/tpu_int8.py +139 -0
  784. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  785. vllm/model_executor/layers/quantization/utils/allspark_utils.py +67 -0
  786. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +229 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  902. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  903. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  904. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  905. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  906. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  907. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  908. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  909. vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  910. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  911. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  912. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  913. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  914. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  915. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  916. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  917. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  918. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  919. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  920. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  921. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  922. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  923. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  924. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  925. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  926. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  927. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  928. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  929. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  930. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  931. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  932. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  933. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  934. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  935. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  936. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  937. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  938. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  939. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  940. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  941. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  942. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  943. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  944. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  945. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  946. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  947. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  948. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  949. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  950. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  951. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  952. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  953. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  954. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  955. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  956. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  957. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  958. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  959. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  960. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  961. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  962. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  963. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  964. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  965. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  966. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  967. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  968. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  969. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  970. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  971. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  972. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  973. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  974. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  975. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  976. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  977. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  978. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  979. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  980. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  981. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  982. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  983. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  984. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  985. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  986. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  987. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  988. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  989. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  990. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  991. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  992. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  993. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  994. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  995. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  996. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  997. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  998. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  999. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1000. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1001. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  1002. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +333 -0
  1003. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +311 -0
  1004. vllm/model_executor/layers/quantization/utils/fp8_utils.py +1203 -0
  1005. vllm/model_executor/layers/quantization/utils/gptq_utils.py +158 -0
  1006. vllm/model_executor/layers/quantization/utils/int8_utils.py +489 -0
  1007. vllm/model_executor/layers/quantization/utils/layer_utils.py +41 -0
  1008. vllm/model_executor/layers/quantization/utils/machete_utils.py +56 -0
  1009. vllm/model_executor/layers/quantization/utils/marlin_utils.py +674 -0
  1010. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +452 -0
  1011. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +378 -0
  1012. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +219 -0
  1013. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +467 -0
  1014. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +183 -0
  1015. vllm/model_executor/layers/quantization/utils/mxfp6_utils.py +142 -0
  1016. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +24 -0
  1017. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +142 -0
  1018. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +67 -0
  1019. vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py +51 -0
  1020. vllm/model_executor/layers/quantization/utils/petit_utils.py +124 -0
  1021. vllm/model_executor/layers/quantization/utils/quant_utils.py +687 -0
  1022. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +516 -0
  1023. vllm/model_executor/layers/resampler.py +283 -0
  1024. vllm/model_executor/layers/rotary_embedding/__init__.py +292 -0
  1025. vllm/model_executor/layers/rotary_embedding/base.py +240 -0
  1026. vllm/model_executor/layers/rotary_embedding/common.py +188 -0
  1027. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +165 -0
  1028. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +215 -0
  1029. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +43 -0
  1030. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +68 -0
  1031. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +75 -0
  1032. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  1033. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  1034. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +80 -0
  1035. vllm/model_executor/layers/rotary_embedding/mrope.py +397 -0
  1036. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +47 -0
  1037. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +159 -0
  1038. vllm/model_executor/layers/rotary_embedding/xdrope.py +102 -0
  1039. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +84 -0
  1040. vllm/model_executor/layers/utils.py +251 -0
  1041. vllm/model_executor/layers/vocab_parallel_embedding.py +558 -0
  1042. vllm/model_executor/model_loader/__init__.py +150 -0
  1043. vllm/model_executor/model_loader/base_loader.py +57 -0
  1044. vllm/model_executor/model_loader/bitsandbytes_loader.py +822 -0
  1045. vllm/model_executor/model_loader/default_loader.py +321 -0
  1046. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  1047. vllm/model_executor/model_loader/gguf_loader.py +349 -0
  1048. vllm/model_executor/model_loader/online_quantization.py +275 -0
  1049. vllm/model_executor/model_loader/runai_streamer_loader.py +116 -0
  1050. vllm/model_executor/model_loader/sharded_state_loader.py +214 -0
  1051. vllm/model_executor/model_loader/tensorizer.py +790 -0
  1052. vllm/model_executor/model_loader/tensorizer_loader.py +151 -0
  1053. vllm/model_executor/model_loader/tpu.py +118 -0
  1054. vllm/model_executor/model_loader/utils.py +296 -0
  1055. vllm/model_executor/model_loader/weight_utils.py +1147 -0
  1056. vllm/model_executor/models/__init__.py +44 -0
  1057. vllm/model_executor/models/adapters.py +543 -0
  1058. vllm/model_executor/models/afmoe.py +697 -0
  1059. vllm/model_executor/models/aimv2.py +248 -0
  1060. vllm/model_executor/models/apertus.py +569 -0
  1061. vllm/model_executor/models/arcee.py +428 -0
  1062. vllm/model_executor/models/arctic.py +634 -0
  1063. vllm/model_executor/models/aria.py +655 -0
  1064. vllm/model_executor/models/aya_vision.py +450 -0
  1065. vllm/model_executor/models/baichuan.py +494 -0
  1066. vllm/model_executor/models/bailing_moe.py +645 -0
  1067. vllm/model_executor/models/bamba.py +516 -0
  1068. vllm/model_executor/models/bee.py +157 -0
  1069. vllm/model_executor/models/bert.py +925 -0
  1070. vllm/model_executor/models/bert_with_rope.py +732 -0
  1071. vllm/model_executor/models/blip.py +350 -0
  1072. vllm/model_executor/models/blip2.py +695 -0
  1073. vllm/model_executor/models/bloom.py +390 -0
  1074. vllm/model_executor/models/chameleon.py +1098 -0
  1075. vllm/model_executor/models/chatglm.py +499 -0
  1076. vllm/model_executor/models/clip.py +1005 -0
  1077. vllm/model_executor/models/cohere2_vision.py +472 -0
  1078. vllm/model_executor/models/commandr.py +470 -0
  1079. vllm/model_executor/models/config.py +510 -0
  1080. vllm/model_executor/models/dbrx.py +485 -0
  1081. vllm/model_executor/models/deepencoder.py +676 -0
  1082. vllm/model_executor/models/deepseek_eagle.py +252 -0
  1083. vllm/model_executor/models/deepseek_mtp.py +446 -0
  1084. vllm/model_executor/models/deepseek_ocr.py +593 -0
  1085. vllm/model_executor/models/deepseek_v2.py +1715 -0
  1086. vllm/model_executor/models/deepseek_vl2.py +644 -0
  1087. vllm/model_executor/models/dots1.py +566 -0
  1088. vllm/model_executor/models/dots_ocr.py +874 -0
  1089. vllm/model_executor/models/ernie45.py +53 -0
  1090. vllm/model_executor/models/ernie45_moe.py +755 -0
  1091. vllm/model_executor/models/ernie45_vl.py +1710 -0
  1092. vllm/model_executor/models/ernie45_vl_moe.py +800 -0
  1093. vllm/model_executor/models/ernie_mtp.py +279 -0
  1094. vllm/model_executor/models/exaone.py +525 -0
  1095. vllm/model_executor/models/exaone4.py +517 -0
  1096. vllm/model_executor/models/fairseq2_llama.py +154 -0
  1097. vllm/model_executor/models/falcon.py +544 -0
  1098. vllm/model_executor/models/falcon_h1.py +680 -0
  1099. vllm/model_executor/models/flex_olmo.py +155 -0
  1100. vllm/model_executor/models/fuyu.py +373 -0
  1101. vllm/model_executor/models/gemma.py +426 -0
  1102. vllm/model_executor/models/gemma2.py +436 -0
  1103. vllm/model_executor/models/gemma3.py +577 -0
  1104. vllm/model_executor/models/gemma3_mm.py +665 -0
  1105. vllm/model_executor/models/gemma3n.py +1167 -0
  1106. vllm/model_executor/models/gemma3n_mm.py +811 -0
  1107. vllm/model_executor/models/glm.py +23 -0
  1108. vllm/model_executor/models/glm4.py +298 -0
  1109. vllm/model_executor/models/glm4_1v.py +1854 -0
  1110. vllm/model_executor/models/glm4_moe.py +738 -0
  1111. vllm/model_executor/models/glm4_moe_mtp.py +359 -0
  1112. vllm/model_executor/models/glm4v.py +785 -0
  1113. vllm/model_executor/models/gpt2.py +397 -0
  1114. vllm/model_executor/models/gpt_bigcode.py +339 -0
  1115. vllm/model_executor/models/gpt_j.py +345 -0
  1116. vllm/model_executor/models/gpt_neox.py +343 -0
  1117. vllm/model_executor/models/gpt_oss.py +745 -0
  1118. vllm/model_executor/models/granite.py +476 -0
  1119. vllm/model_executor/models/granite_speech.py +913 -0
  1120. vllm/model_executor/models/granitemoe.py +561 -0
  1121. vllm/model_executor/models/granitemoehybrid.py +704 -0
  1122. vllm/model_executor/models/granitemoeshared.py +328 -0
  1123. vllm/model_executor/models/gritlm.py +245 -0
  1124. vllm/model_executor/models/grok1.py +555 -0
  1125. vllm/model_executor/models/h2ovl.py +554 -0
  1126. vllm/model_executor/models/hunyuan_v1.py +1042 -0
  1127. vllm/model_executor/models/hunyuan_vision.py +1028 -0
  1128. vllm/model_executor/models/hyperclovax_vision.py +1166 -0
  1129. vllm/model_executor/models/idefics2_vision_model.py +427 -0
  1130. vllm/model_executor/models/idefics3.py +718 -0
  1131. vllm/model_executor/models/interfaces.py +1148 -0
  1132. vllm/model_executor/models/interfaces_base.py +243 -0
  1133. vllm/model_executor/models/intern_vit.py +454 -0
  1134. vllm/model_executor/models/internlm2.py +454 -0
  1135. vllm/model_executor/models/internlm2_ve.py +139 -0
  1136. vllm/model_executor/models/interns1.py +830 -0
  1137. vllm/model_executor/models/interns1_vit.py +433 -0
  1138. vllm/model_executor/models/internvl.py +1452 -0
  1139. vllm/model_executor/models/jais.py +397 -0
  1140. vllm/model_executor/models/jamba.py +609 -0
  1141. vllm/model_executor/models/jina_vl.py +147 -0
  1142. vllm/model_executor/models/keye.py +1765 -0
  1143. vllm/model_executor/models/keye_vl1_5.py +726 -0
  1144. vllm/model_executor/models/kimi_linear.py +658 -0
  1145. vllm/model_executor/models/kimi_vl.py +578 -0
  1146. vllm/model_executor/models/lfm2.py +516 -0
  1147. vllm/model_executor/models/lfm2_moe.py +746 -0
  1148. vllm/model_executor/models/lightonocr.py +195 -0
  1149. vllm/model_executor/models/llama.py +704 -0
  1150. vllm/model_executor/models/llama4.py +857 -0
  1151. vllm/model_executor/models/llama4_eagle.py +216 -0
  1152. vllm/model_executor/models/llama_eagle.py +213 -0
  1153. vllm/model_executor/models/llama_eagle3.py +375 -0
  1154. vllm/model_executor/models/llava.py +842 -0
  1155. vllm/model_executor/models/llava_next.py +583 -0
  1156. vllm/model_executor/models/llava_next_video.py +467 -0
  1157. vllm/model_executor/models/llava_onevision.py +923 -0
  1158. vllm/model_executor/models/longcat_flash.py +743 -0
  1159. vllm/model_executor/models/longcat_flash_mtp.py +349 -0
  1160. vllm/model_executor/models/mamba.py +276 -0
  1161. vllm/model_executor/models/mamba2.py +288 -0
  1162. vllm/model_executor/models/medusa.py +179 -0
  1163. vllm/model_executor/models/midashenglm.py +828 -0
  1164. vllm/model_executor/models/mimo.py +188 -0
  1165. vllm/model_executor/models/mimo_mtp.py +294 -0
  1166. vllm/model_executor/models/minicpm.py +657 -0
  1167. vllm/model_executor/models/minicpm3.py +234 -0
  1168. vllm/model_executor/models/minicpm_eagle.py +385 -0
  1169. vllm/model_executor/models/minicpmo.py +768 -0
  1170. vllm/model_executor/models/minicpmv.py +1744 -0
  1171. vllm/model_executor/models/minimax_m2.py +546 -0
  1172. vllm/model_executor/models/minimax_text_01.py +1010 -0
  1173. vllm/model_executor/models/minimax_vl_01.py +396 -0
  1174. vllm/model_executor/models/mistral3.py +637 -0
  1175. vllm/model_executor/models/mistral_large_3.py +63 -0
  1176. vllm/model_executor/models/mistral_large_3_eagle.py +165 -0
  1177. vllm/model_executor/models/mixtral.py +599 -0
  1178. vllm/model_executor/models/mllama4.py +1151 -0
  1179. vllm/model_executor/models/mlp_speculator.py +235 -0
  1180. vllm/model_executor/models/modernbert.py +452 -0
  1181. vllm/model_executor/models/module_mapping.py +74 -0
  1182. vllm/model_executor/models/molmo.py +1553 -0
  1183. vllm/model_executor/models/moonvit.py +686 -0
  1184. vllm/model_executor/models/mpt.py +335 -0
  1185. vllm/model_executor/models/nano_nemotron_vl.py +1732 -0
  1186. vllm/model_executor/models/nemotron.py +502 -0
  1187. vllm/model_executor/models/nemotron_h.py +850 -0
  1188. vllm/model_executor/models/nemotron_nas.py +473 -0
  1189. vllm/model_executor/models/nemotron_vl.py +653 -0
  1190. vllm/model_executor/models/nvlm_d.py +216 -0
  1191. vllm/model_executor/models/olmo.py +413 -0
  1192. vllm/model_executor/models/olmo2.py +455 -0
  1193. vllm/model_executor/models/olmoe.py +494 -0
  1194. vllm/model_executor/models/opencua.py +271 -0
  1195. vllm/model_executor/models/openpangu.py +1051 -0
  1196. vllm/model_executor/models/openpangu_mtp.py +265 -0
  1197. vllm/model_executor/models/opt.py +426 -0
  1198. vllm/model_executor/models/orion.py +366 -0
  1199. vllm/model_executor/models/ouro.py +508 -0
  1200. vllm/model_executor/models/ovis.py +559 -0
  1201. vllm/model_executor/models/ovis2_5.py +673 -0
  1202. vllm/model_executor/models/paddleocr_vl.py +1380 -0
  1203. vllm/model_executor/models/paligemma.py +412 -0
  1204. vllm/model_executor/models/persimmon.py +376 -0
  1205. vllm/model_executor/models/phi.py +370 -0
  1206. vllm/model_executor/models/phi3.py +18 -0
  1207. vllm/model_executor/models/phi3v.py +737 -0
  1208. vllm/model_executor/models/phi4_multimodal.py +1447 -0
  1209. vllm/model_executor/models/phi4mm.py +1253 -0
  1210. vllm/model_executor/models/phi4mm_audio.py +1296 -0
  1211. vllm/model_executor/models/phi4mm_utils.py +1907 -0
  1212. vllm/model_executor/models/phimoe.py +670 -0
  1213. vllm/model_executor/models/pixtral.py +1380 -0
  1214. vllm/model_executor/models/plamo2.py +966 -0
  1215. vllm/model_executor/models/plamo3.py +441 -0
  1216. vllm/model_executor/models/qwen.py +363 -0
  1217. vllm/model_executor/models/qwen2.py +569 -0
  1218. vllm/model_executor/models/qwen2_5_omni_thinker.py +1220 -0
  1219. vllm/model_executor/models/qwen2_5_vl.py +1594 -0
  1220. vllm/model_executor/models/qwen2_audio.py +473 -0
  1221. vllm/model_executor/models/qwen2_moe.py +590 -0
  1222. vllm/model_executor/models/qwen2_rm.py +123 -0
  1223. vllm/model_executor/models/qwen2_vl.py +1593 -0
  1224. vllm/model_executor/models/qwen3.py +332 -0
  1225. vllm/model_executor/models/qwen3_moe.py +738 -0
  1226. vllm/model_executor/models/qwen3_next.py +1390 -0
  1227. vllm/model_executor/models/qwen3_next_mtp.py +296 -0
  1228. vllm/model_executor/models/qwen3_omni_moe_thinker.py +1765 -0
  1229. vllm/model_executor/models/qwen3_vl.py +1686 -0
  1230. vllm/model_executor/models/qwen3_vl_moe.py +470 -0
  1231. vllm/model_executor/models/qwen_vl.py +803 -0
  1232. vllm/model_executor/models/radio.py +555 -0
  1233. vllm/model_executor/models/registry.py +1183 -0
  1234. vllm/model_executor/models/roberta.py +259 -0
  1235. vllm/model_executor/models/rvl.py +107 -0
  1236. vllm/model_executor/models/seed_oss.py +493 -0
  1237. vllm/model_executor/models/siglip.py +1245 -0
  1238. vllm/model_executor/models/siglip2navit.py +723 -0
  1239. vllm/model_executor/models/skyworkr1v.py +953 -0
  1240. vllm/model_executor/models/smolvlm.py +38 -0
  1241. vllm/model_executor/models/solar.py +485 -0
  1242. vllm/model_executor/models/stablelm.py +359 -0
  1243. vllm/model_executor/models/starcoder2.py +366 -0
  1244. vllm/model_executor/models/step3_text.py +555 -0
  1245. vllm/model_executor/models/step3_vl.py +1149 -0
  1246. vllm/model_executor/models/swin.py +514 -0
  1247. vllm/model_executor/models/tarsier.py +619 -0
  1248. vllm/model_executor/models/telechat2.py +153 -0
  1249. vllm/model_executor/models/teleflm.py +78 -0
  1250. vllm/model_executor/models/terratorch.py +319 -0
  1251. vllm/model_executor/models/transformers/__init__.py +127 -0
  1252. vllm/model_executor/models/transformers/base.py +464 -0
  1253. vllm/model_executor/models/transformers/causal.py +65 -0
  1254. vllm/model_executor/models/transformers/legacy.py +90 -0
  1255. vllm/model_executor/models/transformers/moe.py +325 -0
  1256. vllm/model_executor/models/transformers/multimodal.py +411 -0
  1257. vllm/model_executor/models/transformers/pooling.py +119 -0
  1258. vllm/model_executor/models/transformers/utils.py +213 -0
  1259. vllm/model_executor/models/ultravox.py +686 -0
  1260. vllm/model_executor/models/utils.py +832 -0
  1261. vllm/model_executor/models/vision.py +552 -0
  1262. vllm/model_executor/models/voxtral.py +842 -0
  1263. vllm/model_executor/models/whisper.py +963 -0
  1264. vllm/model_executor/models/zamba2.py +980 -0
  1265. vllm/model_executor/parameter.py +642 -0
  1266. vllm/model_executor/utils.py +94 -0
  1267. vllm/model_executor/warmup/__init__.py +0 -0
  1268. vllm/model_executor/warmup/deep_gemm_warmup.py +314 -0
  1269. vllm/model_executor/warmup/kernel_warmup.py +98 -0
  1270. vllm/multimodal/__init__.py +40 -0
  1271. vllm/multimodal/audio.py +142 -0
  1272. vllm/multimodal/base.py +26 -0
  1273. vllm/multimodal/cache.py +830 -0
  1274. vllm/multimodal/evs.py +294 -0
  1275. vllm/multimodal/hasher.py +106 -0
  1276. vllm/multimodal/image.py +130 -0
  1277. vllm/multimodal/inputs.py +1036 -0
  1278. vllm/multimodal/parse.py +544 -0
  1279. vllm/multimodal/processing.py +2240 -0
  1280. vllm/multimodal/profiling.py +369 -0
  1281. vllm/multimodal/registry.py +357 -0
  1282. vllm/multimodal/utils.py +523 -0
  1283. vllm/multimodal/video.py +333 -0
  1284. vllm/outputs.py +345 -0
  1285. vllm/platforms/__init__.py +277 -0
  1286. vllm/platforms/cpu.py +410 -0
  1287. vllm/platforms/cuda.py +642 -0
  1288. vllm/platforms/interface.py +656 -0
  1289. vllm/platforms/rocm.py +513 -0
  1290. vllm/platforms/tpu.py +275 -0
  1291. vllm/platforms/xpu.py +261 -0
  1292. vllm/plugins/__init__.py +81 -0
  1293. vllm/plugins/io_processors/__init__.py +68 -0
  1294. vllm/plugins/io_processors/interface.py +77 -0
  1295. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1296. vllm/plugins/lora_resolvers/filesystem_resolver.py +52 -0
  1297. vllm/pooling_params.py +230 -0
  1298. vllm/profiler/__init__.py +0 -0
  1299. vllm/profiler/gpu_profiler.py +216 -0
  1300. vllm/profiler/layerwise_profile.py +392 -0
  1301. vllm/profiler/utils.py +151 -0
  1302. vllm/py.typed +2 -0
  1303. vllm/ray/__init__.py +0 -0
  1304. vllm/ray/lazy_utils.py +30 -0
  1305. vllm/ray/ray_env.py +79 -0
  1306. vllm/reasoning/__init__.py +92 -0
  1307. vllm/reasoning/abs_reasoning_parsers.py +290 -0
  1308. vllm/reasoning/basic_parsers.py +162 -0
  1309. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1310. vllm/reasoning/deepseek_v3_reasoning_parser.py +62 -0
  1311. vllm/reasoning/ernie45_reasoning_parser.py +165 -0
  1312. vllm/reasoning/glm4_moe_reasoning_parser.py +171 -0
  1313. vllm/reasoning/gptoss_reasoning_parser.py +173 -0
  1314. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1315. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +237 -0
  1316. vllm/reasoning/identity_reasoning_parser.py +58 -0
  1317. vllm/reasoning/minimax_m2_reasoning_parser.py +67 -0
  1318. vllm/reasoning/mistral_reasoning_parser.py +55 -0
  1319. vllm/reasoning/olmo3_reasoning_parser.py +302 -0
  1320. vllm/reasoning/qwen3_reasoning_parser.py +67 -0
  1321. vllm/reasoning/seedoss_reasoning_parser.py +27 -0
  1322. vllm/reasoning/step3_reasoning_parser.py +107 -0
  1323. vllm/sampling_params.py +597 -0
  1324. vllm/scalar_type.py +355 -0
  1325. vllm/scripts.py +17 -0
  1326. vllm/sequence.py +98 -0
  1327. vllm/tasks.py +13 -0
  1328. vllm/third_party/__init__.py +0 -0
  1329. vllm/third_party/pynvml.py +6140 -0
  1330. vllm/tokenizers/__init__.py +24 -0
  1331. vllm/tokenizers/detokenizer_utils.py +198 -0
  1332. vllm/tokenizers/hf.py +124 -0
  1333. vllm/tokenizers/mistral.py +554 -0
  1334. vllm/tokenizers/protocol.py +111 -0
  1335. vllm/tokenizers/registry.py +233 -0
  1336. vllm/tracing.py +135 -0
  1337. vllm/transformers_utils/__init__.py +26 -0
  1338. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1339. vllm/transformers_utils/chat_templates/registry.py +73 -0
  1340. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1341. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1342. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1343. vllm/transformers_utils/chat_templates/template_deepseek_ocr.jinja +14 -0
  1344. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1345. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1346. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1347. vllm/transformers_utils/config.py +1081 -0
  1348. vllm/transformers_utils/config_parser_base.py +20 -0
  1349. vllm/transformers_utils/configs/__init__.py +84 -0
  1350. vllm/transformers_utils/configs/afmoe.py +87 -0
  1351. vllm/transformers_utils/configs/arctic.py +216 -0
  1352. vllm/transformers_utils/configs/chatglm.py +75 -0
  1353. vllm/transformers_utils/configs/deepseek_vl2.py +126 -0
  1354. vllm/transformers_utils/configs/dotsocr.py +71 -0
  1355. vllm/transformers_utils/configs/eagle.py +90 -0
  1356. vllm/transformers_utils/configs/falcon.py +89 -0
  1357. vllm/transformers_utils/configs/flex_olmo.py +82 -0
  1358. vllm/transformers_utils/configs/hunyuan_vl.py +322 -0
  1359. vllm/transformers_utils/configs/jais.py +243 -0
  1360. vllm/transformers_utils/configs/kimi_linear.py +148 -0
  1361. vllm/transformers_utils/configs/kimi_vl.py +38 -0
  1362. vllm/transformers_utils/configs/lfm2_moe.py +163 -0
  1363. vllm/transformers_utils/configs/medusa.py +65 -0
  1364. vllm/transformers_utils/configs/midashenglm.py +103 -0
  1365. vllm/transformers_utils/configs/mistral.py +235 -0
  1366. vllm/transformers_utils/configs/mlp_speculator.py +69 -0
  1367. vllm/transformers_utils/configs/moonvit.py +33 -0
  1368. vllm/transformers_utils/configs/nemotron.py +214 -0
  1369. vllm/transformers_utils/configs/nemotron_h.py +282 -0
  1370. vllm/transformers_utils/configs/olmo3.py +83 -0
  1371. vllm/transformers_utils/configs/ovis.py +182 -0
  1372. vllm/transformers_utils/configs/qwen3_next.py +275 -0
  1373. vllm/transformers_utils/configs/radio.py +89 -0
  1374. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1375. vllm/transformers_utils/configs/speculators/algos.py +38 -0
  1376. vllm/transformers_utils/configs/speculators/base.py +114 -0
  1377. vllm/transformers_utils/configs/step3_vl.py +178 -0
  1378. vllm/transformers_utils/configs/ultravox.py +118 -0
  1379. vllm/transformers_utils/dynamic_module.py +59 -0
  1380. vllm/transformers_utils/gguf_utils.py +209 -0
  1381. vllm/transformers_utils/processor.py +423 -0
  1382. vllm/transformers_utils/processors/__init__.py +23 -0
  1383. vllm/transformers_utils/processors/deepseek_ocr.py +438 -0
  1384. vllm/transformers_utils/processors/deepseek_vl2.py +406 -0
  1385. vllm/transformers_utils/processors/hunyuan_vl.py +233 -0
  1386. vllm/transformers_utils/processors/hunyuan_vl_image.py +477 -0
  1387. vllm/transformers_utils/processors/ovis.py +453 -0
  1388. vllm/transformers_utils/processors/ovis2_5.py +468 -0
  1389. vllm/transformers_utils/repo_utils.py +287 -0
  1390. vllm/transformers_utils/runai_utils.py +104 -0
  1391. vllm/transformers_utils/s3_utils.py +95 -0
  1392. vllm/transformers_utils/tokenizer.py +127 -0
  1393. vllm/transformers_utils/tokenizer_base.py +33 -0
  1394. vllm/transformers_utils/utils.py +184 -0
  1395. vllm/triton_utils/__init__.py +20 -0
  1396. vllm/triton_utils/importing.py +103 -0
  1397. vllm/usage/__init__.py +0 -0
  1398. vllm/usage/usage_lib.py +294 -0
  1399. vllm/utils/__init__.py +66 -0
  1400. vllm/utils/argparse_utils.py +504 -0
  1401. vllm/utils/async_utils.py +310 -0
  1402. vllm/utils/cache.py +214 -0
  1403. vllm/utils/collection_utils.py +112 -0
  1404. vllm/utils/counter.py +45 -0
  1405. vllm/utils/deep_gemm.py +399 -0
  1406. vllm/utils/flashinfer.py +532 -0
  1407. vllm/utils/func_utils.py +236 -0
  1408. vllm/utils/gc_utils.py +151 -0
  1409. vllm/utils/hashing.py +81 -0
  1410. vllm/utils/import_utils.py +449 -0
  1411. vllm/utils/jsontree.py +158 -0
  1412. vllm/utils/math_utils.py +32 -0
  1413. vllm/utils/mem_constants.py +13 -0
  1414. vllm/utils/mem_utils.py +232 -0
  1415. vllm/utils/nccl.py +64 -0
  1416. vllm/utils/network_utils.py +331 -0
  1417. vllm/utils/platform_utils.py +59 -0
  1418. vllm/utils/profiling.py +56 -0
  1419. vllm/utils/registry.py +51 -0
  1420. vllm/utils/serial_utils.py +169 -0
  1421. vllm/utils/system_utils.py +265 -0
  1422. vllm/utils/tensor_schema.py +255 -0
  1423. vllm/utils/torch_utils.py +647 -0
  1424. vllm/v1/__init__.py +0 -0
  1425. vllm/v1/attention/__init__.py +0 -0
  1426. vllm/v1/attention/backends/__init__.py +0 -0
  1427. vllm/v1/attention/backends/cpu_attn.py +497 -0
  1428. vllm/v1/attention/backends/flash_attn.py +1050 -0
  1429. vllm/v1/attention/backends/flashinfer.py +1572 -0
  1430. vllm/v1/attention/backends/flex_attention.py +945 -0
  1431. vllm/v1/attention/backends/gdn_attn.py +387 -0
  1432. vllm/v1/attention/backends/linear_attn.py +77 -0
  1433. vllm/v1/attention/backends/mamba1_attn.py +165 -0
  1434. vllm/v1/attention/backends/mamba2_attn.py +354 -0
  1435. vllm/v1/attention/backends/mamba_attn.py +117 -0
  1436. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1437. vllm/v1/attention/backends/mla/aiter_triton_mla.py +74 -0
  1438. vllm/v1/attention/backends/mla/common.py +2069 -0
  1439. vllm/v1/attention/backends/mla/cutlass_mla.py +278 -0
  1440. vllm/v1/attention/backends/mla/flashattn_mla.py +340 -0
  1441. vllm/v1/attention/backends/mla/flashinfer_mla.py +174 -0
  1442. vllm/v1/attention/backends/mla/flashmla.py +317 -0
  1443. vllm/v1/attention/backends/mla/flashmla_sparse.py +551 -0
  1444. vllm/v1/attention/backends/mla/indexer.py +369 -0
  1445. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +275 -0
  1446. vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py +325 -0
  1447. vllm/v1/attention/backends/mla/triton_mla.py +171 -0
  1448. vllm/v1/attention/backends/pallas.py +436 -0
  1449. vllm/v1/attention/backends/rocm_aiter_fa.py +1000 -0
  1450. vllm/v1/attention/backends/rocm_aiter_unified_attn.py +206 -0
  1451. vllm/v1/attention/backends/rocm_attn.py +359 -0
  1452. vllm/v1/attention/backends/short_conv_attn.py +105 -0
  1453. vllm/v1/attention/backends/tree_attn.py +428 -0
  1454. vllm/v1/attention/backends/triton_attn.py +377 -0
  1455. vllm/v1/attention/backends/utils.py +1149 -0
  1456. vllm/v1/core/__init__.py +0 -0
  1457. vllm/v1/core/block_pool.py +466 -0
  1458. vllm/v1/core/encoder_cache_manager.py +343 -0
  1459. vllm/v1/core/kv_cache_coordinator.py +570 -0
  1460. vllm/v1/core/kv_cache_manager.py +408 -0
  1461. vllm/v1/core/kv_cache_metrics.py +96 -0
  1462. vllm/v1/core/kv_cache_utils.py +1471 -0
  1463. vllm/v1/core/sched/__init__.py +0 -0
  1464. vllm/v1/core/sched/async_scheduler.py +68 -0
  1465. vllm/v1/core/sched/interface.py +187 -0
  1466. vllm/v1/core/sched/output.py +230 -0
  1467. vllm/v1/core/sched/request_queue.py +217 -0
  1468. vllm/v1/core/sched/scheduler.py +1726 -0
  1469. vllm/v1/core/sched/utils.py +72 -0
  1470. vllm/v1/core/single_type_kv_cache_manager.py +801 -0
  1471. vllm/v1/cudagraph_dispatcher.py +183 -0
  1472. vllm/v1/engine/__init__.py +214 -0
  1473. vllm/v1/engine/async_llm.py +874 -0
  1474. vllm/v1/engine/coordinator.py +377 -0
  1475. vllm/v1/engine/core.py +1421 -0
  1476. vllm/v1/engine/core_client.py +1406 -0
  1477. vllm/v1/engine/detokenizer.py +351 -0
  1478. vllm/v1/engine/exceptions.py +18 -0
  1479. vllm/v1/engine/input_processor.py +636 -0
  1480. vllm/v1/engine/llm_engine.py +416 -0
  1481. vllm/v1/engine/logprobs.py +189 -0
  1482. vllm/v1/engine/output_processor.py +658 -0
  1483. vllm/v1/engine/parallel_sampling.py +145 -0
  1484. vllm/v1/engine/processor.py +20 -0
  1485. vllm/v1/engine/utils.py +1068 -0
  1486. vllm/v1/executor/__init__.py +6 -0
  1487. vllm/v1/executor/abstract.py +352 -0
  1488. vllm/v1/executor/multiproc_executor.py +888 -0
  1489. vllm/v1/executor/ray_distributed_executor.py +8 -0
  1490. vllm/v1/executor/ray_executor.py +626 -0
  1491. vllm/v1/executor/ray_utils.py +465 -0
  1492. vllm/v1/executor/uniproc_executor.py +183 -0
  1493. vllm/v1/kv_cache_interface.py +404 -0
  1494. vllm/v1/kv_offload/__init__.py +0 -0
  1495. vllm/v1/kv_offload/abstract.py +161 -0
  1496. vllm/v1/kv_offload/arc_manager.py +237 -0
  1497. vllm/v1/kv_offload/backend.py +97 -0
  1498. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1499. vllm/v1/kv_offload/backends/cpu.py +62 -0
  1500. vllm/v1/kv_offload/cpu.py +86 -0
  1501. vllm/v1/kv_offload/factory.py +56 -0
  1502. vllm/v1/kv_offload/lru_manager.py +139 -0
  1503. vllm/v1/kv_offload/mediums.py +39 -0
  1504. vllm/v1/kv_offload/spec.py +66 -0
  1505. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1506. vllm/v1/kv_offload/worker/cpu_gpu.py +191 -0
  1507. vllm/v1/kv_offload/worker/worker.py +144 -0
  1508. vllm/v1/metrics/__init__.py +0 -0
  1509. vllm/v1/metrics/loggers.py +1268 -0
  1510. vllm/v1/metrics/prometheus.py +82 -0
  1511. vllm/v1/metrics/ray_wrappers.py +194 -0
  1512. vllm/v1/metrics/reader.py +257 -0
  1513. vllm/v1/metrics/stats.py +431 -0
  1514. vllm/v1/outputs.py +237 -0
  1515. vllm/v1/pool/__init__.py +0 -0
  1516. vllm/v1/pool/metadata.py +82 -0
  1517. vllm/v1/request.py +280 -0
  1518. vllm/v1/sample/__init__.py +0 -0
  1519. vllm/v1/sample/logits_processor/__init__.py +352 -0
  1520. vllm/v1/sample/logits_processor/builtin.py +278 -0
  1521. vllm/v1/sample/logits_processor/interface.py +106 -0
  1522. vllm/v1/sample/logits_processor/state.py +165 -0
  1523. vllm/v1/sample/metadata.py +44 -0
  1524. vllm/v1/sample/ops/__init__.py +0 -0
  1525. vllm/v1/sample/ops/bad_words.py +52 -0
  1526. vllm/v1/sample/ops/logprobs.py +25 -0
  1527. vllm/v1/sample/ops/penalties.py +57 -0
  1528. vllm/v1/sample/ops/topk_topp_sampler.py +384 -0
  1529. vllm/v1/sample/rejection_sampler.py +805 -0
  1530. vllm/v1/sample/sampler.py +319 -0
  1531. vllm/v1/sample/tpu/__init__.py +0 -0
  1532. vllm/v1/sample/tpu/metadata.py +120 -0
  1533. vllm/v1/sample/tpu/sampler.py +215 -0
  1534. vllm/v1/serial_utils.py +532 -0
  1535. vllm/v1/spec_decode/__init__.py +0 -0
  1536. vllm/v1/spec_decode/eagle.py +1325 -0
  1537. vllm/v1/spec_decode/medusa.py +73 -0
  1538. vllm/v1/spec_decode/metadata.py +66 -0
  1539. vllm/v1/spec_decode/metrics.py +225 -0
  1540. vllm/v1/spec_decode/ngram_proposer.py +291 -0
  1541. vllm/v1/spec_decode/suffix_decoding.py +101 -0
  1542. vllm/v1/spec_decode/utils.py +121 -0
  1543. vllm/v1/structured_output/__init__.py +338 -0
  1544. vllm/v1/structured_output/backend_guidance.py +265 -0
  1545. vllm/v1/structured_output/backend_lm_format_enforcer.py +177 -0
  1546. vllm/v1/structured_output/backend_outlines.py +324 -0
  1547. vllm/v1/structured_output/backend_types.py +136 -0
  1548. vllm/v1/structured_output/backend_xgrammar.py +362 -0
  1549. vllm/v1/structured_output/request.py +94 -0
  1550. vllm/v1/structured_output/utils.py +469 -0
  1551. vllm/v1/utils.py +414 -0
  1552. vllm/v1/worker/__init__.py +0 -0
  1553. vllm/v1/worker/block_table.py +343 -0
  1554. vllm/v1/worker/cpu_model_runner.py +122 -0
  1555. vllm/v1/worker/cpu_worker.py +210 -0
  1556. vllm/v1/worker/dp_utils.py +250 -0
  1557. vllm/v1/worker/ec_connector_model_runner_mixin.py +87 -0
  1558. vllm/v1/worker/gpu/README.md +4 -0
  1559. vllm/v1/worker/gpu/__init__.py +0 -0
  1560. vllm/v1/worker/gpu/async_utils.py +97 -0
  1561. vllm/v1/worker/gpu/attn_utils.py +189 -0
  1562. vllm/v1/worker/gpu/block_table.py +314 -0
  1563. vllm/v1/worker/gpu/cudagraph_utils.py +259 -0
  1564. vllm/v1/worker/gpu/dp_utils.py +31 -0
  1565. vllm/v1/worker/gpu/input_batch.py +430 -0
  1566. vllm/v1/worker/gpu/model_runner.py +1007 -0
  1567. vllm/v1/worker/gpu/sample/__init__.py +0 -0
  1568. vllm/v1/worker/gpu/sample/gumbel.py +101 -0
  1569. vllm/v1/worker/gpu/sample/logprob.py +167 -0
  1570. vllm/v1/worker/gpu/sample/metadata.py +179 -0
  1571. vllm/v1/worker/gpu/sample/penalties.py +154 -0
  1572. vllm/v1/worker/gpu/sample/sampler.py +75 -0
  1573. vllm/v1/worker/gpu/spec_decode/__init__.py +18 -0
  1574. vllm/v1/worker/gpu/spec_decode/eagle.py +565 -0
  1575. vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py +115 -0
  1576. vllm/v1/worker/gpu/spec_decode/rejection_sample.py +83 -0
  1577. vllm/v1/worker/gpu/states.py +309 -0
  1578. vllm/v1/worker/gpu/structured_outputs.py +76 -0
  1579. vllm/v1/worker/gpu_input_batch.py +971 -0
  1580. vllm/v1/worker/gpu_model_runner.py +5360 -0
  1581. vllm/v1/worker/gpu_ubatch_wrapper.py +472 -0
  1582. vllm/v1/worker/gpu_worker.py +922 -0
  1583. vllm/v1/worker/kv_connector_model_runner_mixin.py +309 -0
  1584. vllm/v1/worker/lora_model_runner_mixin.py +212 -0
  1585. vllm/v1/worker/tpu_input_batch.py +583 -0
  1586. vllm/v1/worker/tpu_model_runner.py +2196 -0
  1587. vllm/v1/worker/tpu_worker.py +351 -0
  1588. vllm/v1/worker/ubatch_utils.py +73 -0
  1589. vllm/v1/worker/ubatching.py +231 -0
  1590. vllm/v1/worker/utils.py +365 -0
  1591. vllm/v1/worker/worker_base.py +377 -0
  1592. vllm/v1/worker/xpu_model_runner.py +48 -0
  1593. vllm/v1/worker/xpu_worker.py +198 -0
  1594. vllm/version.py +39 -0
  1595. vllm/vllm_flash_attn/.gitkeep +0 -0
  1596. vllm_cpu-0.12.0.dist-info/METADATA +300 -0
  1597. vllm_cpu-0.12.0.dist-info/RECORD +1600 -0
  1598. vllm_cpu-0.12.0.dist-info/WHEEL +5 -0
  1599. vllm_cpu-0.12.0.dist-info/entry_points.txt +5 -0
  1600. vllm_cpu-0.12.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2465 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+ # Adapted from
+ # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
+ import json
+ import time
+ from http import HTTPStatus
+ from typing import Annotated, Any, ClassVar, Literal, TypeAlias
+
+ import regex as re
+ import torch
+ from fastapi import HTTPException, UploadFile
+ from openai.types.chat.chat_completion_audio import (
+     ChatCompletionAudio as OpenAIChatCompletionAudio,
+ )
+ from openai.types.chat.chat_completion_message import Annotation as OpenAIAnnotation
+ from openai.types.responses import (
+     ResponseCodeInterpreterCallCodeDeltaEvent,
+     ResponseCodeInterpreterCallCodeDoneEvent,
+     ResponseCodeInterpreterCallCompletedEvent,
+     ResponseCodeInterpreterCallInProgressEvent,
+     ResponseCodeInterpreterCallInterpretingEvent,
+     ResponseContentPartAddedEvent,
+     ResponseContentPartDoneEvent,
+     ResponseFunctionToolCall,
+     ResponseInputItemParam,
+     ResponseOutputItem,
+     ResponseOutputItemAddedEvent,
+     ResponseOutputItemDoneEvent,
+     ResponsePrompt,
+     ResponseReasoningTextDeltaEvent,
+     ResponseReasoningTextDoneEvent,
+     ResponseStatus,
+     ResponseWebSearchCallCompletedEvent,
+     ResponseWebSearchCallInProgressEvent,
+     ResponseWebSearchCallSearchingEvent,
+ )
+ from openai.types.responses import (
+     ResponseCompletedEvent as OpenAIResponseCompletedEvent,
+ )
+ from openai.types.responses import ResponseCreatedEvent as OpenAIResponseCreatedEvent
+ from openai.types.responses import (
+     ResponseInProgressEvent as OpenAIResponseInProgressEvent,
+ )
+ from openai.types.responses.response_reasoning_item import (
+     Content as ResponseReasoningTextContent,
+ )
+ from openai_harmony import Message as OpenAIHarmonyMessage
+
+ # Backward compatibility for OpenAI client versions
+ try:  # For older openai versions (< 1.100.0)
+     from openai.types.responses import ResponseTextConfig
+ except ImportError:  # For newer openai versions (>= 1.100.0)
+     from openai.types.responses import ResponseFormatTextConfig as ResponseTextConfig
+
+
+ from openai.types.responses.response import IncompleteDetails, ToolChoice
+ from openai.types.responses.tool import Tool
+ from openai.types.shared import Metadata, Reasoning
+ from pydantic import (
+     BaseModel,
+     ConfigDict,
+     Field,
+     ValidationError,
+     field_serializer,
+     model_validator,
+ )
+
+ from vllm.entrypoints.chat_utils import ChatCompletionMessageParam, make_tool_call_id
+ from vllm.logger import init_logger
+ from vllm.logprobs import Logprob
+ from vllm.sampling_params import (
+     BeamSearchParams,
+     RequestOutputKind,
+     SamplingParams,
+     StructuredOutputsParams,
+ )
+ from vllm.utils import random_uuid
+ from vllm.utils.import_utils import resolve_obj_by_qualname
+
+ logger = init_logger(__name__)
+
+ _LONG_INFO = torch.iinfo(torch.long)
+
+
+ class OpenAIBaseModel(BaseModel):
+     # OpenAI API does allow extra fields
+     model_config = ConfigDict(extra="allow")
+
+     # Cache class field names
+     field_names: ClassVar[set[str] | None] = None
+
+     @model_validator(mode="wrap")
+     @classmethod
+     def __log_extra_fields__(cls, data, handler):
+         result = handler(data)
+         if not isinstance(data, dict):
+             return result
+         field_names = cls.field_names
+         if field_names is None:
+             # Get all class field names and their potential aliases
+             field_names = set()
+             for field_name, field in cls.model_fields.items():
+                 field_names.add(field_name)
+                 if alias := getattr(field, "alias", None):
+                     field_names.add(alias)
+             cls.field_names = field_names
+
+         # Compare against both field names and aliases
+         if any(k not in field_names for k in data):
+             logger.warning(
+                 "The following fields were present in the request but ignored: %s",
+                 data.keys() - field_names,
+             )
+         return result
+
+
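The wrap-mode validator above logs unrecognized request fields instead of rejecting them, mirroring the OpenAI API's tolerance for extra fields, and caches the field/alias names per class. A minimal self-contained sketch of the same pattern, assuming nothing beyond pydantic v2 (the `Example` model and printed message are illustrative, not part of this package):

```python
from pydantic import BaseModel, ConfigDict, model_validator


class Example(BaseModel):
    model_config = ConfigDict(extra="allow")  # accept unknown fields
    name: str

    @model_validator(mode="wrap")
    @classmethod
    def _warn_extra(cls, data, handler):
        result = handler(data)  # run normal validation first
        if isinstance(data, dict):
            # anything not declared on the model is accepted but flagged
            extra = data.keys() - cls.model_fields.keys()
            if extra:
                print(f"ignored fields: {extra}")
        return result


Example(name="a", unknown=1)  # prints: ignored fields: {'unknown'}
```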
+ class ErrorInfo(OpenAIBaseModel):
+     message: str
+     type: str
+     param: str | None = None
+     code: int
+
+
+ class ErrorResponse(OpenAIBaseModel):
+     error: ErrorInfo
+
+
+ class ModelPermission(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
+     object: str = "model_permission"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     allow_create_engine: bool = False
+     allow_sampling: bool = True
+     allow_logprobs: bool = True
+     allow_search_indices: bool = False
+     allow_view: bool = True
+     allow_fine_tuning: bool = False
+     organization: str = "*"
+     group: str | None = None
+     is_blocking: bool = False
+
+
+ class ModelCard(OpenAIBaseModel):
+     id: str
+     object: str = "model"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     owned_by: str = "vllm"
+     root: str | None = None
+     parent: str | None = None
+     max_model_len: int | None = None
+     permission: list[ModelPermission] = Field(default_factory=list)
+
+
+ class ModelList(OpenAIBaseModel):
+     object: str = "list"
+     data: list[ModelCard] = Field(default_factory=list)
+
+
+ class PromptTokenUsageInfo(OpenAIBaseModel):
+     cached_tokens: int | None = None
+
+
+ class UsageInfo(OpenAIBaseModel):
+     prompt_tokens: int = 0
+     total_tokens: int = 0
+     completion_tokens: int | None = 0
+     prompt_tokens_details: PromptTokenUsageInfo | None = None
+
+
+ class RequestResponseMetadata(BaseModel):
+     request_id: str
+     final_usage_info: UsageInfo | None = None
+
+
+ class JsonSchemaResponseFormat(OpenAIBaseModel):
+     name: str
+     description: str | None = None
+     # schema is the field in openai but that causes conflicts with pydantic so
+     # instead use json_schema with an alias
+     json_schema: dict[str, Any] | None = Field(default=None, alias="schema")
+     strict: bool | None = None
+
+
+ class LegacyStructuralTag(OpenAIBaseModel):
+     begin: str
+     # schema is the field, but that causes conflicts with pydantic so
+     # instead use structural_tag_schema with an alias
+     structural_tag_schema: dict[str, Any] | None = Field(default=None, alias="schema")
+     end: str
+
+
+ class LegacyStructuralTagResponseFormat(OpenAIBaseModel):
+     type: Literal["structural_tag"]
+     structures: list[LegacyStructuralTag]
+     triggers: list[str]
+
+
+ class StructuralTagResponseFormat(OpenAIBaseModel):
+     type: Literal["structural_tag"]
+     format: Any
+
+
+ AnyStructuralTagResponseFormat: TypeAlias = (
+     LegacyStructuralTagResponseFormat | StructuralTagResponseFormat
+ )
+
+
+ class ResponseFormat(OpenAIBaseModel):
+     # type must be "json_schema", "json_object", or "text"
+     type: Literal["text", "json_object", "json_schema"]
+     json_schema: JsonSchemaResponseFormat | None = None
+
+
+ AnyResponseFormat: TypeAlias = (
+     ResponseFormat | StructuralTagResponseFormat | LegacyStructuralTagResponseFormat
+ )
+
+
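Because `schema` collides with pydantic's own namespace, the models above expose `json_schema`/`structural_tag_schema` internally while accepting `schema` from the wire through an alias. An illustrative `response_format` fragment that these classes would parse (the field values are made up):

```python
# Hypothetical request fragment; field names follow JsonSchemaResponseFormat
# and ResponseFormat above, and "schema" is accepted through the alias.
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "person",
        "schema": {
            "type": "object",
            "properties": {"name": {"type": "string"}},
            "required": ["name"],
        },
        "strict": True,
    },
}
```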
+ class StreamOptions(OpenAIBaseModel):
+     include_usage: bool | None = True
+     continuous_usage_stats: bool | None = False
+
+
+ class FunctionDefinition(OpenAIBaseModel):
+     name: str
+     description: str | None = None
+     parameters: dict[str, Any] | None = None
+
+
+ class ChatCompletionToolsParam(OpenAIBaseModel):
+     type: Literal["function"] = "function"
+     function: FunctionDefinition
+
+
+ class ChatCompletionNamedFunction(OpenAIBaseModel):
+     name: str
+
+
+ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
+     function: ChatCompletionNamedFunction
+     type: Literal["function"] = "function"
+
+
+ # extra="forbid" is a workaround to have kwargs as a field,
+ # see https://github.com/pydantic/pydantic/issues/3125
+ class LogitsProcessorConstructor(BaseModel):
+     qualname: str
+     args: list[Any] | None = None
+     kwargs: dict[str, Any] | None = None
+
+     model_config = ConfigDict(extra="forbid")
+
+
+ LogitsProcessors = list[str | LogitsProcessorConstructor]
+
+
+ def get_logits_processors(
+     processors: LogitsProcessors | None, pattern: str | None
+ ) -> list[Any] | None:
+     if processors and pattern:
+         logits_processors = []
+         for processor in processors:
+             qualname = processor if isinstance(processor, str) else processor.qualname
+             if not re.match(pattern, qualname):
+                 raise ValueError(
+                     f"Logits processor '{qualname}' is not allowed by this "
+                     "server. See --logits-processor-pattern engine argument "
+                     "for more information."
+                 )
+             try:
+                 logits_processor = resolve_obj_by_qualname(qualname)
+             except Exception as e:
+                 raise ValueError(
+                     f"Logits processor '{qualname}' could not be resolved: {e}"
+                 ) from e
+             if isinstance(processor, LogitsProcessorConstructor):
+                 logits_processor = logits_processor(
+                     *processor.args or [], **processor.kwargs or {}
+                 )
+             logits_processors.append(logits_processor)
+         return logits_processors
+     elif processors:
+         raise ValueError(
+             "The `logits_processors` argument is not supported by this "
+             "server. See --logits-processor-pattern engine argument "
+             "for more information."
+         )
+     return None
+
+
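`get_logits_processors` gates every entry against the server's `--logits-processor-pattern` before resolving it and, for constructor entries, instantiating it. A standalone sketch of that gate, where the pattern and `my_plugins.TemperatureScaler` are hypothetical stand-ins:

```python
import re

pattern = r"my_plugins\."  # what --logits-processor-pattern might allow
payload = {"qualname": "my_plugins.TemperatureScaler", "args": [0.5], "kwargs": {}}

if not re.match(pattern, payload["qualname"]):
    raise ValueError("logits processor not allowed by this server")
# After resolve_obj_by_qualname(payload["qualname"]), a constructor entry
# is called as: cls(*payload["args"] or [], **payload["kwargs"] or {})
```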
+ ResponseInputOutputItem: TypeAlias = ResponseInputItemParam | ResponseOutputItem
+
+
+ class ResponsesRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/responses/create
+     background: bool | None = False
+     include: (
+         list[
+             Literal[
+                 "code_interpreter_call.outputs",
+                 "computer_call_output.output.image_url",
+                 "file_search_call.results",
+                 "message.input_image.image_url",
+                 "message.output_text.logprobs",
+                 "reasoning.encrypted_content",
+             ],
+         ]
+         | None
+     ) = None
+     input: str | list[ResponseInputOutputItem]
+     instructions: str | None = None
+     max_output_tokens: int | None = None
+     max_tool_calls: int | None = None
+     metadata: Metadata | None = None
+     model: str | None = None
+     parallel_tool_calls: bool | None = True
+     previous_response_id: str | None = None
+     prompt: ResponsePrompt | None = None
+     reasoning: Reasoning | None = None
+     service_tier: Literal["auto", "default", "flex", "scale", "priority"] = "auto"
+     store: bool | None = True
+     stream: bool | None = False
+     temperature: float | None = None
+     text: ResponseTextConfig | None = None
+     tool_choice: ToolChoice = "auto"
+     tools: list[Tool] = Field(default_factory=list)
+     top_logprobs: int | None = 0
+     top_p: float | None = None
+     truncation: Literal["auto", "disabled"] | None = "disabled"
+     user: str | None = None
+
+     # --8<-- [start:responses-extra-params]
+     request_id: str = Field(
+         default_factory=lambda: f"resp_{random_uuid()}",
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the response."
+         ),
+     )
+     mm_processor_kwargs: dict[str, Any] | None = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."
+         ),
+     )
+     cache_salt: str | None = Field(
+         default=None,
+         description=(
+             "If specified, the prefix cache will be salted with the provided "
+             "string to prevent an attacker from guessing prompts in multi-user "
+             "environments. The salt should be random, protected from "
+             "access by 3rd parties, and long enough to be "
+             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+             "to 256 bit)."
+         ),
+     )
+
+     enable_response_messages: bool = Field(
+         default=False,
+         description=(
+             "Dictates whether or not to return messages as part of the "
+             "response object. Currently only supported for "
+             "non-background requests and gpt-oss models."
+         ),
+     )
+     # similar to input_messages / output_messages in ResponsesResponse
+     # we take in previous_input_messages (i.e. in harmony format)
+     # this cannot be used in conjunction with previous_response_id
+     # TODO: consider supporting non-harmony messages as well
+     previous_input_messages: list[OpenAIHarmonyMessage | dict] | None = None
+     # --8<-- [end:responses-extra-params]
+
+     _DEFAULT_SAMPLING_PARAMS = {
+         "temperature": 1.0,
+         "top_p": 1.0,
+     }
+
+     def to_sampling_params(
+         self,
+         default_max_tokens: int,
+         default_sampling_params: dict | None = None,
+     ) -> SamplingParams:
+         if self.max_output_tokens is None:
+             max_tokens = default_max_tokens
+         else:
+             max_tokens = min(self.max_output_tokens, default_max_tokens)
+
+         default_sampling_params = default_sampling_params or {}
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+             )
+         if (top_p := self.top_p) is None:
+             top_p = default_sampling_params.get(
+                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
+             )
+         stop_token_ids = default_sampling_params.get("stop_token_ids")
+
+         # Structured output
+         structured_outputs = None
+         if self.text is not None and self.text.format is not None:
+             response_format = self.text.format
+             if (
+                 response_format.type == "json_schema"
+                 and response_format.schema_ is not None
+             ):
+                 structured_outputs = StructuredOutputsParams(
+                     json=response_format.schema_
+                 )
+             elif response_format.type == "json_object":
+                 raise NotImplementedError("json_object is not supported")
+
+         # TODO: add more parameters
+         return SamplingParams.from_optional(
+             temperature=temperature,
+             top_p=top_p,
+             max_tokens=max_tokens,
+             logprobs=self.top_logprobs if self.is_include_output_logprobs() else None,
+             stop_token_ids=stop_token_ids,
+             output_kind=(
+                 RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY
+             ),
+             structured_outputs=structured_outputs,
+         )
+
+     def is_include_output_logprobs(self) -> bool:
+         """Check if the request includes output logprobs."""
+         if self.include is None:
+             return False
+         return (
+             isinstance(self.include, list)
+             and "message.output_text.logprobs" in self.include
+         )
+
+     @model_validator(mode="before")
+     def validate_background(cls, data):
+         if not data.get("background"):
+             return data
+         if not data.get("store", True):
+             raise ValueError("background can only be used when `store` is true")
+         return data
+
+     @model_validator(mode="before")
+     def validate_prompt(cls, data):
+         if data.get("prompt") is not None:
+             raise ValueError("prompt template is not supported")
+         return data
+
+     @model_validator(mode="before")
+     def check_cache_salt_support(cls, data):
+         if data.get("cache_salt") is not None and (
+             not isinstance(data["cache_salt"], str) or not data["cache_salt"]
+         ):
+             raise ValueError(
+                 "Parameter 'cache_salt' must be a non-empty string if provided."
+             )
+         return data
+
+     @model_validator(mode="before")
+     def function_call_parsing(cls, data):
+         """Parse function_call dictionaries into ResponseFunctionToolCall objects.
+         This ensures Pydantic can properly resolve union types in the input field.
+         Function calls provided as dicts are converted to ResponseFunctionToolCall
+         objects before validation, while invalid structures are left for Pydantic
+         to reject with appropriate error messages.
+         """
+
+         input_data = data.get("input")
+
+         # Early return for None, strings, or bytes
+         # (strings are iterable but shouldn't be processed)
+         if input_data is None or isinstance(input_data, (str, bytes)):
+             return data
+
+         # Convert iterators (like ValidatorIterator) to list
+         if not isinstance(input_data, list):
+             try:
+                 input_data = list(input_data)
+             except TypeError:
+                 # Not iterable, leave as-is for Pydantic to handle
+                 return data
+
+         processed_input = []
+         for item in input_data:
+             if isinstance(item, dict) and item.get("type") == "function_call":
+                 try:
+                     processed_input.append(ResponseFunctionToolCall(**item))
+                 except ValidationError:
+                     # Let Pydantic handle validation for malformed function calls
+                     logger.debug(
+                         "Failed to parse function_call to ResponseFunctionToolCall, "
+                         "leaving for Pydantic validation"
+                     )
+                     processed_input.append(item)
+             else:
+                 processed_input.append(item)
+
+         data["input"] = processed_input
+         return data
+
+
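Note that for the Responses API, output logprobs are opted into through the `include` list rather than a boolean flag, as `is_include_output_logprobs` above shows. An illustrative request fragment (the values are made up):

```python
# Hypothetical Responses API request body fragment.
req = {
    "input": "Hello",
    "include": ["message.output_text.logprobs"],  # enables logprobs
    "top_logprobs": 5,  # forwarded to SamplingParams only when included
    "stream": True,  # DELTA output kind; otherwise FINAL_ONLY
}
```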
+ class ChatCompletionRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/chat/create
+     messages: list[ChatCompletionMessageParam]
+     model: str | None = None
+     frequency_penalty: float | None = 0.0
+     logit_bias: dict[str, float] | None = None
+     logprobs: bool | None = False
+     top_logprobs: int | None = 0
+     max_tokens: int | None = Field(
+         default=None,
+         deprecated="max_tokens is deprecated in favor of "
+         "the max_completion_tokens field",
+     )
+     max_completion_tokens: int | None = None
+     n: int | None = 1
+     presence_penalty: float | None = 0.0
+     response_format: AnyResponseFormat | None = None
+     seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+     stop: str | list[str] | None = []
+     stream: bool | None = False
+     stream_options: StreamOptions | None = None
+     temperature: float | None = None
+     top_p: float | None = None
+     tools: list[ChatCompletionToolsParam] | None = None
+     tool_choice: (
+         Literal["none"]
+         | Literal["auto"]
+         | Literal["required"]
+         | ChatCompletionNamedToolChoiceParam
+         | None
+     ) = "none"
+     reasoning_effort: Literal["low", "medium", "high"] | None = None
+     include_reasoning: bool = True
+     parallel_tool_calls: bool | None = True
+
+     # NOTE this will be ignored by vLLM
+     user: str | None = None
+
+     # --8<-- [start:chat-completion-sampling-params]
+     use_beam_search: bool = False
+     top_k: int | None = None
+     min_p: float | None = None
+     repetition_penalty: float | None = None
+     length_penalty: float = 1.0
+     stop_token_ids: list[int] | None = []
+     include_stop_str_in_output: bool = False
+     ignore_eos: bool = False
+     min_tokens: int = 0
+     skip_special_tokens: bool = True
+     spaces_between_special_tokens: bool = True
+     truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
+     prompt_logprobs: int | None = None
+     allowed_token_ids: list[int] | None = None
+     bad_words: list[str] = Field(default_factory=list)
+     # --8<-- [end:chat-completion-sampling-params]
+
+     # --8<-- [start:chat-completion-extra-params]
+     echo: bool = Field(
+         default=False,
+         description=(
+             "If true, the new message will be prepended with the last message "
+             "if they belong to the same role."
+         ),
+     )
+     add_generation_prompt: bool = Field(
+         default=True,
+         description=(
+             "If true, the generation prompt will be added to the chat template. "
+             "This is a parameter used by chat template in tokenizer config of the "
+             "model."
+         ),
+     )
+     continue_final_message: bool = Field(
+         default=False,
+         description=(
+             "If this is set, the chat will be formatted so that the final "
+             "message in the chat is open-ended, without any EOS tokens. The "
+             "model will continue this message rather than starting a new one. "
+             'This allows you to "prefill" part of the model\'s response for it. '
+             "Cannot be used at the same time as `add_generation_prompt`."
+         ),
+     )
+     add_special_tokens: bool = Field(
+         default=False,
+         description=(
+             "If true, special tokens (e.g. BOS) will be added to the prompt "
+             "on top of what is added by the chat template. "
+             "For most models, the chat template takes care of adding the "
+             "special tokens so this should be set to false (as is the "
+             "default)."
+         ),
+     )
+     documents: list[dict[str, str]] | None = Field(
+         default=None,
+         description=(
+             "A list of dicts representing documents that will be accessible to "
+             "the model if it is performing RAG (retrieval-augmented generation)."
+             " If the template does not support RAG, this argument will have no "
+             "effect. We recommend that each document should be a dict containing "
+             '"title" and "text" keys.'
+         ),
+     )
+     chat_template: str | None = Field(
+         default=None,
+         description=(
+             "A Jinja template to use for this conversion. "
+             "As of transformers v4.44, default chat template is no longer "
+             "allowed, so you must provide a chat template if the tokenizer "
+             "does not define one."
+         ),
+     )
+     chat_template_kwargs: dict[str, Any] | None = Field(
+         default=None,
+         description=(
+             "Additional keyword args to pass to the template renderer. "
+             "Will be accessible by the chat template."
+         ),
+     )
+     mm_processor_kwargs: dict[str, Any] | None = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     structured_outputs: StructuredOutputsParams | None = Field(
+         default=None,
+         description="Additional kwargs for structured outputs",
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."
+         ),
+     )
+     request_id: str = Field(
+         default_factory=random_uuid,
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the response."
+         ),
+     )
+     logits_processors: LogitsProcessors | None = Field(
+         default=None,
+         description=(
+             "A list of either qualified names of logits processors, or "
+             "constructor objects, to apply when sampling. A constructor is "
+             "a JSON object with a required 'qualname' field specifying the "
+             "qualified name of the processor class/factory, and optional "
+             "'args' and 'kwargs' fields containing positional and keyword "
+             "arguments. For example: {'qualname': "
+             "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
+             "{'param': 'value'}}."
+         ),
+     )
+     return_tokens_as_token_ids: bool | None = Field(
+         default=None,
+         description=(
+             "If specified with 'logprobs', tokens are represented "
+             "as strings of the form 'token_id:{token_id}' so that tokens "
+             "that are not JSON-encodable can be identified."
+         ),
+     )
+     return_token_ids: bool | None = Field(
+         default=None,
+         description=(
+             "If specified, the result will include token IDs alongside the "
+             "generated text. In streaming mode, prompt_token_ids is included "
+             "only in the first chunk, and token_ids contains the delta tokens "
+             "for each chunk. This is useful for debugging or when you "
+             "need to map generated text back to input tokens."
+         ),
+     )
+     cache_salt: str | None = Field(
+         default=None,
+         description=(
+             "If specified, the prefix cache will be salted with the provided "
+             "string to prevent an attacker from guessing prompts in multi-user "
+             "environments. The salt should be random, protected from "
+             "access by 3rd parties, and long enough to be "
+             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+             "to 256 bit)."
+         ),
+     )
+     kv_transfer_params: dict[str, Any] | None = Field(
+         default=None,
+         description="KVTransfer parameters used for disaggregated serving.",
+     )
+
+     vllm_xargs: dict[str, str | int | float | list[str | int | float]] | None = Field(
+         default=None,
+         description=(
+             "Additional request parameters with (list of) string or "
+             "numeric values, used by custom extensions."
+         ),
+     )
+
+     # --8<-- [end:chat-completion-extra-params]
+
+     # Default sampling parameters for chat completion requests
+     _DEFAULT_SAMPLING_PARAMS: dict = {
+         "repetition_penalty": 1.0,
+         "temperature": 1.0,
+         "top_p": 1.0,
+         "top_k": 0,
+         "min_p": 0.0,
+     }
+
+     def to_beam_search_params(
+         self, max_tokens: int, default_sampling_params: dict
+     ) -> BeamSearchParams:
+         n = self.n if self.n is not None else 1
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+             )
+
+         return BeamSearchParams(
+             beam_width=n,
+             max_tokens=max_tokens,
+             ignore_eos=self.ignore_eos,
+             temperature=temperature,
+             length_penalty=self.length_penalty,
+             include_stop_str_in_output=self.include_stop_str_in_output,
+         )
+
+     def to_sampling_params(
+         self,
+         max_tokens: int,
+         logits_processor_pattern: str | None,
+         default_sampling_params: dict,
+     ) -> SamplingParams:
+         # Default parameters
+         if (repetition_penalty := self.repetition_penalty) is None:
+             repetition_penalty = default_sampling_params.get(
+                 "repetition_penalty",
+                 self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+             )
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+             )
+         if (top_p := self.top_p) is None:
+             top_p = default_sampling_params.get(
+                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
+             )
+         if (top_k := self.top_k) is None:
+             top_k = default_sampling_params.get(
+                 "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
+             )
+         if (min_p := self.min_p) is None:
+             min_p = default_sampling_params.get(
+                 "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
+             )
+
+         prompt_logprobs = self.prompt_logprobs
+         if prompt_logprobs is None and self.echo:
+             prompt_logprobs = self.top_logprobs
+
+         response_format = self.response_format
+         if response_format is not None:
+             # If structured outputs wasn't already enabled,
+             # we must enable it for these features to work
+             if self.structured_outputs is None:
+                 self.structured_outputs = StructuredOutputsParams()
+
+             # Set structured output params for response format
+             if response_format.type == "json_object":
+                 self.structured_outputs.json_object = True
+             elif response_format.type == "json_schema":
+                 json_schema = response_format.json_schema
+                 assert json_schema is not None
+                 self.structured_outputs.json = json_schema.json_schema
+             elif response_format.type == "structural_tag":
+                 structural_tag = response_format
+                 assert structural_tag is not None and isinstance(
+                     structural_tag,
+                     (
+                         LegacyStructuralTagResponseFormat,
+                         StructuralTagResponseFormat,
+                     ),
+                 )
+                 s_tag_obj = structural_tag.model_dump(by_alias=True)
+                 self.structured_outputs.structural_tag = json.dumps(s_tag_obj)
+
+         extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
+         if self.kv_transfer_params:
+             # Pass in kv_transfer_params via extra_args
+             extra_args["kv_transfer_params"] = self.kv_transfer_params
+         return SamplingParams.from_optional(
+             n=self.n,
+             presence_penalty=self.presence_penalty,
+             frequency_penalty=self.frequency_penalty,
+             repetition_penalty=repetition_penalty,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             min_p=min_p,
+             seed=self.seed,
+             stop=self.stop,
+             stop_token_ids=self.stop_token_ids,
+             logprobs=self.top_logprobs if self.logprobs else None,
+             prompt_logprobs=prompt_logprobs,
+             ignore_eos=self.ignore_eos,
+             max_tokens=max_tokens,
+             min_tokens=self.min_tokens,
+             skip_special_tokens=self.skip_special_tokens,
+             spaces_between_special_tokens=self.spaces_between_special_tokens,
+             logits_processors=get_logits_processors(
+                 self.logits_processors, logits_processor_pattern
+             ),
+             include_stop_str_in_output=self.include_stop_str_in_output,
+             truncate_prompt_tokens=self.truncate_prompt_tokens,
+             output_kind=RequestOutputKind.DELTA
+             if self.stream
+             else RequestOutputKind.FINAL_ONLY,
+             structured_outputs=self.structured_outputs,
+             logit_bias=self.logit_bias,
+             bad_words=self.bad_words,
+             allowed_token_ids=self.allowed_token_ids,
+             extra_args=extra_args or None,
+         )
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_stream_options(cls, data):
+         if data.get("stream_options") and not data.get("stream"):
+             raise ValueError("Stream options can only be defined when `stream=True`.")
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_logprobs(cls, data):
+         if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
+             if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
+                 raise ValueError(
+                     "`prompt_logprobs` are not available when `stream=True`."
+                 )
+
+             if prompt_logprobs < 0 and prompt_logprobs != -1:
+                 raise ValueError(
+                     "`prompt_logprobs` must be a non-negative value or -1."
+                 )
+         if (top_logprobs := data.get("top_logprobs")) is not None:
+             if top_logprobs < 0 and top_logprobs != -1:
+                 raise ValueError("`top_logprobs` must be a non-negative value or -1.")
+
+             if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"):
+                 raise ValueError(
+                     "when using `top_logprobs`, `logprobs` must be set to true."
+                 )
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_structured_outputs_count(cls, data):
+         if isinstance(data, ValueError):
+             raise data
+
+         if data.get("structured_outputs", None) is None:
+             return data
+
+         structured_outputs_kwargs = data["structured_outputs"]
+         count = sum(
+             structured_outputs_kwargs.get(k) is not None
+             for k in ("json", "regex", "choice")
+         )
+         # you can only use one kind of constraints for structured outputs
+         if count > 1:
+             raise ValueError(
+                 "You can only use one kind of constraints for structured "
+                 "outputs ('json', 'regex' or 'choice')."
+             )
+         # you can only either use structured outputs or tools, not both
+         if count > 1 and data.get("tool_choice", "none") not in (
+             "none",
+             "auto",
+             "required",
+         ):
+             raise ValueError(
+                 "You can only either use constraints for structured outputs "
+                 "or tools, not both."
+             )
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_tool_usage(cls, data):
+         # if "tool_choice" is not specified but tools are provided,
+         # default to "auto" tool_choice
+         if "tool_choice" not in data and data.get("tools"):
+             data["tool_choice"] = "auto"
+
+         # if "tool_choice" is "none" -- no validation is needed for tools
+         if "tool_choice" in data and data["tool_choice"] == "none":
+             return data
+
+         # if "tool_choice" is specified -- validation
+         if "tool_choice" in data and data["tool_choice"] is not None:
+             # ensure that if "tool choice" is specified, tools are present
+             if "tools" not in data or data["tools"] is None:
+                 raise ValueError("When using `tool_choice`, `tools` must be set.")
+
+             # make sure that tool choice is either a named tool
+             # OR that it's set to "auto" or "required"
+             if data["tool_choice"] not in ["auto", "required"] and not isinstance(
+                 data["tool_choice"], dict
+             ):
+                 raise ValueError(
+                     f"Invalid value for `tool_choice`: {data['tool_choice']}! "
+                     'Only named tools, "none", "auto" or "required" '
+                     "are supported."
+                 )
+
+             # if tool_choice is "required" but the "tools" list is empty,
+             # override the data to behave like "none" to align with
+             # OpenAI's behavior.
+             if (
+                 data["tool_choice"] == "required"
+                 and isinstance(data["tools"], list)
+                 and len(data["tools"]) == 0
+             ):
+                 data["tool_choice"] = "none"
+                 del data["tools"]
+                 return data
+
+             # ensure that if "tool_choice" is specified as an object,
+             # it matches a valid tool
+             correct_usage_message = (
+                 'Correct usage: `{"type": "function",'
+                 ' "function": {"name": "my_function"}}`'
+             )
+             if isinstance(data["tool_choice"], dict):
+                 valid_tool = False
+                 function = data["tool_choice"].get("function")
+                 if not isinstance(function, dict):
+                     raise ValueError(
+                         f"Invalid value for `function`: `{function}` in "
+                         f"`tool_choice`! {correct_usage_message}"
+                     )
+                 if "name" not in function:
+                     raise ValueError(
+                         f"Expected field `name` in `function` in "
+                         f"`tool_choice`! {correct_usage_message}"
+                     )
+                 function_name = function["name"]
+                 if not isinstance(function_name, str) or len(function_name) == 0:
+                     raise ValueError(
+                         f"Invalid `name` in `function`: `{function_name}`"
+                         f" in `tool_choice`! {correct_usage_message}"
+                     )
+                 for tool in data["tools"]:
+                     if tool["function"]["name"] == function_name:
+                         valid_tool = True
+                         break
+                 if not valid_tool:
+                     raise ValueError(
+                         "The tool specified in `tool_choice` does not match any"
+                         " of the specified `tools`"
+                     )
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_generation_prompt(cls, data):
+         if data.get("continue_final_message") and data.get("add_generation_prompt"):
+             raise ValueError(
+                 "Cannot set both `continue_final_message` and "
+                 "`add_generation_prompt` to True."
+             )
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_cache_salt_support(cls, data):
+         if data.get("cache_salt") is not None and (
+             not isinstance(data["cache_salt"], str) or not data["cache_salt"]
+         ):
+             raise ValueError(
+                 "Parameter 'cache_salt' must be a non-empty string if provided."
+             )
+         return data
+
+
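Each sampling knob in `to_sampling_params` resolves with the same precedence: the explicit request value wins, then the server's `default_sampling_params`, then the class-level `_DEFAULT_SAMPLING_PARAMS` fallback. A standalone sketch of that cascade with hypothetical values:

```python
_FALLBACK = {"temperature": 1.0}  # stands in for _DEFAULT_SAMPLING_PARAMS


def resolve(request_value, server_defaults: dict, key: str):
    # request value wins; otherwise server default; otherwise class fallback
    if request_value is not None:
        return request_value
    return server_defaults.get(key, _FALLBACK[key])


assert resolve(None, {}, "temperature") == 1.0
assert resolve(None, {"temperature": 0.6}, "temperature") == 0.6
assert resolve(0.2, {"temperature": 0.6}, "temperature") == 0.2
```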
997
+ class CompletionRequest(OpenAIBaseModel):
998
+ # Ordered by official OpenAI API documentation
999
+ # https://platform.openai.com/docs/api-reference/completions/create
1000
+ model: str | None = None
1001
+ prompt: list[int] | list[list[int]] | str | list[str] | None = None
1002
+ echo: bool | None = False
1003
+ frequency_penalty: float | None = 0.0
1004
+ logit_bias: dict[str, float] | None = None
1005
+ logprobs: int | None = None
1006
+ max_tokens: int | None = 16
1007
+ n: int = 1
1008
+ presence_penalty: float | None = 0.0
1009
+ seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
1010
+ stop: str | list[str] | None = []
1011
+ stream: bool | None = False
1012
+ stream_options: StreamOptions | None = None
1013
+ suffix: str | None = None
1014
+ temperature: float | None = None
1015
+ top_p: float | None = None
1016
+ user: str | None = None
1017
+
1018
+ # --8<-- [start:completion-sampling-params]
1019
+ use_beam_search: bool = False
1020
+ top_k: int | None = None
1021
+ min_p: float | None = None
1022
+ repetition_penalty: float | None = None
1023
+ length_penalty: float = 1.0
1024
+ stop_token_ids: list[int] | None = []
1025
+ include_stop_str_in_output: bool = False
1026
+ ignore_eos: bool = False
1027
+ min_tokens: int = 0
1028
+ skip_special_tokens: bool = True
1029
+ spaces_between_special_tokens: bool = True
1030
+ truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
1031
+ allowed_token_ids: list[int] | None = None
1032
+ prompt_logprobs: int | None = None
1033
+ # --8<-- [end:completion-sampling-params]
1034
+
1035
+ # --8<-- [start:completion-extra-params]
1036
+ prompt_embeds: bytes | list[bytes] | None = None
1037
+ add_special_tokens: bool = Field(
1038
+ default=True,
1039
+ description=(
1040
+ "If true (the default), special tokens (e.g. BOS) will be added to "
1041
+ "the prompt."
1042
+ ),
1043
+ )
1044
+ response_format: AnyResponseFormat | None = Field(
1045
+ default=None,
1046
+ description=(
1047
+ "Similar to chat completion, this parameter specifies the format "
1048
+ "of output. Only {'type': 'json_object'}, {'type': 'json_schema'}"
1049
+ ", {'type': 'structural_tag'}, or {'type': 'text' } is supported."
1050
+ ),
1051
+ )
1052
+ structured_outputs: StructuredOutputsParams | None = Field(
1053
+ default=None,
1054
+ description="Additional kwargs for structured outputs",
1055
+ )
1056
+ priority: int = Field(
1057
+ default=0,
1058
+ description=(
1059
+ "The priority of the request (lower means earlier handling; "
1060
+ "default: 0). Any priority other than 0 will raise an error "
1061
+ "if the served model does not use priority scheduling."
1062
+ ),
1063
+ )
1064
+ request_id: str = Field(
1065
+ default_factory=random_uuid,
1066
+ description=(
1067
+ "The request_id related to this request. If the caller does "
1068
+ "not set it, a random_uuid will be generated. This id is used "
1069
+ "through out the inference process and return in response."
1070
+ ),
1071
+ )
1072
+ logits_processors: LogitsProcessors | None = Field(
1073
+ default=None,
1074
+ description=(
1075
+ "A list of either qualified names of logits processors, or "
1076
+ "constructor objects, to apply when sampling. A constructor is "
1077
+ "a JSON object with a required 'qualname' field specifying the "
1078
+ "qualified name of the processor class/factory, and optional "
1079
+ "'args' and 'kwargs' fields containing positional and keyword "
1080
+ "arguments. For example: {'qualname': "
1081
+ "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
1082
+ "{'param': 'value'}}."
1083
+ ),
1084
+ )
1085
+
1086
+ return_tokens_as_token_ids: bool | None = Field(
1087
+ default=None,
1088
+ description=(
1089
+ "If specified with 'logprobs', tokens are represented "
1090
+ " as strings of the form 'token_id:{token_id}' so that tokens "
1091
+ "that are not JSON-encodable can be identified."
1092
+ ),
1093
+ )
1094
+ return_token_ids: bool | None = Field(
1095
+ default=None,
1096
+ description=(
1097
+ "If specified, the result will include token IDs alongside the "
1098
+ "generated text. In streaming mode, prompt_token_ids is included "
1099
+ "only in the first chunk, and token_ids contains the delta tokens "
1100
+ "for each chunk. This is useful for debugging or when you "
1101
+ "need to map generated text back to input tokens."
1102
+ ),
1103
+ )
1104
+
1105
+ cache_salt: str | None = Field(
1106
+ default=None,
1107
+ description=(
1108
+ "If specified, the prefix cache will be salted with the provided "
1109
+ "string to prevent an attacker to guess prompts in multi-user "
1110
+ "environments. The salt should be random, protected from "
1111
+ "access by 3rd parties, and long enough to be "
1112
+ "unpredictable (e.g., 43 characters base64-encoded, corresponding "
1113
+ "to 256 bit)."
1114
+ ),
1115
+ )
+
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None,
+        description="KVTransfer parameters used for disaggregated serving.",
+    )
+
+    vllm_xargs: dict[str, str | int | float] | None = Field(
+        default=None,
+        description=(
+            "Additional request parameters with string or "
+            "numeric values, used by custom extensions."
+        ),
+    )
+
+    # --8<-- [end:completion-extra-params]
+
+    # Default sampling parameters for completion requests
+    _DEFAULT_SAMPLING_PARAMS: dict = {
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": 0,
+        "min_p": 0.0,
+    }
+
+    def to_beam_search_params(
+        self,
+        max_tokens: int,
+        default_sampling_params: dict | None = None,
+    ) -> BeamSearchParams:
+        if default_sampling_params is None:
+            default_sampling_params = {}
+        n = self.n if self.n is not None else 1
+
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get("temperature", 1.0)
+
+        return BeamSearchParams(
+            beam_width=n,
+            max_tokens=max_tokens,
+            ignore_eos=self.ignore_eos,
+            temperature=temperature,
+            length_penalty=self.length_penalty,
+            include_stop_str_in_output=self.include_stop_str_in_output,
+        )
+
+    def to_sampling_params(
+        self,
+        max_tokens: int,
+        logits_processor_pattern: str | None,
+        default_sampling_params: dict | None = None,
+    ) -> SamplingParams:
+        if default_sampling_params is None:
+            default_sampling_params = {}
+
+        # Default parameters
+        if (repetition_penalty := self.repetition_penalty) is None:
+            repetition_penalty = default_sampling_params.get(
+                "repetition_penalty",
+                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+            )
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+        if (top_p := self.top_p) is None:
+            top_p = default_sampling_params.get(
+                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
+            )
+        if (top_k := self.top_k) is None:
+            top_k = default_sampling_params.get(
+                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
+            )
+        if (min_p := self.min_p) is None:
+            min_p = default_sampling_params.get(
+                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
+            )
+
+        prompt_logprobs = self.prompt_logprobs
+        if prompt_logprobs is None and self.echo:
+            prompt_logprobs = self.logprobs
+
+        echo_without_generation = self.echo and self.max_tokens == 0
+
+        response_format = self.response_format
+        if response_format is not None:
+            # If structured outputs wasn't already enabled,
+            # we must enable it for these features to work
+            if self.structured_outputs is None:
+                self.structured_outputs = StructuredOutputsParams()
+
+            # Set structured output params for response format
+            if response_format.type == "json_object":
+                self.structured_outputs.json_object = True
+            elif response_format.type == "json_schema":
+                json_schema = response_format.json_schema
+                assert json_schema is not None
+                self.structured_outputs.json = json_schema.json_schema
+            elif response_format.type == "structural_tag":
+                structural_tag = response_format
+                assert structural_tag is not None and isinstance(
+                    structural_tag,
+                    (
+                        LegacyStructuralTagResponseFormat,
+                        StructuralTagResponseFormat,
+                    ),
+                )
+                s_tag_obj = structural_tag.model_dump(by_alias=True)
+                self.structured_outputs.structural_tag = json.dumps(s_tag_obj)
+
+        extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
+        if self.kv_transfer_params:
+            # Pass in kv_transfer_params via extra_args
+            extra_args["kv_transfer_params"] = self.kv_transfer_params
+        return SamplingParams.from_optional(
+            n=self.n,
+            presence_penalty=self.presence_penalty,
+            frequency_penalty=self.frequency_penalty,
+            repetition_penalty=repetition_penalty,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            seed=self.seed,
+            stop=self.stop,
+            stop_token_ids=self.stop_token_ids,
+            logprobs=self.logprobs,
+            ignore_eos=self.ignore_eos,
+            max_tokens=max_tokens if not echo_without_generation else 1,
+            min_tokens=self.min_tokens,
+            prompt_logprobs=prompt_logprobs,
+            skip_special_tokens=self.skip_special_tokens,
+            spaces_between_special_tokens=self.spaces_between_special_tokens,
+            include_stop_str_in_output=self.include_stop_str_in_output,
+            logits_processors=get_logits_processors(
+                self.logits_processors, logits_processor_pattern
+            ),
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            output_kind=RequestOutputKind.DELTA
+            if self.stream
+            else RequestOutputKind.FINAL_ONLY,
+            structured_outputs=self.structured_outputs,
+            logit_bias=self.logit_bias,
+            allowed_token_ids=self.allowed_token_ids,
+            extra_args=extra_args or None,
+        )
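+
+    # The precedence applied above for every sampling field, sketched with
+    # hypothetical names (none of these helpers exist in this module): the
+    # request value wins, then the server-supplied default_sampling_params,
+    # then the class-level _DEFAULT_SAMPLING_PARAMS fallback.
+    #
+    #     def _resolve(request_value, server_defaults, key, class_defaults):
+    #         if request_value is not None:
+    #             return request_value
+    #         return server_defaults.get(key, class_defaults[key])
+    #
+    #     _resolve(None, {"top_p": 0.9}, "top_p", {"top_p": 1.0})  # -> 0.9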
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_structured_outputs_count(cls, data):
+        if data.get("structured_outputs", None) is None:
+            return data
+
+        structured_outputs_kwargs = data["structured_outputs"]
+        count = sum(
+            structured_outputs_kwargs.get(k) is not None
+            for k in ("json", "regex", "choice")
+        )
+        if count > 1:
+            raise ValueError(
+                "You can only use one kind of constraint for structured "
+                "outputs ('json', 'regex' or 'choice')."
+            )
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_logprobs(cls, data):
+        if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
+            if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
+                raise ValueError(
+                    "`prompt_logprobs` are not available when `stream=True`."
+                )
+
+            if prompt_logprobs < 0 and prompt_logprobs != -1:
+                raise ValueError("`prompt_logprobs` must be a non-negative value or -1.")
+        if (logprobs := data.get("logprobs")) is not None and logprobs < 0:
+            raise ValueError("`logprobs` must be a non-negative value.")
+
+        return data
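+
+    # In short, the rules enforced above: `logprobs` must be >= 0;
+    # `prompt_logprobs` must be >= 0 or exactly -1 (meaning all prompt
+    # positions); and prompt logprobs cannot be combined with `stream=True`.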
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_stream_options(cls, data):
+        if data.get("stream_options") and not data.get("stream"):
+            raise ValueError("Stream options can only be defined when `stream=True`.")
+
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_prompt_and_prompt_embeds(cls, data):
+        prompt = data.get("prompt")
+        prompt_embeds = data.get("prompt_embeds")
+
+        prompt_is_empty = prompt is None or (isinstance(prompt, str) and prompt == "")
+        embeds_is_empty = prompt_embeds is None or (
+            isinstance(prompt_embeds, list) and len(prompt_embeds) == 0
+        )
+
+        if prompt_is_empty and embeds_is_empty:
+            raise ValueError(
+                "Either prompt or prompt_embeds must be provided and non-empty."
+            )
+
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_cache_salt_support(cls, data):
+        if data.get("cache_salt") is not None and (
+            not isinstance(data["cache_salt"], str) or not data["cache_salt"]
+        ):
+            raise ValueError(
+                "Parameter 'cache_salt' must be a non-empty string if provided."
+            )
+        return data
+
+
+class CompletionLogProbs(OpenAIBaseModel):
+    text_offset: list[int] = Field(default_factory=list)
+    token_logprobs: list[float | None] = Field(default_factory=list)
+    tokens: list[str] = Field(default_factory=list)
+    top_logprobs: list[dict[str, float] | None] = Field(default_factory=list)
+
+
+class CompletionResponseChoice(OpenAIBaseModel):
+    index: int
+    text: str
+    logprobs: CompletionLogProbs | None = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = Field(
+        default=None,
+        description=(
+            "The stop string or token id that caused the completion "
+            "to stop, None if the completion finished for some other reason "
+            "including encountering the EOS token"
+        ),
+    )
+    token_ids: list[int] | None = None  # For response
+    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
+    prompt_token_ids: list[int] | None = None  # For prompt
+
+
+class CompletionResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+    object: Literal["text_completion"] = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[CompletionResponseChoice]
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
+    system_fingerprint: str | None = None
+    usage: UsageInfo
+
+    # vLLM-specific fields that are not in OpenAI spec
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None, description="KVTransfer parameters."
+    )
+
+
+class CompletionResponseStreamChoice(OpenAIBaseModel):
+    index: int
+    text: str
+    logprobs: CompletionLogProbs | None = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = Field(
+        default=None,
+        description=(
+            "The stop string or token id that caused the completion "
+            "to stop, None if the completion finished for some other reason "
+            "including encountering the EOS token"
+        ),
+    )
+    # not part of the OpenAI spec but for tracing the tokens
+    # prompt tokens are put into the choice to align with CompletionResponseChoice
+    prompt_token_ids: list[int] | None = None
+    token_ids: list[int] | None = None
+
+
+class CompletionStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[CompletionResponseStreamChoice]
+    usage: UsageInfo | None = Field(default=None)
+
+
+class FunctionCall(OpenAIBaseModel):
+    name: str
+    arguments: str
+
+
+class ToolCall(OpenAIBaseModel):
+    id: str = Field(default_factory=make_tool_call_id)
+    type: Literal["function"] = "function"
+    function: FunctionCall
+
+
+class DeltaFunctionCall(BaseModel):
+    name: str | None = None
+    arguments: str | None = None
+
+
+# a tool call delta where everything is optional
+class DeltaToolCall(OpenAIBaseModel):
+    id: str | None = None
+    type: Literal["function"] | None = None
+    index: int
+    function: DeltaFunctionCall | None = None
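+
+
+# A sketch of how a client might fold streamed DeltaToolCall chunks into
+# complete calls; this helper is illustrative and is not used elsewhere in
+# this module. `index` identifies the call across chunks, and `arguments`
+# fragments are concatenated in arrival order.
+def _accumulate_tool_calls(deltas: list[DeltaToolCall]) -> dict[int, dict]:
+    calls: dict[int, dict] = {}
+    for delta in deltas:
+        call = calls.setdefault(
+            delta.index, {"id": None, "name": None, "arguments": ""}
+        )
+        if delta.id is not None:
+            call["id"] = delta.id
+        if delta.function is not None:
+            if delta.function.name is not None:
+                call["name"] = delta.function.name
+            if delta.function.arguments is not None:
+                call["arguments"] += delta.function.arguments
+    return calls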
+
+
+class ExtractedToolCallInformation(BaseModel):
+    # indicate if tools were called
+    tools_called: bool
+
+    # extracted tool calls
+    tool_calls: list[ToolCall]
+
+    # content - per the OpenAI spec, content AND tool calls are rarely
+    # returned together, but some models will do this intentionally
+    content: str | None = None
+
+
+class ChatMessage(OpenAIBaseModel):
+    role: str
+    content: str | None = None
+    refusal: str | None = None
+    annotations: OpenAIAnnotation | None = None
+    audio: OpenAIChatCompletionAudio | None = None
+    function_call: FunctionCall | None = None
+    tool_calls: list[ToolCall] = Field(default_factory=list)
+
+    # vLLM-specific fields that are not in OpenAI spec
+    reasoning: str | None = None
+    reasoning_content: str | None = None
+    """Deprecated: use `reasoning` instead."""
+
+    @model_validator(mode="after")
+    def handle_deprecated_reasoning_content(self):
+        """Copy reasoning to reasoning_content for backward compatibility."""
+        self.reasoning_content = self.reasoning
+        return self
+
+
+class ChatCompletionLogProb(OpenAIBaseModel):
+    token: str
+    logprob: float = -9999.0
+    bytes: list[int] | None = None
+
+
+class ChatCompletionLogProbsContent(ChatCompletionLogProb):
+    # Workaround: redefine the field names cache so that it's not
+    # shared with the superclass.
+    field_names: ClassVar[set[str] | None] = None
+    top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)
+
+
+class ChatCompletionLogProbs(OpenAIBaseModel):
+    content: list[ChatCompletionLogProbsContent] | None = None
+
+
+class ChatCompletionResponseChoice(OpenAIBaseModel):
+    index: int
+    message: ChatMessage
+    logprobs: ChatCompletionLogProbs | None = None
+    # per OpenAI spec this is the default
+    finish_reason: str | None = "stop"
+    # not part of the OpenAI spec but included in vLLM for legacy reasons
+    stop_reason: int | str | None = None
+    # not part of the OpenAI spec but is useful for tracing the tokens
+    # in agent scenarios
+    token_ids: list[int] | None = None
+
+
+class ChatCompletionResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: Literal["chat.completion"] = "chat.completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[ChatCompletionResponseChoice]
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
+    system_fingerprint: str | None = None
+    usage: UsageInfo
+
+    # vLLM-specific fields that are not in OpenAI spec
+    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
+    prompt_token_ids: list[int] | None = None
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None, description="KVTransfer parameters."
+    )
+
+
+class DeltaMessage(OpenAIBaseModel):
+    role: str | None = None
+    content: str | None = None
+    reasoning: str | None = None
+    reasoning_content: str | None = None
+    """Deprecated: use `reasoning` instead."""
+    tool_calls: list[DeltaToolCall] = Field(default_factory=list)
+
+    @model_validator(mode="after")
+    def handle_deprecated_reasoning_content(self):
+        """Copy reasoning to reasoning_content for backward compatibility."""
+        self.reasoning_content = self.reasoning
+        return self
+
+
+class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
+    index: int
+    delta: DeltaMessage
+    logprobs: ChatCompletionLogProbs | None = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
+    # not part of the OpenAI spec but for tracing the tokens
+    token_ids: list[int] | None = None
+
+
+class ChatCompletionStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[ChatCompletionResponseStreamChoice]
+    usage: UsageInfo | None = Field(default=None)
+    # not part of the OpenAI spec but for tracing the tokens
+    prompt_token_ids: list[int] | None = None
+
+
+class TranscriptionResponseStreamChoice(OpenAIBaseModel):
+    delta: DeltaMessage
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
+
+
+class TranscriptionStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"trsc-{random_uuid()}")
+    object: Literal["transcription.chunk"] = "transcription.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[TranscriptionResponseStreamChoice]
+    usage: UsageInfo | None = Field(default=None)
+
+
+class InputTokensDetails(OpenAIBaseModel):
+    cached_tokens: int
+    input_tokens_per_turn: list[int] = Field(default_factory=list)
+    cached_tokens_per_turn: list[int] = Field(default_factory=list)
+
+
+class OutputTokensDetails(OpenAIBaseModel):
+    reasoning_tokens: int = 0
+    tool_output_tokens: int = 0
+    output_tokens_per_turn: list[int] = Field(default_factory=list)
+    tool_output_tokens_per_turn: list[int] = Field(default_factory=list)
+
+
+class ResponseUsage(OpenAIBaseModel):
+    input_tokens: int
+    input_tokens_details: InputTokensDetails
+    output_tokens: int
+    output_tokens_details: OutputTokensDetails
+    total_tokens: int
+
+
+def serialize_message(msg):
+    """
+    Serializes a single message
+    """
+    if isinstance(msg, dict):
+        return msg
+    elif hasattr(msg, "to_dict"):
+        return msg.to_dict()
+    else:
+        # fall back to pydantic dump
+        return msg.model_dump_json()
+
+
+def serialize_messages(msgs):
+    """
+    Serializes multiple messages
+    """
+    return [serialize_message(msg) for msg in msgs] if msgs else None
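+
+
+# The fallback order above, in brief: plain dicts pass through unchanged,
+# harmony-style messages use their to_dict() method, and anything else is
+# assumed to be a pydantic model. Note that the last branch returns a JSON
+# *string* (model_dump_json), not a dict like the other two branches.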
+
+
+class ResponsesResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
+    created_at: int = Field(default_factory=lambda: int(time.time()))
+    # error: Optional[ResponseError] = None
+    incomplete_details: IncompleteDetails | None = None
+    instructions: str | None = None
+    metadata: Metadata | None = None
+    model: str
+    object: Literal["response"] = "response"
+    output: list[ResponseOutputItem]
+    parallel_tool_calls: bool
+    temperature: float
+    tool_choice: ToolChoice
+    tools: list[Tool]
+    top_p: float
+    background: bool
+    max_output_tokens: int
+    max_tool_calls: int | None = None
+    previous_response_id: str | None = None
+    prompt: ResponsePrompt | None = None
+    reasoning: Reasoning | None = None
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"]
+    status: ResponseStatus
+    text: ResponseTextConfig | None = None
+    top_logprobs: int | None = None
+    truncation: Literal["auto", "disabled"]
+    usage: ResponseUsage | None = None
+    user: str | None = None
+
+    # --8<-- [start:responses-extra-params]
+    # These are populated when enable_response_messages is set to True
+    # NOTE: custom serialization is needed,
+    # see serialize_input_messages and serialize_output_messages
+    input_messages: list[ChatCompletionMessageParam] | None = None
+    output_messages: list[ChatCompletionMessageParam] | None = None
+    # --8<-- [end:responses-extra-params]
+
+    # NOTE: OpenAI harmony doesn't serialize TextContent properly.
+    # TODO: this fixes TextContent, but tools etc. still need verification.
+    # https://github.com/openai/harmony/issues/78
+    @field_serializer("output_messages", when_used="json")
+    def serialize_output_messages(self, msgs, _info):
+        return serialize_messages(msgs)
+
+    # NOTE: OpenAI harmony doesn't serialize TextContent properly, this fixes it
+    # https://github.com/openai/harmony/issues/78
+    @field_serializer("input_messages", when_used="json")
+    def serialize_input_messages(self, msgs, _info):
+        return serialize_messages(msgs)
+
+    @classmethod
+    def from_request(
+        cls,
+        request: ResponsesRequest,
+        sampling_params: SamplingParams,
+        model_name: str,
+        created_time: int,
+        output: list[ResponseOutputItem],
+        status: ResponseStatus,
+        usage: ResponseUsage | None = None,
+        input_messages: list[ChatCompletionMessageParam] | None = None,
+        output_messages: list[ChatCompletionMessageParam] | None = None,
+    ) -> "ResponsesResponse":
+        incomplete_details: IncompleteDetails | None = None
+        if status == "incomplete":
+            incomplete_details = IncompleteDetails(reason="max_output_tokens")
+        # TODO: implement the other reason for incomplete_details,
+        # which is content_filter
+        # incomplete_details = IncompleteDetails(reason='content_filter')
+        return cls(
+            id=request.request_id,
+            created_at=created_time,
+            incomplete_details=incomplete_details,
+            instructions=request.instructions,
+            metadata=request.metadata,
+            model=model_name,
+            output=output,
+            input_messages=input_messages,
+            output_messages=output_messages,
+            parallel_tool_calls=request.parallel_tool_calls,
+            temperature=sampling_params.temperature,
+            tool_choice=request.tool_choice,
+            tools=request.tools,
+            top_p=sampling_params.top_p,
+            background=request.background,
+            max_output_tokens=sampling_params.max_tokens,
+            max_tool_calls=request.max_tool_calls,
+            previous_response_id=request.previous_response_id,
+            prompt=request.prompt,
+            reasoning=request.reasoning,
+            service_tier=request.service_tier,
+            status=status,
+            text=request.text,
+            top_logprobs=sampling_params.logprobs,
+            truncation=request.truncation,
+            user=request.user,
+            usage=usage,
+        )
+
+
+# TODO: this code can be removed once
+# https://github.com/openai/openai-python/issues/2634 has been resolved
+class ResponseReasoningPartDoneEvent(OpenAIBaseModel):
+    content_index: int
+    """The index of the content part that is done."""
+
+    item_id: str
+    """The ID of the output item that the content part was added to."""
+
+    output_index: int
+    """The index of the output item that the content part was added to."""
+
+    part: ResponseReasoningTextContent
+    """The content part that is done."""
+
+    sequence_number: int
+    """The sequence number of this event."""
+
+    type: Literal["response.reasoning_part.done"]
+    """The type of the event. Always `response.reasoning_part.done`."""
+
+
+# TODO: this code can be removed once
+# https://github.com/openai/openai-python/issues/2634 has been resolved
+class ResponseReasoningPartAddedEvent(OpenAIBaseModel):
+    content_index: int
+    """The index of the content part that was added."""
+
+    item_id: str
+    """The ID of the output item that the content part was added to."""
+
+    output_index: int
+    """The index of the output item that the content part was added to."""
+
+    part: ResponseReasoningTextContent
+    """The content part that was added."""
+
+    sequence_number: int
+    """The sequence number of this event."""
+
+    type: Literal["response.reasoning_part.added"]
+    """The type of the event. Always `response.reasoning_part.added`."""
+
+
+# vLLM Streaming Events
+# Note: we override the response type with the vLLM ResponsesResponse type
+class ResponseCompletedEvent(OpenAIResponseCompletedEvent):
+    response: ResponsesResponse  # type: ignore[override]
+
+
+class ResponseCreatedEvent(OpenAIResponseCreatedEvent):
+    response: ResponsesResponse  # type: ignore[override]
+
+
+class ResponseInProgressEvent(OpenAIResponseInProgressEvent):
+    response: ResponsesResponse  # type: ignore[override]
+
+
+StreamingResponsesResponse: TypeAlias = (
+    ResponseCreatedEvent
+    | ResponseInProgressEvent
+    | ResponseCompletedEvent
+    | ResponseOutputItemAddedEvent
+    | ResponseOutputItemDoneEvent
+    | ResponseContentPartAddedEvent
+    | ResponseContentPartDoneEvent
+    | ResponseReasoningTextDeltaEvent
+    | ResponseReasoningTextDoneEvent
+    | ResponseReasoningPartAddedEvent
+    | ResponseReasoningPartDoneEvent
+    | ResponseCodeInterpreterCallInProgressEvent
+    | ResponseCodeInterpreterCallCodeDeltaEvent
+    | ResponseWebSearchCallInProgressEvent
+    | ResponseWebSearchCallSearchingEvent
+    | ResponseWebSearchCallCompletedEvent
+    | ResponseCodeInterpreterCallCodeDoneEvent
+    | ResponseCodeInterpreterCallInterpretingEvent
+    | ResponseCodeInterpreterCallCompletedEvent
+)
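+
+
+# A sketch of consuming this union; the handler below is illustrative and not
+# part of this module. Clients typically dispatch on the concrete event class
+# (or its `type` discriminator); the upstream OpenAI event types are assumed
+# to expose the fields used here.
+def _handle_event(event: StreamingResponsesResponse) -> None:
+    if isinstance(event, ResponseCompletedEvent):
+        print("usage:", event.response.usage)  # final vLLM ResponsesResponse
+    elif isinstance(event, ResponseReasoningTextDeltaEvent):
+        print(event.delta, end="")  # incremental reasoning text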
+
+
+class TokenizeCompletionRequest(OpenAIBaseModel):
+    model: str | None = None
+    prompt: str
+
+    add_special_tokens: bool = Field(
+        default=True,
+        description=(
+            "If true (the default), special tokens (e.g. BOS) will be added to "
+            "the prompt."
+        ),
+    )
+    return_token_strs: bool | None = Field(
+        default=False,
+        description=(
+            "If true, also return the token strings corresponding to the token ids."
+        ),
+    )
+
+
+class TokenizeChatRequest(OpenAIBaseModel):
+    model: str | None = None
+    messages: list[ChatCompletionMessageParam]
+
+    add_generation_prompt: bool = Field(
+        default=True,
+        description=(
+            "If true, the generation prompt will be added to the chat template. "
+            "This is a parameter used by the chat template in the tokenizer "
+            "config of the model."
+        ),
+    )
+    return_token_strs: bool | None = Field(
+        default=False,
+        description=(
+            "If true, also return the token strings corresponding to the token ids."
+        ),
+    )
+    continue_final_message: bool = Field(
+        default=False,
+        description=(
+            "If this is set, the chat will be formatted so that the final "
+            "message in the chat is open-ended, without any EOS tokens. The "
+            "model will continue this message rather than starting a new one. "
+            'This allows you to "prefill" part of the model\'s response for it. '
+            "Cannot be used at the same time as `add_generation_prompt`."
+        ),
+    )
+    add_special_tokens: bool = Field(
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens so this should be set to false (as is the "
+            "default)."
+        ),
+    )
+    chat_template: str | None = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "As of transformers v4.44, the default chat template is no longer "
+            "allowed, so you must provide a chat template if the tokenizer "
+            "does not define one."
+        ),
+    )
+    chat_template_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=(
+            "Additional keyword args to pass to the template renderer. "
+            "Will be accessible by the chat template."
+        ),
+    )
+    mm_processor_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=("Additional kwargs to pass to the HF processor."),
+    )
+    tools: list[ChatCompletionToolsParam] | None = Field(
+        default=None,
+        description=("A list of tools the model may call."),
+    )
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_generation_prompt(cls, data):
+        if data.get("continue_final_message") and data.get("add_generation_prompt"):
+            raise ValueError(
+                "Cannot set both `continue_final_message` and "
+                "`add_generation_prompt` to True."
+            )
+        return data
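+
+    # A request body that exercises the validator above (values illustrative):
+    # prefilling part of the assistant reply requires disabling the generation
+    # prompt and marking the final message as open-ended.
+    #
+    #     {
+    #         "messages": [
+    #             {"role": "user", "content": "Write a haiku."},
+    #             {"role": "assistant", "content": "Autumn moonlight -"}
+    #         ],
+    #         "add_generation_prompt": false,
+    #         "continue_final_message": true
+    #     }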
+
+
+TokenizeRequest: TypeAlias = TokenizeCompletionRequest | TokenizeChatRequest
+
+
+class TokenizeResponse(OpenAIBaseModel):
+    count: int
+    max_model_len: int
+    tokens: list[int]
+    token_strs: list[str] | None = None
+
+
+class DetokenizeRequest(OpenAIBaseModel):
+    model: str | None = None
+    tokens: list[int]
+
+
+class DetokenizeResponse(OpenAIBaseModel):
+    prompt: str
+
+
+class TokenizerInfoResponse(OpenAIBaseModel):
+    """
+    Response containing the tokenizer configuration,
+    equivalent to tokenizer_config.json.
+    """
+
+    model_config = ConfigDict(extra="allow")
+    tokenizer_class: str
+
+
+class LoadLoRAAdapterRequest(BaseModel):
+    lora_name: str
+    lora_path: str
+
+
+class UnloadLoRAAdapterRequest(BaseModel):
+    lora_name: str
+    lora_int_id: int | None = Field(default=None)
+
+
+## Protocols for Audio
+AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt"]
+
+
+class TranscriptionRequest(OpenAIBaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/audio/createTranscription
+
+    file: UploadFile
+    """
+    The audio file object (not file name) to transcribe, in one of these
+    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+    """
+
+    model: str | None = None
+    """ID of the model to use."""
+
+    language: str | None = None
+    """The language of the input audio.
+
+    Supplying the input language in
+    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
+    will improve accuracy and latency.
+    """
+
+    prompt: str = Field(default="")
+    """An optional text to guide the model's style or continue a previous audio
+    segment.
+
+    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+    should match the audio language.
+    """
+
+    response_format: AudioResponseFormat = Field(default="json")
+    """
+    The format of the output, in one of these options: `json`, `text`, `srt`,
+    `verbose_json`, or `vtt`.
+    """
+
+    ## TODO (varun): Support if set to 0, certain thresholds are met !!
+
+    timestamp_granularities: list[Literal["word", "segment"]] = Field(
+        alias="timestamp_granularities[]", default=[]
+    )
+    """The timestamp granularities to populate for this transcription.
+
+    `response_format` must be set to `verbose_json` to use timestamp
+    granularities. Either or both of these options are supported: `word`, or
+    `segment`. Note: There is no additional latency for segment timestamps,
+    but generating word timestamps incurs additional latency.
+    """
+
+    stream: bool | None = False
+    """When set, it will enable output to be streamed in a similar fashion
+    to the Chat Completions endpoint.
+    """
+    # --8<-- [start:transcription-extra-params]
+    # Flattened stream option to simplify form data.
+    stream_include_usage: bool | None = False
+    stream_continuous_usage_stats: bool | None = False
+
+    vllm_xargs: dict[str, str | int | float] | None = Field(
+        default=None,
+        description=(
+            "Additional request parameters with string or "
+            "numeric values, used by custom extensions."
+        ),
+    )
+    # --8<-- [end:transcription-extra-params]
+
+    to_language: str | None = None
+    """The target language for the transcription output.
+
+    Note that this is not currently used by the supported models; it is a
+    placeholder for future use, matching the translation API.
+    """
+
+    # --8<-- [start:transcription-sampling-params]
+    temperature: float = Field(default=0.0)
+    """The sampling temperature, between 0 and 1.
+
+    Higher values like 0.8 will make the output more random, while lower values
+    like 0.2 will make it more focused / deterministic. If set to 0, the model
+    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
+    to automatically increase the temperature until certain thresholds are hit.
+    """
+
+    top_p: float | None = None
+    """Enables nucleus (top-p) sampling, where tokens are selected from the
+    smallest possible set whose cumulative probability exceeds `p`.
+    """
+
+    top_k: int | None = None
+    """Limits sampling to the `k` most probable tokens at each step."""
+
+    min_p: float | None = None
+    """Filters out tokens with a probability lower than `min_p`, ensuring a
+    minimum likelihood threshold during sampling.
+    """
+
+    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    """The seed to use for sampling."""
+
+    frequency_penalty: float | None = 0.0
+    """The frequency penalty to use for sampling."""
+
+    repetition_penalty: float | None = None
+    """The repetition penalty to use for sampling."""
+
+    presence_penalty: float | None = 0.0
+    """The presence penalty to use for sampling."""
+    # --8<-- [end:transcription-sampling-params]
+
+    # Default sampling parameters for transcription requests.
+    _DEFAULT_SAMPLING_PARAMS: dict = {
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": 0,
+        "min_p": 0.0,
+    }
+
+    def to_sampling_params(
+        self, default_max_tokens: int, default_sampling_params: dict | None = None
+    ) -> SamplingParams:
+        max_tokens = default_max_tokens
+
+        if default_sampling_params is None:
+            default_sampling_params = {}
+
+        # Default parameters
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+        if (top_p := self.top_p) is None:
+            top_p = default_sampling_params.get(
+                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
+            )
+        if (top_k := self.top_k) is None:
+            top_k = default_sampling_params.get(
+                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
+            )
+        if (min_p := self.min_p) is None:
+            min_p = default_sampling_params.get(
+                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
+            )
+
+        if (repetition_penalty := self.repetition_penalty) is None:
+            repetition_penalty = default_sampling_params.get(
+                "repetition_penalty",
+                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+            )
+
+        return SamplingParams.from_optional(
+            temperature=temperature,
+            max_tokens=max_tokens,
+            seed=self.seed,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            frequency_penalty=self.frequency_penalty,
+            repetition_penalty=repetition_penalty,
+            presence_penalty=self.presence_penalty,
+            output_kind=RequestOutputKind.DELTA
+            if self.stream
+            else RequestOutputKind.FINAL_ONLY,
+            extra_args=self.vllm_xargs,
+        )
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_transcription_request(cls, data):
+        if isinstance(data.get("file"), str):
+            raise HTTPException(
+                status_code=HTTPStatus.UNPROCESSABLE_ENTITY,
+                detail="Expected 'file' to be a file-like object, not 'str'.",
+            )
+
+        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
+        stream = data.get("stream", False)
+        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
+            raise ValueError("Stream options can only be defined when `stream=True`.")
+
+        return data
+
+
+# Transcription response objects
+class TranscriptionUsageAudio(OpenAIBaseModel):
+    type: Literal["duration"] = "duration"
+    seconds: int
+
+
+class TranscriptionResponse(OpenAIBaseModel):
+    text: str
+    """The transcribed text."""
+    usage: TranscriptionUsageAudio
+
+
+class TranscriptionWord(OpenAIBaseModel):
+    end: float
+    """End time of the word in seconds."""
+
+    start: float
+    """Start time of the word in seconds."""
+
+    word: str
+    """The text content of the word."""
+
+
+class TranscriptionSegment(OpenAIBaseModel):
+    id: int
+    """Unique identifier of the segment."""
+
+    avg_logprob: float | None = None
+    """Average logprob of the segment.
+
+    If the value is lower than -1, consider the logprobs failed.
+    """
+
+    compression_ratio: float | None = None
+    """Compression ratio of the segment.
+
+    If the value is greater than 2.4, consider the compression failed.
+    """
+
+    end: float
+    """End time of the segment in seconds."""
+
+    no_speech_prob: float | None = None
+    """Probability of no speech in the segment.
+
+    If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
+    this segment silent.
+    """
+
+    seek: int
+    """Seek offset of the segment."""
+
+    start: float
+    """Start time of the segment in seconds."""
+
+    temperature: float
+    """Temperature parameter used for generating the segment."""
+
+    text: str
+    """Text content of the segment."""
+
+    tokens: list[int]
+    """Array of token IDs for the text content."""
+
+
+class TranscriptionResponseVerbose(OpenAIBaseModel):
+    duration: str
+    """The duration of the input audio."""
+
+    language: str
+    """The language of the input audio."""
+
+    text: str
+    """The transcribed text."""
+
+    segments: list[TranscriptionSegment] | None = None
+    """Segments of the transcribed text and their corresponding details."""
+
+    words: list[TranscriptionWord] | None = None
+    """Extracted words and their corresponding timestamps."""
+
+
+TranscriptionResponseVariant: TypeAlias = (
+    TranscriptionResponse | TranscriptionResponseVerbose
+)
+
+
+class TranslationResponseStreamChoice(OpenAIBaseModel):
+    delta: DeltaMessage
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
+
+
+class TranslationStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"trsl-{random_uuid()}")
+    object: Literal["translation.chunk"] = "translation.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[TranslationResponseStreamChoice]
+    usage: UsageInfo | None = Field(default=None)
+
+
+class TranslationRequest(OpenAIBaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/audio/createTranslation
+
+    file: UploadFile
+    """
+    The audio file object (not file name) to translate, in one of these
+    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+    """
+
+    model: str | None = None
+    """ID of the model to use."""
+
+    prompt: str = Field(default="")
+    """An optional text to guide the model's style or continue a previous audio
+    segment.
+
+    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+    should match the audio language.
+    """
+
+    response_format: AudioResponseFormat = Field(default="json")
+    """
+    The format of the output, in one of these options: `json`, `text`, `srt`,
+    `verbose_json`, or `vtt`.
+    """
+
+    # TODO support additional sampling parameters
+    # --8<-- [start:translation-sampling-params]
+    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    """The seed to use for sampling."""
+
+    temperature: float = Field(default=0.0)
+    """The sampling temperature, between 0 and 1.
+
+    Higher values like 0.8 will make the output more random, while lower values
+    like 0.2 will make it more focused / deterministic. If set to 0, the model
+    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
+    to automatically increase the temperature until certain thresholds are hit.
+    """
+    # --8<-- [end:translation-sampling-params]
+
+    # --8<-- [start:translation-extra-params]
+    language: str | None = None
+    """The language of the input audio we translate from.
+
+    Supplying the input language in
+    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
+    will improve accuracy.
+    """
+
+    to_language: str | None = None
+    """The target language to translate the input audio into.
+
+    Please note that this is not supported by all models; refer to the specific
+    model documentation for more details.
+    For instance, Whisper only supports `to_language=en`.
+    """
+
+    stream: bool | None = False
+    """Custom field not present in the original OpenAI definition. When set,
+    it will enable output to be streamed in a similar fashion to the Chat
+    Completions endpoint.
+    """
+    # Flattened stream option to simplify form data.
+    stream_include_usage: bool | None = False
+    stream_continuous_usage_stats: bool | None = False
+    # --8<-- [end:translation-extra-params]
+
+    # Default sampling parameters for translation requests.
+    _DEFAULT_SAMPLING_PARAMS: dict = {
+        "temperature": 0,
+    }
+
+    def to_sampling_params(
+        self, default_max_tokens: int, default_sampling_params: dict | None = None
+    ) -> SamplingParams:
+        max_tokens = default_max_tokens
+
+        if default_sampling_params is None:
+            default_sampling_params = {}
+        # Default parameters
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+
+        return SamplingParams.from_optional(
+            temperature=temperature,
+            max_tokens=max_tokens,
+            seed=self.seed,
+            output_kind=RequestOutputKind.DELTA
+            if self.stream
+            else RequestOutputKind.FINAL_ONLY,
+        )
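+
+    # Note: unlike TranscriptionRequest above, the class-level fallback here
+    # pins temperature to 0 (greedy decoding). Since `temperature` is declared
+    # as a plain float with default 0.0, the `is None` guard above is only a
+    # defensive check and is not expected to trigger in practice.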
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_stream_options(cls, data):
+        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
+        stream = data.get("stream", False)
+        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
+            raise ValueError("Stream options can only be defined when `stream=True`.")
+
+        return data
+
+
+# Translation response objects
+class TranslationResponse(OpenAIBaseModel):
+    text: str
+    """The translated text."""
+
+
+class TranslationWord(OpenAIBaseModel):
+    end: float
+    """End time of the word in seconds."""
+
+    start: float
+    """Start time of the word in seconds."""
+
+    word: str
+    """The text content of the word."""
+
+
+class TranslationSegment(OpenAIBaseModel):
+    id: int
+    """Unique identifier of the segment."""
+
+    avg_logprob: float | None = None
+    """Average logprob of the segment.
+
+    If the value is lower than -1, consider the logprobs failed.
+    """
+
+    compression_ratio: float | None = None
+    """Compression ratio of the segment.
+
+    If the value is greater than 2.4, consider the compression failed.
+    """
+
+    end: float
+    """End time of the segment in seconds."""
+
+    no_speech_prob: float | None = None
+    """Probability of no speech in the segment.
+
+    If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
+    this segment silent.
+    """
+
+    seek: int
+    """Seek offset of the segment."""
+
+    start: float
+    """Start time of the segment in seconds."""
+
+    temperature: float
+    """Temperature parameter used for generating the segment."""
+
+    text: str
+    """Text content of the segment."""
+
+    tokens: list[int]
+    """Array of token IDs for the text content."""
+
+
+class TranslationResponseVerbose(OpenAIBaseModel):
+    duration: str
+    """The duration of the input audio."""
+
+    language: str
+    """The language of the input audio."""
+
+    text: str
+    """The translated text."""
+
+    segments: list[TranslationSegment] | None = None
+    """Segments of the translated text and their corresponding details."""
+
+    words: list[TranslationWord] | None = None
+    """Extracted words and their corresponding timestamps."""
+
+
+TranslationResponseVariant: TypeAlias = TranslationResponse | TranslationResponseVerbose
+
+
+####### Tokens IN <> Tokens OUT #######
+class GenerateRequest(BaseModel):
+    request_id: str = Field(
+        default_factory=random_uuid,
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "throughout the inference process and returned in the response."
+        ),
+    )
+    token_ids: list[int]
+    """The token ids to generate text from."""
+
+    # features: MultiModalFeatureSpec
+    # TODO (NickLucche): implement once Renderer work is completed
+    features: str | None = None
+    """The processed MM inputs for the model."""
+
+    sampling_params: SamplingParams
+    """The sampling parameters for the model."""
+
+    model: str | None = None
+
+    stream: bool | None = False
+    stream_options: StreamOptions | None = None
+    cache_salt: str | None = Field(
+        default=None,
+        description=(
+            "If specified, the prefix cache will be salted with the provided "
+            "string to prevent an attacker from guessing prompts in multi-user "
+            "environments. The salt should be random, protected from "
+            "access by 3rd parties, and long enough to be "
+            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+            "to 256 bits)."
+        ),
+    )
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."
+        ),
+    )
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None,
+        description="KVTransfer parameters used for disaggregated serving.",
+    )
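+
+    # A minimal tokens-in payload for this schema (field values illustrative;
+    # `sampling_params` is validated against vLLM's SamplingParams):
+    #
+    #     {
+    #         "token_ids": [1, 15043, 3186],
+    #         "sampling_params": {"temperature": 0.0, "max_tokens": 32},
+    #         "stream": false
+    #     }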
+
+
+class GenerateResponseChoice(BaseModel):
+    index: int
+    logprobs: ChatCompletionLogProbs | None = None
+    # per OpenAI spec this is the default
+    finish_reason: str | None = "stop"
+    token_ids: list[int] | None = None
+
+
+class GenerateResponse(BaseModel):
+    request_id: str = Field(
+        default_factory=random_uuid,
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "throughout the inference process and returned in the response."
+        ),
+    )
+    choices: list[GenerateResponseChoice]
+
+    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
+
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None,
+        description="KVTransfer parameters used for disaggregated serving.",
+    )