vllm_cpu_avx512vnni-0.13.0-cp313-cp313-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of vllm-cpu-avx512vnni might be problematic.

Files changed (1641)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +225 -0
  3. vllm/_aiter_ops.py +1260 -0
  4. vllm/_bc_linter.py +54 -0
  5. vllm/_custom_ops.py +3080 -0
  6. vllm/_ipex_ops.py +457 -0
  7. vllm/_version.py +34 -0
  8. vllm/assets/__init__.py +0 -0
  9. vllm/assets/audio.py +43 -0
  10. vllm/assets/base.py +40 -0
  11. vllm/assets/image.py +59 -0
  12. vllm/assets/video.py +149 -0
  13. vllm/attention/__init__.py +0 -0
  14. vllm/attention/backends/__init__.py +0 -0
  15. vllm/attention/backends/abstract.py +443 -0
  16. vllm/attention/backends/registry.py +254 -0
  17. vllm/attention/backends/utils.py +33 -0
  18. vllm/attention/layer.py +969 -0
  19. vllm/attention/layers/__init__.py +0 -0
  20. vllm/attention/layers/chunked_local_attention.py +120 -0
  21. vllm/attention/layers/cross_attention.py +178 -0
  22. vllm/attention/layers/encoder_only_attention.py +103 -0
  23. vllm/attention/layers/mm_encoder_attention.py +284 -0
  24. vllm/attention/ops/__init__.py +0 -0
  25. vllm/attention/ops/chunked_prefill_paged_decode.py +401 -0
  26. vllm/attention/ops/common.py +469 -0
  27. vllm/attention/ops/flashmla.py +251 -0
  28. vllm/attention/ops/merge_attn_states.py +47 -0
  29. vllm/attention/ops/paged_attn.py +51 -0
  30. vllm/attention/ops/pallas_kv_cache_update.py +130 -0
  31. vllm/attention/ops/prefix_prefill.py +814 -0
  32. vllm/attention/ops/rocm_aiter_mla_sparse.py +210 -0
  33. vllm/attention/ops/triton_decode_attention.py +712 -0
  34. vllm/attention/ops/triton_merge_attn_states.py +116 -0
  35. vllm/attention/ops/triton_reshape_and_cache_flash.py +184 -0
  36. vllm/attention/ops/triton_unified_attention.py +1047 -0
  37. vllm/attention/ops/vit_attn_wrappers.py +139 -0
  38. vllm/attention/selector.py +145 -0
  39. vllm/attention/utils/__init__.py +0 -0
  40. vllm/attention/utils/fa_utils.py +118 -0
  41. vllm/attention/utils/kv_sharing_utils.py +33 -0
  42. vllm/attention/utils/kv_transfer_utils.py +60 -0
  43. vllm/beam_search.py +88 -0
  44. vllm/benchmarks/__init__.py +0 -0
  45. vllm/benchmarks/datasets.py +3228 -0
  46. vllm/benchmarks/latency.py +170 -0
  47. vllm/benchmarks/lib/__init__.py +3 -0
  48. vllm/benchmarks/lib/endpoint_request_func.py +777 -0
  49. vllm/benchmarks/lib/ready_checker.py +72 -0
  50. vllm/benchmarks/lib/utils.py +79 -0
  51. vllm/benchmarks/serve.py +1538 -0
  52. vllm/benchmarks/startup.py +326 -0
  53. vllm/benchmarks/sweep/__init__.py +0 -0
  54. vllm/benchmarks/sweep/cli.py +41 -0
  55. vllm/benchmarks/sweep/param_sweep.py +158 -0
  56. vllm/benchmarks/sweep/plot.py +675 -0
  57. vllm/benchmarks/sweep/plot_pareto.py +393 -0
  58. vllm/benchmarks/sweep/serve.py +450 -0
  59. vllm/benchmarks/sweep/serve_sla.py +492 -0
  60. vllm/benchmarks/sweep/server.py +114 -0
  61. vllm/benchmarks/sweep/sla_sweep.py +132 -0
  62. vllm/benchmarks/sweep/utils.py +4 -0
  63. vllm/benchmarks/throughput.py +808 -0
  64. vllm/collect_env.py +857 -0
  65. vllm/compilation/__init__.py +0 -0
  66. vllm/compilation/activation_quant_fusion.py +209 -0
  67. vllm/compilation/backends.py +839 -0
  68. vllm/compilation/base_static_graph.py +57 -0
  69. vllm/compilation/caching.py +180 -0
  70. vllm/compilation/collective_fusion.py +1215 -0
  71. vllm/compilation/compiler_interface.py +639 -0
  72. vllm/compilation/counter.py +48 -0
  73. vllm/compilation/cuda_graph.py +302 -0
  74. vllm/compilation/decorators.py +626 -0
  75. vllm/compilation/fix_functionalization.py +266 -0
  76. vllm/compilation/fusion.py +550 -0
  77. vllm/compilation/fusion_attn.py +359 -0
  78. vllm/compilation/fx_utils.py +91 -0
  79. vllm/compilation/inductor_pass.py +138 -0
  80. vllm/compilation/matcher_utils.py +361 -0
  81. vllm/compilation/monitor.py +62 -0
  82. vllm/compilation/noop_elimination.py +130 -0
  83. vllm/compilation/partition_rules.py +72 -0
  84. vllm/compilation/pass_manager.py +155 -0
  85. vllm/compilation/piecewise_backend.py +178 -0
  86. vllm/compilation/post_cleanup.py +21 -0
  87. vllm/compilation/qk_norm_rope_fusion.py +238 -0
  88. vllm/compilation/rocm_aiter_fusion.py +242 -0
  89. vllm/compilation/sequence_parallelism.py +364 -0
  90. vllm/compilation/torch25_custom_graph_pass.py +44 -0
  91. vllm/compilation/vllm_inductor_pass.py +173 -0
  92. vllm/compilation/wrapper.py +319 -0
  93. vllm/config/__init__.py +108 -0
  94. vllm/config/attention.py +114 -0
  95. vllm/config/cache.py +232 -0
  96. vllm/config/compilation.py +1140 -0
  97. vllm/config/device.py +75 -0
  98. vllm/config/ec_transfer.py +110 -0
  99. vllm/config/kv_events.py +56 -0
  100. vllm/config/kv_transfer.py +119 -0
  101. vllm/config/load.py +124 -0
  102. vllm/config/lora.py +96 -0
  103. vllm/config/model.py +2190 -0
  104. vllm/config/multimodal.py +247 -0
  105. vllm/config/observability.py +140 -0
  106. vllm/config/parallel.py +660 -0
  107. vllm/config/pooler.py +126 -0
  108. vllm/config/profiler.py +199 -0
  109. vllm/config/scheduler.py +299 -0
  110. vllm/config/speculative.py +644 -0
  111. vllm/config/speech_to_text.py +38 -0
  112. vllm/config/structured_outputs.py +78 -0
  113. vllm/config/utils.py +370 -0
  114. vllm/config/vllm.py +1434 -0
  115. vllm/connections.py +189 -0
  116. vllm/device_allocator/__init__.py +0 -0
  117. vllm/device_allocator/cumem.py +327 -0
  118. vllm/distributed/__init__.py +6 -0
  119. vllm/distributed/communication_op.py +43 -0
  120. vllm/distributed/device_communicators/__init__.py +0 -0
  121. vllm/distributed/device_communicators/all2all.py +490 -0
  122. vllm/distributed/device_communicators/all_reduce_utils.py +344 -0
  123. vllm/distributed/device_communicators/base_device_communicator.py +297 -0
  124. vllm/distributed/device_communicators/cpu_communicator.py +209 -0
  125. vllm/distributed/device_communicators/cuda_communicator.py +340 -0
  126. vllm/distributed/device_communicators/cuda_wrapper.py +216 -0
  127. vllm/distributed/device_communicators/custom_all_reduce.py +326 -0
  128. vllm/distributed/device_communicators/mnnvl_compat.py +27 -0
  129. vllm/distributed/device_communicators/pynccl.py +386 -0
  130. vllm/distributed/device_communicators/pynccl_allocator.py +191 -0
  131. vllm/distributed/device_communicators/pynccl_wrapper.py +564 -0
  132. vllm/distributed/device_communicators/quick_all_reduce.py +290 -0
  133. vllm/distributed/device_communicators/ray_communicator.py +259 -0
  134. vllm/distributed/device_communicators/shm_broadcast.py +778 -0
  135. vllm/distributed/device_communicators/shm_object_storage.py +697 -0
  136. vllm/distributed/device_communicators/symm_mem.py +156 -0
  137. vllm/distributed/device_communicators/tpu_communicator.py +99 -0
  138. vllm/distributed/device_communicators/xpu_communicator.py +95 -0
  139. vllm/distributed/ec_transfer/__init__.py +14 -0
  140. vllm/distributed/ec_transfer/ec_connector/__init__.py +0 -0
  141. vllm/distributed/ec_transfer/ec_connector/base.py +247 -0
  142. vllm/distributed/ec_transfer/ec_connector/example_connector.py +201 -0
  143. vllm/distributed/ec_transfer/ec_connector/factory.py +85 -0
  144. vllm/distributed/ec_transfer/ec_transfer_state.py +42 -0
  145. vllm/distributed/eplb/__init__.py +3 -0
  146. vllm/distributed/eplb/async_worker.py +115 -0
  147. vllm/distributed/eplb/eplb_state.py +1164 -0
  148. vllm/distributed/eplb/policy/__init__.py +19 -0
  149. vllm/distributed/eplb/policy/abstract.py +40 -0
  150. vllm/distributed/eplb/policy/default.py +267 -0
  151. vllm/distributed/eplb/rebalance_execute.py +529 -0
  152. vllm/distributed/kv_events.py +499 -0
  153. vllm/distributed/kv_transfer/README.md +29 -0
  154. vllm/distributed/kv_transfer/__init__.py +20 -0
  155. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  156. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  157. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  158. vllm/distributed/kv_transfer/kv_connector/factory.py +197 -0
  159. vllm/distributed/kv_transfer/kv_connector/utils.py +322 -0
  160. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +19 -0
  161. vllm/distributed/kv_transfer/kv_connector/v1/base.py +597 -0
  162. vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py +419 -0
  163. vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py +450 -0
  164. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +327 -0
  165. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py +18 -0
  166. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +378 -0
  167. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py +221 -0
  168. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +1418 -0
  169. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +895 -0
  170. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +186 -0
  171. vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py +914 -0
  172. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +464 -0
  173. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +2526 -0
  174. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +538 -0
  175. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  176. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +531 -0
  177. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +632 -0
  178. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +273 -0
  179. vllm/distributed/kv_transfer/kv_transfer_state.py +78 -0
  180. vllm/distributed/parallel_state.py +1795 -0
  181. vllm/distributed/tpu_distributed_utils.py +188 -0
  182. vllm/distributed/utils.py +545 -0
  183. vllm/engine/__init__.py +0 -0
  184. vllm/engine/arg_utils.py +2068 -0
  185. vllm/engine/async_llm_engine.py +6 -0
  186. vllm/engine/llm_engine.py +6 -0
  187. vllm/engine/protocol.py +190 -0
  188. vllm/entrypoints/__init__.py +0 -0
  189. vllm/entrypoints/anthropic/__init__.py +0 -0
  190. vllm/entrypoints/anthropic/protocol.py +162 -0
  191. vllm/entrypoints/anthropic/serving_messages.py +468 -0
  192. vllm/entrypoints/api_server.py +185 -0
  193. vllm/entrypoints/chat_utils.py +1903 -0
  194. vllm/entrypoints/cli/__init__.py +15 -0
  195. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  196. vllm/entrypoints/cli/benchmark/base.py +25 -0
  197. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  198. vllm/entrypoints/cli/benchmark/main.py +56 -0
  199. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  200. vllm/entrypoints/cli/benchmark/startup.py +21 -0
  201. vllm/entrypoints/cli/benchmark/sweep.py +21 -0
  202. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  203. vllm/entrypoints/cli/collect_env.py +38 -0
  204. vllm/entrypoints/cli/main.py +79 -0
  205. vllm/entrypoints/cli/openai.py +260 -0
  206. vllm/entrypoints/cli/run_batch.py +68 -0
  207. vllm/entrypoints/cli/serve.py +249 -0
  208. vllm/entrypoints/cli/types.py +29 -0
  209. vllm/entrypoints/constants.py +12 -0
  210. vllm/entrypoints/context.py +835 -0
  211. vllm/entrypoints/launcher.py +175 -0
  212. vllm/entrypoints/llm.py +1790 -0
  213. vllm/entrypoints/logger.py +84 -0
  214. vllm/entrypoints/openai/__init__.py +0 -0
  215. vllm/entrypoints/openai/api_server.py +1469 -0
  216. vllm/entrypoints/openai/cli_args.py +302 -0
  217. vllm/entrypoints/openai/orca_metrics.py +120 -0
  218. vllm/entrypoints/openai/parser/__init__.py +0 -0
  219. vllm/entrypoints/openai/parser/harmony_utils.py +825 -0
  220. vllm/entrypoints/openai/parser/responses_parser.py +135 -0
  221. vllm/entrypoints/openai/protocol.py +2496 -0
  222. vllm/entrypoints/openai/run_batch.py +631 -0
  223. vllm/entrypoints/openai/serving_chat.py +1822 -0
  224. vllm/entrypoints/openai/serving_completion.py +729 -0
  225. vllm/entrypoints/openai/serving_engine.py +1542 -0
  226. vllm/entrypoints/openai/serving_models.py +304 -0
  227. vllm/entrypoints/openai/serving_responses.py +2080 -0
  228. vllm/entrypoints/openai/serving_transcription.py +168 -0
  229. vllm/entrypoints/openai/speech_to_text.py +559 -0
  230. vllm/entrypoints/openai/tool_parsers/__init__.py +33 -0
  231. vllm/entrypoints/openai/utils.py +49 -0
  232. vllm/entrypoints/pooling/__init__.py +16 -0
  233. vllm/entrypoints/pooling/classify/__init__.py +0 -0
  234. vllm/entrypoints/pooling/classify/api_router.py +50 -0
  235. vllm/entrypoints/pooling/classify/protocol.py +181 -0
  236. vllm/entrypoints/pooling/classify/serving.py +233 -0
  237. vllm/entrypoints/pooling/embed/__init__.py +0 -0
  238. vllm/entrypoints/pooling/embed/api_router.py +67 -0
  239. vllm/entrypoints/pooling/embed/protocol.py +208 -0
  240. vllm/entrypoints/pooling/embed/serving.py +684 -0
  241. vllm/entrypoints/pooling/pooling/__init__.py +0 -0
  242. vllm/entrypoints/pooling/pooling/api_router.py +63 -0
  243. vllm/entrypoints/pooling/pooling/protocol.py +148 -0
  244. vllm/entrypoints/pooling/pooling/serving.py +354 -0
  245. vllm/entrypoints/pooling/score/__init__.py +0 -0
  246. vllm/entrypoints/pooling/score/api_router.py +149 -0
  247. vllm/entrypoints/pooling/score/protocol.py +146 -0
  248. vllm/entrypoints/pooling/score/serving.py +508 -0
  249. vllm/entrypoints/renderer.py +410 -0
  250. vllm/entrypoints/responses_utils.py +249 -0
  251. vllm/entrypoints/sagemaker/__init__.py +4 -0
  252. vllm/entrypoints/sagemaker/routes.py +118 -0
  253. vllm/entrypoints/score_utils.py +237 -0
  254. vllm/entrypoints/serve/__init__.py +60 -0
  255. vllm/entrypoints/serve/disagg/__init__.py +0 -0
  256. vllm/entrypoints/serve/disagg/api_router.py +110 -0
  257. vllm/entrypoints/serve/disagg/protocol.py +90 -0
  258. vllm/entrypoints/serve/disagg/serving.py +285 -0
  259. vllm/entrypoints/serve/elastic_ep/__init__.py +0 -0
  260. vllm/entrypoints/serve/elastic_ep/api_router.py +96 -0
  261. vllm/entrypoints/serve/elastic_ep/middleware.py +49 -0
  262. vllm/entrypoints/serve/instrumentator/__init__.py +0 -0
  263. vllm/entrypoints/serve/instrumentator/health.py +33 -0
  264. vllm/entrypoints/serve/instrumentator/metrics.py +45 -0
  265. vllm/entrypoints/serve/lora/__init__.py +0 -0
  266. vllm/entrypoints/serve/lora/api_router.py +70 -0
  267. vllm/entrypoints/serve/profile/__init__.py +0 -0
  268. vllm/entrypoints/serve/profile/api_router.py +46 -0
  269. vllm/entrypoints/serve/rlhf/__init__.py +0 -0
  270. vllm/entrypoints/serve/rlhf/api_router.py +102 -0
  271. vllm/entrypoints/serve/sleep/__init__.py +0 -0
  272. vllm/entrypoints/serve/sleep/api_router.py +60 -0
  273. vllm/entrypoints/serve/tokenize/__init__.py +0 -0
  274. vllm/entrypoints/serve/tokenize/api_router.py +118 -0
  275. vllm/entrypoints/serve/tokenize/serving.py +204 -0
  276. vllm/entrypoints/ssl.py +78 -0
  277. vllm/entrypoints/tool.py +187 -0
  278. vllm/entrypoints/tool_server.py +234 -0
  279. vllm/entrypoints/utils.py +319 -0
  280. vllm/env_override.py +378 -0
  281. vllm/envs.py +1744 -0
  282. vllm/forward_context.py +358 -0
  283. vllm/inputs/__init__.py +44 -0
  284. vllm/inputs/data.py +359 -0
  285. vllm/inputs/parse.py +146 -0
  286. vllm/inputs/preprocess.py +717 -0
  287. vllm/logger.py +303 -0
  288. vllm/logging_utils/__init__.py +13 -0
  289. vllm/logging_utils/dump_input.py +83 -0
  290. vllm/logging_utils/formatter.py +127 -0
  291. vllm/logging_utils/lazy.py +20 -0
  292. vllm/logging_utils/log_time.py +34 -0
  293. vllm/logits_process.py +121 -0
  294. vllm/logprobs.py +206 -0
  295. vllm/lora/__init__.py +0 -0
  296. vllm/lora/layers/__init__.py +42 -0
  297. vllm/lora/layers/base.py +66 -0
  298. vllm/lora/layers/base_linear.py +165 -0
  299. vllm/lora/layers/column_parallel_linear.py +577 -0
  300. vllm/lora/layers/fused_moe.py +747 -0
  301. vllm/lora/layers/logits_processor.py +203 -0
  302. vllm/lora/layers/replicated_linear.py +70 -0
  303. vllm/lora/layers/row_parallel_linear.py +176 -0
  304. vllm/lora/layers/utils.py +74 -0
  305. vllm/lora/layers/vocal_parallel_embedding.py +140 -0
  306. vllm/lora/lora_model.py +246 -0
  307. vllm/lora/lora_weights.py +227 -0
  308. vllm/lora/model_manager.py +690 -0
  309. vllm/lora/ops/__init__.py +0 -0
  310. vllm/lora/ops/ipex_ops/__init__.py +6 -0
  311. vllm/lora/ops/ipex_ops/lora_ops.py +57 -0
  312. vllm/lora/ops/torch_ops/__init__.py +20 -0
  313. vllm/lora/ops/torch_ops/lora_ops.py +128 -0
  314. vllm/lora/ops/triton_ops/README_TUNING.md +60 -0
  315. vllm/lora/ops/triton_ops/__init__.py +21 -0
  316. vllm/lora/ops/triton_ops/fused_moe_lora_op.py +665 -0
  317. vllm/lora/ops/triton_ops/kernel_utils.py +340 -0
  318. vllm/lora/ops/triton_ops/lora_expand_op.py +310 -0
  319. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +154 -0
  320. vllm/lora/ops/triton_ops/lora_shrink_op.py +287 -0
  321. vllm/lora/ops/triton_ops/utils.py +295 -0
  322. vllm/lora/ops/xla_ops/__init__.py +6 -0
  323. vllm/lora/ops/xla_ops/lora_ops.py +141 -0
  324. vllm/lora/peft_helper.py +128 -0
  325. vllm/lora/punica_wrapper/__init__.py +10 -0
  326. vllm/lora/punica_wrapper/punica_base.py +493 -0
  327. vllm/lora/punica_wrapper/punica_cpu.py +351 -0
  328. vllm/lora/punica_wrapper/punica_gpu.py +412 -0
  329. vllm/lora/punica_wrapper/punica_selector.py +21 -0
  330. vllm/lora/punica_wrapper/punica_tpu.py +358 -0
  331. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  332. vllm/lora/punica_wrapper/utils.py +150 -0
  333. vllm/lora/request.py +100 -0
  334. vllm/lora/resolver.py +88 -0
  335. vllm/lora/utils.py +315 -0
  336. vllm/lora/worker_manager.py +268 -0
  337. vllm/model_executor/__init__.py +11 -0
  338. vllm/model_executor/custom_op.py +199 -0
  339. vllm/model_executor/layers/__init__.py +0 -0
  340. vllm/model_executor/layers/activation.py +595 -0
  341. vllm/model_executor/layers/attention_layer_base.py +32 -0
  342. vllm/model_executor/layers/batch_invariant.py +1067 -0
  343. vllm/model_executor/layers/conv.py +256 -0
  344. vllm/model_executor/layers/fla/__init__.py +8 -0
  345. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  346. vllm/model_executor/layers/fla/ops/chunk.py +240 -0
  347. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +344 -0
  348. vllm/model_executor/layers/fla/ops/chunk_o.py +183 -0
  349. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +154 -0
  350. vllm/model_executor/layers/fla/ops/cumsum.py +280 -0
  351. vllm/model_executor/layers/fla/ops/fused_recurrent.py +390 -0
  352. vllm/model_executor/layers/fla/ops/index.py +41 -0
  353. vllm/model_executor/layers/fla/ops/kda.py +1351 -0
  354. vllm/model_executor/layers/fla/ops/l2norm.py +146 -0
  355. vllm/model_executor/layers/fla/ops/layernorm_guard.py +396 -0
  356. vllm/model_executor/layers/fla/ops/op.py +60 -0
  357. vllm/model_executor/layers/fla/ops/solve_tril.py +556 -0
  358. vllm/model_executor/layers/fla/ops/utils.py +194 -0
  359. vllm/model_executor/layers/fla/ops/wy_fast.py +158 -0
  360. vllm/model_executor/layers/fused_moe/__init__.py +114 -0
  361. vllm/model_executor/layers/fused_moe/all2all_utils.py +171 -0
  362. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +409 -0
  363. vllm/model_executor/layers/fused_moe/config.py +1043 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json +147 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json +213 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json +147 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json +201 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +147 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json +164 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json +147 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json +147 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json +147 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  559. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  560. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  561. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  562. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  563. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  564. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  565. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  566. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  567. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  568. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  569. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  570. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  571. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  572. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  573. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  574. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  575. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  576. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  577. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  578. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  579. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  580. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  581. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  582. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  583. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  584. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  585. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  586. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  587. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  588. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  589. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  590. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  591. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  592. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  593. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  594. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  595. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  596. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  597. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  598. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  599. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  600. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  601. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  602. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  603. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  604. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  605. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  606. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  607. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  608. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  609. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  610. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  611. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  612. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  613. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  614. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  615. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  616. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  617. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  618. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  619. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  620. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  621. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  622. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  623. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  624. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  625. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  626. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  627. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  628. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  629. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  630. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  631. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  632. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  633. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  634. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  635. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  636. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  637. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  638. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  639. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +292 -0
  640. vllm/model_executor/layers/fused_moe/cutlass_moe.py +1453 -0
  641. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +358 -0
  642. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +427 -0
  643. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +420 -0
  644. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +434 -0
  645. vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py +376 -0
  646. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +307 -0
  647. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +362 -0
  648. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +192 -0
  649. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1012 -0
  650. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +825 -0
  651. vllm/model_executor/layers/fused_moe/fused_moe.py +2223 -0
  652. vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +103 -0
  653. vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +119 -0
  654. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +524 -0
  655. vllm/model_executor/layers/fused_moe/layer.py +2133 -0
  656. vllm/model_executor/layers/fused_moe/modular_kernel.py +1302 -0
  657. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +192 -0
  658. vllm/model_executor/layers/fused_moe/moe_pallas.py +83 -0
  659. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +229 -0
  660. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  661. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +362 -0
  662. vllm/model_executor/layers/fused_moe/prepare_finalize.py +78 -0
  663. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +265 -0
  664. vllm/model_executor/layers/fused_moe/routing_simulator.py +310 -0
  665. vllm/model_executor/layers/fused_moe/shared_fused_moe.py +96 -0
  666. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +171 -0
  667. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +163 -0
  668. vllm/model_executor/layers/fused_moe/trtllm_moe.py +143 -0
  669. vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +455 -0
  670. vllm/model_executor/layers/fused_moe/utils.py +332 -0
  671. vllm/model_executor/layers/kda.py +442 -0
  672. vllm/model_executor/layers/layernorm.py +442 -0
  673. vllm/model_executor/layers/lightning_attn.py +735 -0
  674. vllm/model_executor/layers/linear.py +1424 -0
  675. vllm/model_executor/layers/logits_processor.py +106 -0
  676. vllm/model_executor/layers/mamba/__init__.py +0 -0
  677. vllm/model_executor/layers/mamba/abstract.py +68 -0
  678. vllm/model_executor/layers/mamba/linear_attn.py +388 -0
  679. vllm/model_executor/layers/mamba/mamba_mixer.py +526 -0
  680. vllm/model_executor/layers/mamba/mamba_mixer2.py +930 -0
  681. vllm/model_executor/layers/mamba/mamba_utils.py +225 -0
  682. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  683. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1240 -0
  684. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +172 -0
  685. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +586 -0
  686. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +211 -0
  687. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +456 -0
  688. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +700 -0
  689. vllm/model_executor/layers/mamba/ops/ssd_combined.py +230 -0
  690. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +157 -0
  691. vllm/model_executor/layers/mamba/short_conv.py +255 -0
  692. vllm/model_executor/layers/mla.py +176 -0
  693. vllm/model_executor/layers/pooler.py +830 -0
  694. vllm/model_executor/layers/quantization/__init__.py +179 -0
  695. vllm/model_executor/layers/quantization/auto_round.py +454 -0
  696. vllm/model_executor/layers/quantization/awq.py +277 -0
  697. vllm/model_executor/layers/quantization/awq_marlin.py +793 -0
  698. vllm/model_executor/layers/quantization/awq_triton.py +337 -0
  699. vllm/model_executor/layers/quantization/base_config.py +170 -0
  700. vllm/model_executor/layers/quantization/bitblas.py +502 -0
  701. vllm/model_executor/layers/quantization/bitsandbytes.py +626 -0
  702. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +3 -0
  703. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +986 -0
  704. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2645 -0
  705. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +35 -0
  706. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +392 -0
  707. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  708. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +176 -0
  709. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +124 -0
  710. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +218 -0
  711. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +176 -0
  712. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +153 -0
  713. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +138 -0
  714. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +200 -0
  715. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +125 -0
  716. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +230 -0
  717. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  718. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +260 -0
  719. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +173 -0
  720. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  721. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +64 -0
  722. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  723. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +224 -0
  724. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  725. vllm/model_executor/layers/quantization/cpu_wna16.py +625 -0
  726. vllm/model_executor/layers/quantization/deepspeedfp.py +218 -0
  727. vllm/model_executor/layers/quantization/experts_int8.py +207 -0
  728. vllm/model_executor/layers/quantization/fbgemm_fp8.py +195 -0
  729. vllm/model_executor/layers/quantization/fp8.py +1461 -0
  730. vllm/model_executor/layers/quantization/fp_quant.py +420 -0
  731. vllm/model_executor/layers/quantization/gguf.py +677 -0
  732. vllm/model_executor/layers/quantization/gptq.py +393 -0
  733. vllm/model_executor/layers/quantization/gptq_bitblas.py +482 -0
  734. vllm/model_executor/layers/quantization/gptq_marlin.py +932 -0
  735. vllm/model_executor/layers/quantization/gptq_marlin_24.py +320 -0
  736. vllm/model_executor/layers/quantization/hqq_marlin.py +372 -0
  737. vllm/model_executor/layers/quantization/inc.py +65 -0
  738. vllm/model_executor/layers/quantization/input_quant_fp8.py +202 -0
  739. vllm/model_executor/layers/quantization/ipex_quant.py +487 -0
  740. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  741. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +94 -0
  742. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +109 -0
  743. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  744. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +323 -0
  745. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +98 -0
  746. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +130 -0
  747. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +111 -0
  748. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +161 -0
  749. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +159 -0
  750. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +200 -0
  751. vllm/model_executor/layers/quantization/kernels/mixed_precision/xpu.py +97 -0
  752. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +76 -0
  753. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +81 -0
  754. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +128 -0
  755. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +220 -0
  756. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +147 -0
  757. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +71 -0
  758. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +106 -0
  759. vllm/model_executor/layers/quantization/kv_cache.py +153 -0
  760. vllm/model_executor/layers/quantization/modelopt.py +1684 -0
  761. vllm/model_executor/layers/quantization/moe_wna16.py +516 -0
  762. vllm/model_executor/layers/quantization/mxfp4.py +1140 -0
  763. vllm/model_executor/layers/quantization/petit.py +319 -0
  764. vllm/model_executor/layers/quantization/ptpc_fp8.py +136 -0
  765. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  766. vllm/model_executor/layers/quantization/quark/quark.py +527 -0
  767. vllm/model_executor/layers/quantization/quark/quark_moe.py +622 -0
  768. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  769. vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py +343 -0
  770. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  771. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +179 -0
  772. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +139 -0
  773. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  774. vllm/model_executor/layers/quantization/qutlass_utils.py +185 -0
  775. vllm/model_executor/layers/quantization/rtn.py +621 -0
  776. vllm/model_executor/layers/quantization/schema.py +90 -0
  777. vllm/model_executor/layers/quantization/torchao.py +380 -0
  778. vllm/model_executor/layers/quantization/tpu_int8.py +139 -0
  779. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  780. vllm/model_executor/layers/quantization/utils/allspark_utils.py +67 -0
  781. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +229 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  786. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  902. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  903. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  904. vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  905. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  906. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  907. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  908. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  909. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  910. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  911. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  912. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  913. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  914. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  915. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  916. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  917. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  918. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  919. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  920. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  921. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  922. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  923. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  924. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  925. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  926. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  927. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  928. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  929. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  930. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  931. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  932. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  933. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  934. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  935. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  936. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  937. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  938. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  939. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  940. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  941. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  942. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  943. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  944. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  945. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  946. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  947. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  948. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  949. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  950. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  951. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  952. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  953. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  954. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  955. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  956. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  957. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  958. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  959. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  960. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  961. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  962. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  963. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  964. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  965. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  966. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  967. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  968. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  969. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  970. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  971. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  972. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  973. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  974. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  975. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  976. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  977. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  978. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  979. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  980. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  981. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  982. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  983. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  984. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  985. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  986. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  987. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  988. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  989. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  990. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  991. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  992. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  993. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  994. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  995. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  996. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  997. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +412 -0
  998. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +312 -0
  999. vllm/model_executor/layers/quantization/utils/fp8_utils.py +1453 -0
  1000. vllm/model_executor/layers/quantization/utils/gptq_utils.py +158 -0
  1001. vllm/model_executor/layers/quantization/utils/int8_utils.py +474 -0
  1002. vllm/model_executor/layers/quantization/utils/layer_utils.py +41 -0
  1003. vllm/model_executor/layers/quantization/utils/machete_utils.py +56 -0
  1004. vllm/model_executor/layers/quantization/utils/marlin_utils.py +678 -0
  1005. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +452 -0
  1006. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +381 -0
  1007. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +219 -0
  1008. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +467 -0
  1009. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +189 -0
  1010. vllm/model_executor/layers/quantization/utils/mxfp6_utils.py +142 -0
  1011. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +24 -0
  1012. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +142 -0
  1013. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +67 -0
  1014. vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py +51 -0
  1015. vllm/model_executor/layers/quantization/utils/petit_utils.py +124 -0
  1016. vllm/model_executor/layers/quantization/utils/quant_utils.py +741 -0
  1017. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +519 -0
  1018. vllm/model_executor/layers/resampler.py +283 -0
  1019. vllm/model_executor/layers/rotary_embedding/__init__.py +289 -0
  1020. vllm/model_executor/layers/rotary_embedding/base.py +254 -0
  1021. vllm/model_executor/layers/rotary_embedding/common.py +279 -0
  1022. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +165 -0
  1023. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +215 -0
  1024. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +43 -0
  1025. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +68 -0
  1026. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +82 -0
  1027. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  1028. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  1029. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +80 -0
  1030. vllm/model_executor/layers/rotary_embedding/mrope.py +412 -0
  1031. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +47 -0
  1032. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +159 -0
  1033. vllm/model_executor/layers/rotary_embedding/xdrope.py +160 -0
  1034. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +84 -0
  1035. vllm/model_executor/layers/utils.py +251 -0
  1036. vllm/model_executor/layers/vocab_parallel_embedding.py +558 -0
  1037. vllm/model_executor/model_loader/__init__.py +150 -0
  1038. vllm/model_executor/model_loader/base_loader.py +57 -0
  1039. vllm/model_executor/model_loader/bitsandbytes_loader.py +822 -0
  1040. vllm/model_executor/model_loader/default_loader.py +321 -0
  1041. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  1042. vllm/model_executor/model_loader/gguf_loader.py +371 -0
  1043. vllm/model_executor/model_loader/online_quantization.py +275 -0
  1044. vllm/model_executor/model_loader/runai_streamer_loader.py +116 -0
  1045. vllm/model_executor/model_loader/sharded_state_loader.py +214 -0
  1046. vllm/model_executor/model_loader/tensorizer.py +790 -0
  1047. vllm/model_executor/model_loader/tensorizer_loader.py +151 -0
  1048. vllm/model_executor/model_loader/tpu.py +118 -0
  1049. vllm/model_executor/model_loader/utils.py +292 -0
  1050. vllm/model_executor/model_loader/weight_utils.py +1157 -0
  1051. vllm/model_executor/models/__init__.py +44 -0
  1052. vllm/model_executor/models/adapters.py +522 -0
  1053. vllm/model_executor/models/afmoe.py +696 -0
  1054. vllm/model_executor/models/aimv2.py +248 -0
  1055. vllm/model_executor/models/apertus.py +565 -0
  1056. vllm/model_executor/models/arcee.py +428 -0
  1057. vllm/model_executor/models/arctic.py +633 -0
  1058. vllm/model_executor/models/aria.py +653 -0
  1059. vllm/model_executor/models/audioflamingo3.py +639 -0
  1060. vllm/model_executor/models/aya_vision.py +448 -0
  1061. vllm/model_executor/models/bagel.py +584 -0
  1062. vllm/model_executor/models/baichuan.py +493 -0
  1063. vllm/model_executor/models/bailing_moe.py +642 -0
  1064. vllm/model_executor/models/bamba.py +511 -0
  1065. vllm/model_executor/models/bee.py +157 -0
  1066. vllm/model_executor/models/bert.py +925 -0
  1067. vllm/model_executor/models/bert_with_rope.py +732 -0
  1068. vllm/model_executor/models/blip.py +350 -0
  1069. vllm/model_executor/models/blip2.py +693 -0
  1070. vllm/model_executor/models/bloom.py +390 -0
  1071. vllm/model_executor/models/chameleon.py +1095 -0
  1072. vllm/model_executor/models/chatglm.py +502 -0
  1073. vllm/model_executor/models/clip.py +1004 -0
  1074. vllm/model_executor/models/cohere2_vision.py +470 -0
  1075. vllm/model_executor/models/commandr.py +469 -0
  1076. vllm/model_executor/models/config.py +531 -0
  1077. vllm/model_executor/models/dbrx.py +484 -0
  1078. vllm/model_executor/models/deepencoder.py +676 -0
  1079. vllm/model_executor/models/deepseek_eagle.py +252 -0
  1080. vllm/model_executor/models/deepseek_mtp.py +446 -0
  1081. vllm/model_executor/models/deepseek_ocr.py +591 -0
  1082. vllm/model_executor/models/deepseek_v2.py +1710 -0
  1083. vllm/model_executor/models/deepseek_vl2.py +642 -0
  1084. vllm/model_executor/models/dots1.py +565 -0
  1085. vllm/model_executor/models/dots_ocr.py +821 -0
  1086. vllm/model_executor/models/ernie45.py +53 -0
  1087. vllm/model_executor/models/ernie45_moe.py +754 -0
  1088. vllm/model_executor/models/ernie45_vl.py +1621 -0
  1089. vllm/model_executor/models/ernie45_vl_moe.py +800 -0
  1090. vllm/model_executor/models/ernie_mtp.py +279 -0
  1091. vllm/model_executor/models/exaone.py +524 -0
  1092. vllm/model_executor/models/exaone4.py +516 -0
  1093. vllm/model_executor/models/fairseq2_llama.py +154 -0
  1094. vllm/model_executor/models/falcon.py +543 -0
  1095. vllm/model_executor/models/falcon_h1.py +675 -0
  1096. vllm/model_executor/models/flex_olmo.py +155 -0
  1097. vllm/model_executor/models/fuyu.py +371 -0
  1098. vllm/model_executor/models/gemma.py +425 -0
  1099. vllm/model_executor/models/gemma2.py +435 -0
  1100. vllm/model_executor/models/gemma3.py +507 -0
  1101. vllm/model_executor/models/gemma3_mm.py +664 -0
  1102. vllm/model_executor/models/gemma3n.py +1166 -0
  1103. vllm/model_executor/models/gemma3n_mm.py +810 -0
  1104. vllm/model_executor/models/glm.py +24 -0
  1105. vllm/model_executor/models/glm4.py +295 -0
  1106. vllm/model_executor/models/glm4_1v.py +1808 -0
  1107. vllm/model_executor/models/glm4_moe.py +736 -0
  1108. vllm/model_executor/models/glm4_moe_mtp.py +359 -0
  1109. vllm/model_executor/models/glm4v.py +783 -0
  1110. vllm/model_executor/models/gpt2.py +397 -0
  1111. vllm/model_executor/models/gpt_bigcode.py +339 -0
  1112. vllm/model_executor/models/gpt_j.py +346 -0
  1113. vllm/model_executor/models/gpt_neox.py +340 -0
  1114. vllm/model_executor/models/gpt_oss.py +744 -0
  1115. vllm/model_executor/models/granite.py +475 -0
  1116. vllm/model_executor/models/granite_speech.py +912 -0
  1117. vllm/model_executor/models/granitemoe.py +560 -0
  1118. vllm/model_executor/models/granitemoehybrid.py +703 -0
  1119. vllm/model_executor/models/granitemoeshared.py +328 -0
  1120. vllm/model_executor/models/gritlm.py +243 -0
  1121. vllm/model_executor/models/grok1.py +554 -0
  1122. vllm/model_executor/models/h2ovl.py +554 -0
  1123. vllm/model_executor/models/hunyuan_v1.py +1040 -0
  1124. vllm/model_executor/models/hunyuan_vision.py +1034 -0
  1125. vllm/model_executor/models/hyperclovax_vision.py +1164 -0
  1126. vllm/model_executor/models/idefics2_vision_model.py +427 -0
  1127. vllm/model_executor/models/idefics3.py +716 -0
  1128. vllm/model_executor/models/interfaces.py +1179 -0
  1129. vllm/model_executor/models/interfaces_base.py +228 -0
  1130. vllm/model_executor/models/intern_vit.py +454 -0
  1131. vllm/model_executor/models/internlm2.py +453 -0
  1132. vllm/model_executor/models/internlm2_ve.py +139 -0
  1133. vllm/model_executor/models/interns1.py +828 -0
  1134. vllm/model_executor/models/interns1_vit.py +433 -0
  1135. vllm/model_executor/models/internvl.py +1450 -0
  1136. vllm/model_executor/models/jais.py +397 -0
  1137. vllm/model_executor/models/jais2.py +529 -0
  1138. vllm/model_executor/models/jamba.py +609 -0
  1139. vllm/model_executor/models/jina_vl.py +147 -0
  1140. vllm/model_executor/models/keye.py +1706 -0
  1141. vllm/model_executor/models/keye_vl1_5.py +726 -0
  1142. vllm/model_executor/models/kimi_linear.py +658 -0
  1143. vllm/model_executor/models/kimi_vl.py +576 -0
  1144. vllm/model_executor/models/lfm2.py +515 -0
  1145. vllm/model_executor/models/lfm2_moe.py +745 -0
  1146. vllm/model_executor/models/lightonocr.py +195 -0
  1147. vllm/model_executor/models/llama.py +700 -0
  1148. vllm/model_executor/models/llama4.py +856 -0
  1149. vllm/model_executor/models/llama4_eagle.py +225 -0
  1150. vllm/model_executor/models/llama_eagle.py +213 -0
  1151. vllm/model_executor/models/llama_eagle3.py +375 -0
  1152. vllm/model_executor/models/llava.py +840 -0
  1153. vllm/model_executor/models/llava_next.py +581 -0
  1154. vllm/model_executor/models/llava_next_video.py +465 -0
  1155. vllm/model_executor/models/llava_onevision.py +921 -0
  1156. vllm/model_executor/models/longcat_flash.py +743 -0
  1157. vllm/model_executor/models/longcat_flash_mtp.py +349 -0
  1158. vllm/model_executor/models/mamba.py +276 -0
  1159. vllm/model_executor/models/mamba2.py +288 -0
  1160. vllm/model_executor/models/medusa.py +179 -0
  1161. vllm/model_executor/models/midashenglm.py +826 -0
  1162. vllm/model_executor/models/mimo.py +188 -0
  1163. vllm/model_executor/models/mimo_mtp.py +294 -0
  1164. vllm/model_executor/models/minicpm.py +656 -0
  1165. vllm/model_executor/models/minicpm3.py +233 -0
  1166. vllm/model_executor/models/minicpm_eagle.py +385 -0
  1167. vllm/model_executor/models/minicpmo.py +768 -0
  1168. vllm/model_executor/models/minicpmv.py +1742 -0
  1169. vllm/model_executor/models/minimax_m2.py +550 -0
  1170. vllm/model_executor/models/minimax_text_01.py +1007 -0
  1171. vllm/model_executor/models/minimax_vl_01.py +394 -0
  1172. vllm/model_executor/models/mistral3.py +635 -0
  1173. vllm/model_executor/models/mistral_large_3.py +63 -0
  1174. vllm/model_executor/models/mistral_large_3_eagle.py +136 -0
  1175. vllm/model_executor/models/mixtral.py +598 -0
  1176. vllm/model_executor/models/mllama4.py +1149 -0
  1177. vllm/model_executor/models/mlp_speculator.py +235 -0
  1178. vllm/model_executor/models/modernbert.py +451 -0
  1179. vllm/model_executor/models/module_mapping.py +74 -0
  1180. vllm/model_executor/models/molmo.py +1550 -0
  1181. vllm/model_executor/models/moonvit.py +686 -0
  1182. vllm/model_executor/models/mpt.py +335 -0
  1183. vllm/model_executor/models/nano_nemotron_vl.py +1730 -0
  1184. vllm/model_executor/models/nemotron.py +499 -0
  1185. vllm/model_executor/models/nemotron_h.py +900 -0
  1186. vllm/model_executor/models/nemotron_nas.py +471 -0
  1187. vllm/model_executor/models/nemotron_vl.py +651 -0
  1188. vllm/model_executor/models/nvlm_d.py +216 -0
  1189. vllm/model_executor/models/olmo.py +412 -0
  1190. vllm/model_executor/models/olmo2.py +454 -0
  1191. vllm/model_executor/models/olmoe.py +493 -0
  1192. vllm/model_executor/models/opencua.py +262 -0
  1193. vllm/model_executor/models/openpangu.py +1049 -0
  1194. vllm/model_executor/models/openpangu_mtp.py +265 -0
  1195. vllm/model_executor/models/opt.py +426 -0
  1196. vllm/model_executor/models/orion.py +365 -0
  1197. vllm/model_executor/models/ouro.py +507 -0
  1198. vllm/model_executor/models/ovis.py +557 -0
  1199. vllm/model_executor/models/ovis2_5.py +661 -0
  1200. vllm/model_executor/models/paddleocr_vl.py +1300 -0
  1201. vllm/model_executor/models/paligemma.py +408 -0
  1202. vllm/model_executor/models/persimmon.py +373 -0
  1203. vllm/model_executor/models/phi.py +363 -0
  1204. vllm/model_executor/models/phi3.py +18 -0
  1205. vllm/model_executor/models/phi3v.py +729 -0
  1206. vllm/model_executor/models/phi4mm.py +1251 -0
  1207. vllm/model_executor/models/phi4mm_audio.py +1296 -0
  1208. vllm/model_executor/models/phi4mm_utils.py +1907 -0
  1209. vllm/model_executor/models/phimoe.py +669 -0
  1210. vllm/model_executor/models/pixtral.py +1379 -0
  1211. vllm/model_executor/models/plamo2.py +965 -0
  1212. vllm/model_executor/models/plamo3.py +440 -0
  1213. vllm/model_executor/models/qwen.py +365 -0
  1214. vllm/model_executor/models/qwen2.py +600 -0
  1215. vllm/model_executor/models/qwen2_5_omni_thinker.py +1219 -0
  1216. vllm/model_executor/models/qwen2_5_vl.py +1569 -0
  1217. vllm/model_executor/models/qwen2_audio.py +471 -0
  1218. vllm/model_executor/models/qwen2_moe.py +597 -0
  1219. vllm/model_executor/models/qwen2_rm.py +123 -0
  1220. vllm/model_executor/models/qwen2_vl.py +1568 -0
  1221. vllm/model_executor/models/qwen3.py +331 -0
  1222. vllm/model_executor/models/qwen3_moe.py +751 -0
  1223. vllm/model_executor/models/qwen3_next.py +1395 -0
  1224. vllm/model_executor/models/qwen3_next_mtp.py +296 -0
  1225. vllm/model_executor/models/qwen3_omni_moe_thinker.py +1793 -0
  1226. vllm/model_executor/models/qwen3_vl.py +2092 -0
  1227. vllm/model_executor/models/qwen3_vl_moe.py +474 -0
  1228. vllm/model_executor/models/qwen_vl.py +801 -0
  1229. vllm/model_executor/models/radio.py +555 -0
  1230. vllm/model_executor/models/registry.py +1189 -0
  1231. vllm/model_executor/models/roberta.py +259 -0
  1232. vllm/model_executor/models/rvl.py +107 -0
  1233. vllm/model_executor/models/seed_oss.py +492 -0
  1234. vllm/model_executor/models/siglip.py +1244 -0
  1235. vllm/model_executor/models/siglip2navit.py +658 -0
  1236. vllm/model_executor/models/skyworkr1v.py +951 -0
  1237. vllm/model_executor/models/smolvlm.py +38 -0
  1238. vllm/model_executor/models/solar.py +484 -0
  1239. vllm/model_executor/models/stablelm.py +354 -0
  1240. vllm/model_executor/models/starcoder2.py +365 -0
  1241. vllm/model_executor/models/step3_text.py +554 -0
  1242. vllm/model_executor/models/step3_vl.py +1147 -0
  1243. vllm/model_executor/models/swin.py +514 -0
  1244. vllm/model_executor/models/tarsier.py +617 -0
  1245. vllm/model_executor/models/telechat2.py +153 -0
  1246. vllm/model_executor/models/teleflm.py +78 -0
  1247. vllm/model_executor/models/terratorch.py +318 -0
  1248. vllm/model_executor/models/transformers/__init__.py +127 -0
  1249. vllm/model_executor/models/transformers/base.py +518 -0
  1250. vllm/model_executor/models/transformers/causal.py +65 -0
  1251. vllm/model_executor/models/transformers/legacy.py +90 -0
  1252. vllm/model_executor/models/transformers/moe.py +325 -0
  1253. vllm/model_executor/models/transformers/multimodal.py +411 -0
  1254. vllm/model_executor/models/transformers/pooling.py +119 -0
  1255. vllm/model_executor/models/transformers/utils.py +213 -0
  1256. vllm/model_executor/models/ultravox.py +766 -0
  1257. vllm/model_executor/models/utils.py +832 -0
  1258. vllm/model_executor/models/vision.py +546 -0
  1259. vllm/model_executor/models/voxtral.py +841 -0
  1260. vllm/model_executor/models/whisper.py +971 -0
  1261. vllm/model_executor/models/zamba2.py +979 -0
  1262. vllm/model_executor/parameter.py +642 -0
  1263. vllm/model_executor/utils.py +119 -0
  1264. vllm/model_executor/warmup/__init__.py +0 -0
  1265. vllm/model_executor/warmup/deep_gemm_warmup.py +314 -0
  1266. vllm/model_executor/warmup/kernel_warmup.py +98 -0
  1267. vllm/multimodal/__init__.py +40 -0
  1268. vllm/multimodal/audio.py +147 -0
  1269. vllm/multimodal/base.py +56 -0
  1270. vllm/multimodal/cache.py +823 -0
  1271. vllm/multimodal/evs.py +294 -0
  1272. vllm/multimodal/hasher.py +120 -0
  1273. vllm/multimodal/image.py +142 -0
  1274. vllm/multimodal/inputs.py +1089 -0
  1275. vllm/multimodal/parse.py +565 -0
  1276. vllm/multimodal/processing.py +2240 -0
  1277. vllm/multimodal/profiling.py +351 -0
  1278. vllm/multimodal/registry.py +357 -0
  1279. vllm/multimodal/utils.py +513 -0
  1280. vllm/multimodal/video.py +340 -0
  1281. vllm/outputs.py +345 -0
  1282. vllm/platforms/__init__.py +277 -0
  1283. vllm/platforms/cpu.py +421 -0
  1284. vllm/platforms/cuda.py +618 -0
  1285. vllm/platforms/interface.py +695 -0
  1286. vllm/platforms/rocm.py +564 -0
  1287. vllm/platforms/tpu.py +295 -0
  1288. vllm/platforms/xpu.py +277 -0
  1289. vllm/plugins/__init__.py +81 -0
  1290. vllm/plugins/io_processors/__init__.py +68 -0
  1291. vllm/plugins/io_processors/interface.py +77 -0
  1292. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1293. vllm/plugins/lora_resolvers/filesystem_resolver.py +52 -0
  1294. vllm/pooling_params.py +230 -0
  1295. vllm/profiler/__init__.py +0 -0
  1296. vllm/profiler/layerwise_profile.py +392 -0
  1297. vllm/profiler/utils.py +151 -0
  1298. vllm/profiler/wrapper.py +241 -0
  1299. vllm/py.typed +2 -0
  1300. vllm/ray/__init__.py +0 -0
  1301. vllm/ray/lazy_utils.py +30 -0
  1302. vllm/ray/ray_env.py +79 -0
  1303. vllm/reasoning/__init__.py +96 -0
  1304. vllm/reasoning/abs_reasoning_parsers.py +318 -0
  1305. vllm/reasoning/basic_parsers.py +175 -0
  1306. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1307. vllm/reasoning/deepseek_v3_reasoning_parser.py +67 -0
  1308. vllm/reasoning/ernie45_reasoning_parser.py +165 -0
  1309. vllm/reasoning/glm4_moe_reasoning_parser.py +171 -0
  1310. vllm/reasoning/gptoss_reasoning_parser.py +173 -0
  1311. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1312. vllm/reasoning/holo2_reasoning_parser.py +88 -0
  1313. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +237 -0
  1314. vllm/reasoning/identity_reasoning_parser.py +63 -0
  1315. vllm/reasoning/minimax_m2_reasoning_parser.py +110 -0
  1316. vllm/reasoning/mistral_reasoning_parser.py +154 -0
  1317. vllm/reasoning/olmo3_reasoning_parser.py +302 -0
  1318. vllm/reasoning/qwen3_reasoning_parser.py +67 -0
  1319. vllm/reasoning/seedoss_reasoning_parser.py +27 -0
  1320. vllm/reasoning/step3_reasoning_parser.py +107 -0
  1321. vllm/sampling_params.py +597 -0
  1322. vllm/scalar_type.py +355 -0
  1323. vllm/scripts.py +17 -0
  1324. vllm/sequence.py +98 -0
  1325. vllm/tasks.py +13 -0
  1326. vllm/third_party/__init__.py +0 -0
  1327. vllm/third_party/pynvml.py +6140 -0
  1328. vllm/tokenizers/__init__.py +20 -0
  1329. vllm/tokenizers/deepseek_v32.py +175 -0
  1330. vllm/tokenizers/deepseek_v32_encoding.py +459 -0
  1331. vllm/tokenizers/detokenizer_utils.py +198 -0
  1332. vllm/tokenizers/hf.py +119 -0
  1333. vllm/tokenizers/mistral.py +567 -0
  1334. vllm/tokenizers/protocol.py +114 -0
  1335. vllm/tokenizers/registry.py +233 -0
  1336. vllm/tool_parsers/__init__.py +150 -0
  1337. vllm/tool_parsers/abstract_tool_parser.py +273 -0
  1338. vllm/tool_parsers/deepseekv31_tool_parser.py +388 -0
  1339. vllm/tool_parsers/deepseekv32_tool_parser.py +591 -0
  1340. vllm/tool_parsers/deepseekv3_tool_parser.py +390 -0
  1341. vllm/tool_parsers/ernie45_tool_parser.py +210 -0
  1342. vllm/tool_parsers/gigachat3_tool_parser.py +190 -0
  1343. vllm/tool_parsers/glm4_moe_tool_parser.py +200 -0
  1344. vllm/tool_parsers/granite_20b_fc_tool_parser.py +273 -0
  1345. vllm/tool_parsers/granite_tool_parser.py +253 -0
  1346. vllm/tool_parsers/hermes_tool_parser.py +495 -0
  1347. vllm/tool_parsers/hunyuan_a13b_tool_parser.py +420 -0
  1348. vllm/tool_parsers/internlm2_tool_parser.py +227 -0
  1349. vllm/tool_parsers/jamba_tool_parser.py +323 -0
  1350. vllm/tool_parsers/kimi_k2_tool_parser.py +590 -0
  1351. vllm/tool_parsers/llama4_pythonic_tool_parser.py +341 -0
  1352. vllm/tool_parsers/llama_tool_parser.py +324 -0
  1353. vllm/tool_parsers/longcat_tool_parser.py +37 -0
  1354. vllm/tool_parsers/minimax_m2_tool_parser.py +643 -0
  1355. vllm/tool_parsers/minimax_tool_parser.py +849 -0
  1356. vllm/tool_parsers/mistral_tool_parser.py +585 -0
  1357. vllm/tool_parsers/olmo3_tool_parser.py +366 -0
  1358. vllm/tool_parsers/openai_tool_parser.py +102 -0
  1359. vllm/tool_parsers/phi4mini_tool_parser.py +120 -0
  1360. vllm/tool_parsers/pythonic_tool_parser.py +332 -0
  1361. vllm/tool_parsers/qwen3coder_tool_parser.py +781 -0
  1362. vllm/tool_parsers/qwen3xml_tool_parser.py +1316 -0
  1363. vllm/tool_parsers/seed_oss_tool_parser.py +744 -0
  1364. vllm/tool_parsers/step3_tool_parser.py +303 -0
  1365. vllm/tool_parsers/utils.py +229 -0
  1366. vllm/tool_parsers/xlam_tool_parser.py +556 -0
  1367. vllm/tracing.py +135 -0
  1368. vllm/transformers_utils/__init__.py +26 -0
  1369. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1370. vllm/transformers_utils/chat_templates/registry.py +73 -0
  1371. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1372. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1373. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1374. vllm/transformers_utils/chat_templates/template_deepseek_ocr.jinja +14 -0
  1375. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1376. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1377. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1378. vllm/transformers_utils/config.py +1144 -0
  1379. vllm/transformers_utils/config_parser_base.py +20 -0
  1380. vllm/transformers_utils/configs/__init__.py +102 -0
  1381. vllm/transformers_utils/configs/afmoe.py +87 -0
  1382. vllm/transformers_utils/configs/arctic.py +216 -0
  1383. vllm/transformers_utils/configs/bagel.py +53 -0
  1384. vllm/transformers_utils/configs/chatglm.py +75 -0
  1385. vllm/transformers_utils/configs/deepseek_vl2.py +126 -0
  1386. vllm/transformers_utils/configs/dotsocr.py +71 -0
  1387. vllm/transformers_utils/configs/eagle.py +90 -0
  1388. vllm/transformers_utils/configs/falcon.py +89 -0
  1389. vllm/transformers_utils/configs/flex_olmo.py +82 -0
  1390. vllm/transformers_utils/configs/hunyuan_vl.py +322 -0
  1391. vllm/transformers_utils/configs/jais.py +243 -0
  1392. vllm/transformers_utils/configs/kimi_linear.py +148 -0
  1393. vllm/transformers_utils/configs/kimi_vl.py +38 -0
  1394. vllm/transformers_utils/configs/lfm2_moe.py +163 -0
  1395. vllm/transformers_utils/configs/medusa.py +65 -0
  1396. vllm/transformers_utils/configs/midashenglm.py +103 -0
  1397. vllm/transformers_utils/configs/mistral.py +235 -0
  1398. vllm/transformers_utils/configs/mlp_speculator.py +69 -0
  1399. vllm/transformers_utils/configs/moonvit.py +33 -0
  1400. vllm/transformers_utils/configs/nemotron.py +220 -0
  1401. vllm/transformers_utils/configs/nemotron_h.py +284 -0
  1402. vllm/transformers_utils/configs/olmo3.py +83 -0
  1403. vllm/transformers_utils/configs/ovis.py +182 -0
  1404. vllm/transformers_utils/configs/qwen3_next.py +277 -0
  1405. vllm/transformers_utils/configs/radio.py +89 -0
  1406. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1407. vllm/transformers_utils/configs/speculators/algos.py +38 -0
  1408. vllm/transformers_utils/configs/speculators/base.py +114 -0
  1409. vllm/transformers_utils/configs/step3_vl.py +178 -0
  1410. vllm/transformers_utils/configs/tarsier2.py +24 -0
  1411. vllm/transformers_utils/configs/ultravox.py +120 -0
  1412. vllm/transformers_utils/dynamic_module.py +59 -0
  1413. vllm/transformers_utils/gguf_utils.py +280 -0
  1414. vllm/transformers_utils/processor.py +424 -0
  1415. vllm/transformers_utils/processors/__init__.py +25 -0
  1416. vllm/transformers_utils/processors/bagel.py +73 -0
  1417. vllm/transformers_utils/processors/deepseek_ocr.py +438 -0
  1418. vllm/transformers_utils/processors/deepseek_vl2.py +406 -0
  1419. vllm/transformers_utils/processors/hunyuan_vl.py +233 -0
  1420. vllm/transformers_utils/processors/hunyuan_vl_image.py +477 -0
  1421. vllm/transformers_utils/processors/ovis.py +453 -0
  1422. vllm/transformers_utils/processors/ovis2_5.py +468 -0
  1423. vllm/transformers_utils/repo_utils.py +287 -0
  1424. vllm/transformers_utils/runai_utils.py +102 -0
  1425. vllm/transformers_utils/s3_utils.py +95 -0
  1426. vllm/transformers_utils/tokenizer.py +127 -0
  1427. vllm/transformers_utils/tokenizer_base.py +33 -0
  1428. vllm/transformers_utils/utils.py +112 -0
  1429. vllm/triton_utils/__init__.py +20 -0
  1430. vllm/triton_utils/importing.py +103 -0
  1431. vllm/usage/__init__.py +0 -0
  1432. vllm/usage/usage_lib.py +294 -0
  1433. vllm/utils/__init__.py +66 -0
  1434. vllm/utils/argparse_utils.py +492 -0
  1435. vllm/utils/async_utils.py +310 -0
  1436. vllm/utils/cache.py +214 -0
  1437. vllm/utils/collection_utils.py +112 -0
  1438. vllm/utils/counter.py +45 -0
  1439. vllm/utils/deep_gemm.py +400 -0
  1440. vllm/utils/flashinfer.py +528 -0
  1441. vllm/utils/func_utils.py +236 -0
  1442. vllm/utils/gc_utils.py +151 -0
  1443. vllm/utils/hashing.py +117 -0
  1444. vllm/utils/import_utils.py +449 -0
  1445. vllm/utils/jsontree.py +158 -0
  1446. vllm/utils/math_utils.py +32 -0
  1447. vllm/utils/mem_constants.py +13 -0
  1448. vllm/utils/mem_utils.py +232 -0
  1449. vllm/utils/nccl.py +64 -0
  1450. vllm/utils/network_utils.py +331 -0
  1451. vllm/utils/nvtx_pytorch_hooks.py +286 -0
  1452. vllm/utils/platform_utils.py +59 -0
  1453. vllm/utils/profiling.py +56 -0
  1454. vllm/utils/registry.py +51 -0
  1455. vllm/utils/serial_utils.py +214 -0
  1456. vllm/utils/system_utils.py +269 -0
  1457. vllm/utils/tensor_schema.py +255 -0
  1458. vllm/utils/torch_utils.py +648 -0
  1459. vllm/v1/__init__.py +0 -0
  1460. vllm/v1/attention/__init__.py +0 -0
  1461. vllm/v1/attention/backends/__init__.py +0 -0
  1462. vllm/v1/attention/backends/cpu_attn.py +497 -0
  1463. vllm/v1/attention/backends/flash_attn.py +1051 -0
  1464. vllm/v1/attention/backends/flashinfer.py +1575 -0
  1465. vllm/v1/attention/backends/flex_attention.py +1028 -0
  1466. vllm/v1/attention/backends/gdn_attn.py +375 -0
  1467. vllm/v1/attention/backends/linear_attn.py +77 -0
  1468. vllm/v1/attention/backends/mamba1_attn.py +159 -0
  1469. vllm/v1/attention/backends/mamba2_attn.py +348 -0
  1470. vllm/v1/attention/backends/mamba_attn.py +117 -0
  1471. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1472. vllm/v1/attention/backends/mla/aiter_triton_mla.py +74 -0
  1473. vllm/v1/attention/backends/mla/common.py +2114 -0
  1474. vllm/v1/attention/backends/mla/cutlass_mla.py +278 -0
  1475. vllm/v1/attention/backends/mla/flashattn_mla.py +342 -0
  1476. vllm/v1/attention/backends/mla/flashinfer_mla.py +174 -0
  1477. vllm/v1/attention/backends/mla/flashmla.py +317 -0
  1478. vllm/v1/attention/backends/mla/flashmla_sparse.py +1020 -0
  1479. vllm/v1/attention/backends/mla/indexer.py +345 -0
  1480. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +275 -0
  1481. vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py +325 -0
  1482. vllm/v1/attention/backends/mla/triton_mla.py +171 -0
  1483. vllm/v1/attention/backends/pallas.py +436 -0
  1484. vllm/v1/attention/backends/rocm_aiter_fa.py +1000 -0
  1485. vllm/v1/attention/backends/rocm_aiter_unified_attn.py +206 -0
  1486. vllm/v1/attention/backends/rocm_attn.py +359 -0
  1487. vllm/v1/attention/backends/short_conv_attn.py +104 -0
  1488. vllm/v1/attention/backends/tree_attn.py +428 -0
  1489. vllm/v1/attention/backends/triton_attn.py +497 -0
  1490. vllm/v1/attention/backends/utils.py +1212 -0
  1491. vllm/v1/core/__init__.py +0 -0
  1492. vllm/v1/core/block_pool.py +485 -0
  1493. vllm/v1/core/encoder_cache_manager.py +402 -0
  1494. vllm/v1/core/kv_cache_coordinator.py +570 -0
  1495. vllm/v1/core/kv_cache_manager.py +419 -0
  1496. vllm/v1/core/kv_cache_metrics.py +96 -0
  1497. vllm/v1/core/kv_cache_utils.py +1476 -0
  1498. vllm/v1/core/sched/__init__.py +0 -0
  1499. vllm/v1/core/sched/async_scheduler.py +68 -0
  1500. vllm/v1/core/sched/interface.py +189 -0
  1501. vllm/v1/core/sched/output.py +230 -0
  1502. vllm/v1/core/sched/request_queue.py +217 -0
  1503. vllm/v1/core/sched/scheduler.py +1826 -0
  1504. vllm/v1/core/sched/utils.py +64 -0
  1505. vllm/v1/core/single_type_kv_cache_manager.py +801 -0
  1506. vllm/v1/cudagraph_dispatcher.py +183 -0
  1507. vllm/v1/engine/__init__.py +217 -0
  1508. vllm/v1/engine/async_llm.py +866 -0
  1509. vllm/v1/engine/coordinator.py +377 -0
  1510. vllm/v1/engine/core.py +1455 -0
  1511. vllm/v1/engine/core_client.py +1416 -0
  1512. vllm/v1/engine/detokenizer.py +351 -0
  1513. vllm/v1/engine/exceptions.py +18 -0
  1514. vllm/v1/engine/input_processor.py +643 -0
  1515. vllm/v1/engine/llm_engine.py +414 -0
  1516. vllm/v1/engine/logprobs.py +189 -0
  1517. vllm/v1/engine/output_processor.py +659 -0
  1518. vllm/v1/engine/parallel_sampling.py +145 -0
  1519. vllm/v1/engine/processor.py +20 -0
  1520. vllm/v1/engine/utils.py +1068 -0
  1521. vllm/v1/executor/__init__.py +6 -0
  1522. vllm/v1/executor/abstract.py +352 -0
  1523. vllm/v1/executor/multiproc_executor.py +890 -0
  1524. vllm/v1/executor/ray_distributed_executor.py +8 -0
  1525. vllm/v1/executor/ray_executor.py +626 -0
  1526. vllm/v1/executor/ray_utils.py +465 -0
  1527. vllm/v1/executor/uniproc_executor.py +186 -0
  1528. vllm/v1/kv_cache_interface.py +404 -0
  1529. vllm/v1/kv_offload/__init__.py +0 -0
  1530. vllm/v1/kv_offload/abstract.py +161 -0
  1531. vllm/v1/kv_offload/arc_manager.py +237 -0
  1532. vllm/v1/kv_offload/backend.py +97 -0
  1533. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1534. vllm/v1/kv_offload/backends/cpu.py +62 -0
  1535. vllm/v1/kv_offload/cpu.py +86 -0
  1536. vllm/v1/kv_offload/factory.py +56 -0
  1537. vllm/v1/kv_offload/lru_manager.py +139 -0
  1538. vllm/v1/kv_offload/mediums.py +39 -0
  1539. vllm/v1/kv_offload/spec.py +66 -0
  1540. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1541. vllm/v1/kv_offload/worker/cpu_gpu.py +280 -0
  1542. vllm/v1/kv_offload/worker/worker.py +144 -0
  1543. vllm/v1/metrics/__init__.py +0 -0
  1544. vllm/v1/metrics/loggers.py +1305 -0
  1545. vllm/v1/metrics/prometheus.py +82 -0
  1546. vllm/v1/metrics/ray_wrappers.py +194 -0
  1547. vllm/v1/metrics/reader.py +257 -0
  1548. vllm/v1/metrics/stats.py +437 -0
  1549. vllm/v1/outputs.py +245 -0
  1550. vllm/v1/pool/__init__.py +0 -0
  1551. vllm/v1/pool/metadata.py +126 -0
  1552. vllm/v1/request.py +282 -0
  1553. vllm/v1/sample/__init__.py +0 -0
  1554. vllm/v1/sample/logits_processor/__init__.py +352 -0
  1555. vllm/v1/sample/logits_processor/builtin.py +278 -0
  1556. vllm/v1/sample/logits_processor/interface.py +106 -0
  1557. vllm/v1/sample/logits_processor/state.py +165 -0
  1558. vllm/v1/sample/metadata.py +44 -0
  1559. vllm/v1/sample/ops/__init__.py +0 -0
  1560. vllm/v1/sample/ops/bad_words.py +52 -0
  1561. vllm/v1/sample/ops/logprobs.py +25 -0
  1562. vllm/v1/sample/ops/penalties.py +57 -0
  1563. vllm/v1/sample/ops/topk_topp_sampler.py +384 -0
  1564. vllm/v1/sample/rejection_sampler.py +805 -0
  1565. vllm/v1/sample/sampler.py +319 -0
  1566. vllm/v1/sample/tpu/__init__.py +0 -0
  1567. vllm/v1/sample/tpu/metadata.py +120 -0
  1568. vllm/v1/sample/tpu/sampler.py +215 -0
  1569. vllm/v1/serial_utils.py +514 -0
  1570. vllm/v1/spec_decode/__init__.py +0 -0
  1571. vllm/v1/spec_decode/eagle.py +1331 -0
  1572. vllm/v1/spec_decode/medusa.py +73 -0
  1573. vllm/v1/spec_decode/metadata.py +66 -0
  1574. vllm/v1/spec_decode/metrics.py +225 -0
  1575. vllm/v1/spec_decode/ngram_proposer.py +291 -0
  1576. vllm/v1/spec_decode/suffix_decoding.py +101 -0
  1577. vllm/v1/spec_decode/utils.py +121 -0
  1578. vllm/v1/structured_output/__init__.py +353 -0
  1579. vllm/v1/structured_output/backend_guidance.py +265 -0
  1580. vllm/v1/structured_output/backend_lm_format_enforcer.py +177 -0
  1581. vllm/v1/structured_output/backend_outlines.py +324 -0
  1582. vllm/v1/structured_output/backend_types.py +136 -0
  1583. vllm/v1/structured_output/backend_xgrammar.py +378 -0
  1584. vllm/v1/structured_output/request.py +94 -0
  1585. vllm/v1/structured_output/utils.py +469 -0
  1586. vllm/v1/utils.py +414 -0
  1587. vllm/v1/worker/__init__.py +0 -0
  1588. vllm/v1/worker/block_table.py +343 -0
  1589. vllm/v1/worker/cp_utils.py +42 -0
  1590. vllm/v1/worker/cpu_model_runner.py +122 -0
  1591. vllm/v1/worker/cpu_worker.py +192 -0
  1592. vllm/v1/worker/dp_utils.py +240 -0
  1593. vllm/v1/worker/ec_connector_model_runner_mixin.py +87 -0
  1594. vllm/v1/worker/gpu/README.md +4 -0
  1595. vllm/v1/worker/gpu/__init__.py +0 -0
  1596. vllm/v1/worker/gpu/async_utils.py +98 -0
  1597. vllm/v1/worker/gpu/attn_utils.py +189 -0
  1598. vllm/v1/worker/gpu/block_table.py +314 -0
  1599. vllm/v1/worker/gpu/cudagraph_utils.py +259 -0
  1600. vllm/v1/worker/gpu/dp_utils.py +31 -0
  1601. vllm/v1/worker/gpu/input_batch.py +479 -0
  1602. vllm/v1/worker/gpu/metrics/__init__.py +0 -0
  1603. vllm/v1/worker/gpu/metrics/logits.py +42 -0
  1604. vllm/v1/worker/gpu/model_runner.py +1006 -0
  1605. vllm/v1/worker/gpu/sample/__init__.py +0 -0
  1606. vllm/v1/worker/gpu/sample/gumbel.py +101 -0
  1607. vllm/v1/worker/gpu/sample/logprob.py +167 -0
  1608. vllm/v1/worker/gpu/sample/metadata.py +192 -0
  1609. vllm/v1/worker/gpu/sample/min_p.py +51 -0
  1610. vllm/v1/worker/gpu/sample/output.py +14 -0
  1611. vllm/v1/worker/gpu/sample/penalties.py +155 -0
  1612. vllm/v1/worker/gpu/sample/sampler.py +87 -0
  1613. vllm/v1/worker/gpu/spec_decode/__init__.py +18 -0
  1614. vllm/v1/worker/gpu/spec_decode/eagle.py +565 -0
  1615. vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py +115 -0
  1616. vllm/v1/worker/gpu/spec_decode/rejection_sample.py +71 -0
  1617. vllm/v1/worker/gpu/states.py +316 -0
  1618. vllm/v1/worker/gpu/structured_outputs.py +76 -0
  1619. vllm/v1/worker/gpu_input_batch.py +990 -0
  1620. vllm/v1/worker/gpu_model_runner.py +5470 -0
  1621. vllm/v1/worker/gpu_ubatch_wrapper.py +472 -0
  1622. vllm/v1/worker/gpu_worker.py +955 -0
  1623. vllm/v1/worker/kv_connector_model_runner_mixin.py +302 -0
  1624. vllm/v1/worker/lora_model_runner_mixin.py +212 -0
  1625. vllm/v1/worker/tpu_input_batch.py +583 -0
  1626. vllm/v1/worker/tpu_model_runner.py +2191 -0
  1627. vllm/v1/worker/tpu_worker.py +352 -0
  1628. vllm/v1/worker/ubatch_utils.py +109 -0
  1629. vllm/v1/worker/ubatching.py +231 -0
  1630. vllm/v1/worker/utils.py +375 -0
  1631. vllm/v1/worker/worker_base.py +377 -0
  1632. vllm/v1/worker/workspace.py +253 -0
  1633. vllm/v1/worker/xpu_model_runner.py +48 -0
  1634. vllm/v1/worker/xpu_worker.py +174 -0
  1635. vllm/version.py +39 -0
  1636. vllm/vllm_flash_attn/.gitkeep +0 -0
  1637. vllm_cpu_avx512vnni-0.13.0.dist-info/METADATA +339 -0
  1638. vllm_cpu_avx512vnni-0.13.0.dist-info/RECORD +1641 -0
  1639. vllm_cpu_avx512vnni-0.13.0.dist-info/WHEEL +5 -0
  1640. vllm_cpu_avx512vnni-0.13.0.dist-info/entry_points.txt +5 -0
  1641. vllm_cpu_avx512vnni-0.13.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2496 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+ # Adapted from
+ # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
+ import json
+ import time
+ from http import HTTPStatus
+ from typing import Annotated, Any, ClassVar, Literal, TypeAlias
+
+ import regex as re
+ import torch
+ from fastapi import HTTPException, UploadFile
+ from openai.types.chat.chat_completion_audio import (
+     ChatCompletionAudio as OpenAIChatCompletionAudio,
+ )
+ from openai.types.chat.chat_completion_message import Annotation as OpenAIAnnotation
+ from openai.types.responses import (
+     ResponseCodeInterpreterCallCodeDeltaEvent,
+     ResponseCodeInterpreterCallCodeDoneEvent,
+     ResponseCodeInterpreterCallCompletedEvent,
+     ResponseCodeInterpreterCallInProgressEvent,
+     ResponseCodeInterpreterCallInterpretingEvent,
+     ResponseContentPartAddedEvent,
+     ResponseContentPartDoneEvent,
+     ResponseFunctionToolCall,
+     ResponseInputItemParam,
+     ResponseMcpCallArgumentsDeltaEvent,
+     ResponseMcpCallArgumentsDoneEvent,
+     ResponseMcpCallCompletedEvent,
+     ResponseMcpCallInProgressEvent,
+     ResponseOutputItem,
+     ResponseOutputItemAddedEvent,
+     ResponseOutputItemDoneEvent,
+     ResponsePrompt,
+     ResponseReasoningTextDeltaEvent,
+     ResponseReasoningTextDoneEvent,
+     ResponseStatus,
+     ResponseWebSearchCallCompletedEvent,
+     ResponseWebSearchCallInProgressEvent,
+     ResponseWebSearchCallSearchingEvent,
+ )
+ from openai.types.responses import (
+     ResponseCompletedEvent as OpenAIResponseCompletedEvent,
+ )
+ from openai.types.responses import ResponseCreatedEvent as OpenAIResponseCreatedEvent
+ from openai.types.responses import (
+     ResponseInProgressEvent as OpenAIResponseInProgressEvent,
+ )
+ from openai.types.responses.response_reasoning_item import (
+     Content as ResponseReasoningTextContent,
+ )
+ from openai_harmony import Message as OpenAIHarmonyMessage
+
+ # Backward compatibility for OpenAI client versions
+ try:  # For older openai versions (< 1.100.0)
+     from openai.types.responses import ResponseTextConfig
+ except ImportError:  # For newer openai versions (>= 1.100.0)
+     from openai.types.responses import ResponseFormatTextConfig as ResponseTextConfig
+
+
+ from openai.types.responses.response import IncompleteDetails, ToolChoice
+ from openai.types.responses.tool import Tool
+ from openai.types.shared import Metadata, Reasoning
+ from pydantic import (
+     BaseModel,
+     ConfigDict,
+     Field,
+     ValidationError,
+     field_serializer,
+     model_validator,
+ )
+
+ from vllm.entrypoints.chat_utils import ChatCompletionMessageParam, make_tool_call_id
+ from vllm.logger import init_logger
+ from vllm.logprobs import Logprob
+ from vllm.sampling_params import (
+     BeamSearchParams,
+     RequestOutputKind,
+     SamplingParams,
+     StructuredOutputsParams,
+ )
+ from vllm.utils import random_uuid
+ from vllm.utils.import_utils import resolve_obj_by_qualname
+
+ logger = init_logger(__name__)
+
+ _LONG_INFO = torch.iinfo(torch.long)
+
+
+ class OpenAIBaseModel(BaseModel):
+     # OpenAI API does allow extra fields
+     model_config = ConfigDict(extra="allow")
+
+     # Cache class field names
+     field_names: ClassVar[set[str] | None] = None
+
+     @model_validator(mode="wrap")
+     @classmethod
+     def __log_extra_fields__(cls, data, handler):
+         result = handler(data)
+         if not isinstance(data, dict):
+             return result
+         field_names = cls.field_names
+         if field_names is None:
+             # Get all class field names and their potential aliases
+             field_names = set()
+             for field_name, field in cls.model_fields.items():
+                 field_names.add(field_name)
+                 if alias := getattr(field, "alias", None):
+                     field_names.add(alias)
+             cls.field_names = field_names
+
+         # Compare against both field names and aliases
+         if any(k not in field_names for k in data):
+             logger.warning(
+                 "The following fields were present in the request but ignored: %s",
+                 data.keys() - field_names,
+             )
+         return result
+
+
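Note: a minimal sketch (not part of the diff) of how the wrap-mode validator above behaves; `DemoRequest` is a hypothetical subclass for illustration only.

    class DemoRequest(OpenAIBaseModel):  # hypothetical, for illustration
        name: str

    # Accepted because of extra="allow", but the validator logs a warning:
    # "The following fields were present in the request but ignored: {'typo_field'}"
    req = DemoRequest(name="x", typo_field=1)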
+ class ErrorInfo(OpenAIBaseModel):
+     message: str
+     type: str
+     param: str | None = None
+     code: int
+
+
+ class ErrorResponse(OpenAIBaseModel):
+     error: ErrorInfo
+
+
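Note: these two models serialize to the familiar OpenAI error envelope; the values below are illustrative only.

    err = ErrorResponse(
        error=ErrorInfo(message="model `foo` was not found", type="NotFoundError", code=404)
    )
    # err.model_dump() ->
    # {"error": {"message": "model `foo` was not found",
    #            "type": "NotFoundError", "param": None, "code": 404}}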
+ class ModelPermission(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
+     object: str = "model_permission"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     allow_create_engine: bool = False
+     allow_sampling: bool = True
+     allow_logprobs: bool = True
+     allow_search_indices: bool = False
+     allow_view: bool = True
+     allow_fine_tuning: bool = False
+     organization: str = "*"
+     group: str | None = None
+     is_blocking: bool = False
+
+
+ class ModelCard(OpenAIBaseModel):
+     id: str
+     object: str = "model"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     owned_by: str = "vllm"
+     root: str | None = None
+     parent: str | None = None
+     max_model_len: int | None = None
+     permission: list[ModelPermission] = Field(default_factory=list)
+
+
+ class ModelList(OpenAIBaseModel):
+     object: str = "list"
+     data: list[ModelCard] = Field(default_factory=list)
+
+
+ class PromptTokenUsageInfo(OpenAIBaseModel):
+     cached_tokens: int | None = None
+
+
+ class UsageInfo(OpenAIBaseModel):
+     prompt_tokens: int = 0
+     total_tokens: int = 0
+     completion_tokens: int | None = 0
+     prompt_tokens_details: PromptTokenUsageInfo | None = None
+
+
+ class RequestResponseMetadata(BaseModel):
+     request_id: str
+     final_usage_info: UsageInfo | None = None
+
+
+ class JsonSchemaResponseFormat(OpenAIBaseModel):
+     name: str
+     description: str | None = None
+     # schema is the field in openai but that causes conflicts with pydantic so
+     # instead use json_schema with an alias
+     json_schema: dict[str, Any] | None = Field(default=None, alias="schema")
+     strict: bool | None = None
+
+
+ class LegacyStructuralTag(OpenAIBaseModel):
+     begin: str
+     # schema is the field, but that causes conflicts with pydantic so
+     # instead use structural_tag_schema with an alias
+     structural_tag_schema: dict[str, Any] | None = Field(default=None, alias="schema")
+     end: str
+
+
+ class LegacyStructuralTagResponseFormat(OpenAIBaseModel):
+     type: Literal["structural_tag"]
+     structures: list[LegacyStructuralTag]
+     triggers: list[str]
+
+
+ class StructuralTagResponseFormat(OpenAIBaseModel):
+     type: Literal["structural_tag"]
+     format: Any
+
+
+ AnyStructuralTagResponseFormat: TypeAlias = (
+     LegacyStructuralTagResponseFormat | StructuralTagResponseFormat
+ )
+
+
+ class ResponseFormat(OpenAIBaseModel):
+     # type must be "json_schema", "json_object", or "text"
+     type: Literal["text", "json_object", "json_schema"]
+     json_schema: JsonSchemaResponseFormat | None = None
+
+
+ AnyResponseFormat: TypeAlias = (
+     ResponseFormat | StructuralTagResponseFormat | LegacyStructuralTagResponseFormat
+ )
+
+
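Note: a short sketch of the alias handling above, assuming a plain dict payload; pydantic maps the OpenAI wire name "schema" onto the `json_schema` field during validation.

    payload = {
        "type": "json_schema",
        "json_schema": {"name": "person", "schema": {"type": "object"}},
    }
    rf = ResponseFormat.model_validate(payload)
    assert rf.json_schema.json_schema == {"type": "object"}  # "schema" resolved via alias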
+ class StreamOptions(OpenAIBaseModel):
+     include_usage: bool | None = True
+     continuous_usage_stats: bool | None = False
+
+
+ class FunctionDefinition(OpenAIBaseModel):
+     name: str
+     description: str | None = None
+     parameters: dict[str, Any] | None = None
+
+
+ class ChatCompletionToolsParam(OpenAIBaseModel):
+     type: Literal["function"] = "function"
+     function: FunctionDefinition
+
+
+ class ChatCompletionNamedFunction(OpenAIBaseModel):
+     name: str
+
+
+ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
+     function: ChatCompletionNamedFunction
+     type: Literal["function"] = "function"
+
+
+ # extra="forbid" is a workaround to have kwargs as a field,
+ # see https://github.com/pydantic/pydantic/issues/3125
+ class LogitsProcessorConstructor(BaseModel):
+     qualname: str
+     args: list[Any] | None = None
+     kwargs: dict[str, Any] | None = None
+
+     model_config = ConfigDict(extra="forbid")
+
+
+ LogitsProcessors = list[str | LogitsProcessorConstructor]
+
+
+ def get_logits_processors(
+     processors: LogitsProcessors | None, pattern: str | None
+ ) -> list[Any] | None:
+     if processors and pattern:
+         logits_processors = []
+         for processor in processors:
+             qualname = processor if isinstance(processor, str) else processor.qualname
+             if not re.match(pattern, qualname):
+                 raise ValueError(
+                     f"Logits processor '{qualname}' is not allowed by this "
+                     "server. See --logits-processor-pattern engine argument "
+                     "for more information."
+                 )
+             try:
+                 logits_processor = resolve_obj_by_qualname(qualname)
+             except Exception as e:
+                 raise ValueError(
+                     f"Logits processor '{qualname}' could not be resolved: {e}"
+                 ) from e
+             if isinstance(processor, LogitsProcessorConstructor):
+                 logits_processor = logits_processor(
+                     *processor.args or [], **processor.kwargs or {}
+                 )
+             logits_processors.append(logits_processor)
+         return logits_processors
+     elif processors:
+         raise ValueError(
+             "The `logits_processors` argument is not supported by this "
+             "server. See --logits-processor-pattern engine argument "
+             "for more information."
+         )
+     return None
+
+
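Note: a usage sketch for the allowlist logic above; the `my_module.*` names are hypothetical stand-ins for user-supplied processors and must be importable for resolution to succeed.

    procs = get_logits_processors(
        processors=[
            "my_module.NoRepeatProcessor",  # plain qualified name
            LogitsProcessorConstructor(
                qualname="my_module.MyLogitsProcessor",
                args=[1, 2],
                kwargs={"param": "value"},
            ),
        ],
        pattern=r"^my_module\.",
    )
    # A name outside the pattern, an unresolvable name, or any processors
    # when no pattern is configured all raise ValueError.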
+ ResponseInputOutputItem: TypeAlias = ResponseInputItemParam | ResponseOutputItem
+
+
+ class ResponsesRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/responses/create
+     background: bool | None = False
+     include: (
+         list[
+             Literal[
+                 "code_interpreter_call.outputs",
+                 "computer_call_output.output.image_url",
+                 "file_search_call.results",
+                 "message.input_image.image_url",
+                 "message.output_text.logprobs",
+                 "reasoning.encrypted_content",
+             ],
+         ]
+         | None
+     ) = None
+     input: str | list[ResponseInputOutputItem]
+     instructions: str | None = None
+     max_output_tokens: int | None = None
+     max_tool_calls: int | None = None
+     metadata: Metadata | None = None
+     model: str | None = None
+     logit_bias: dict[str, float] | None = None
+     parallel_tool_calls: bool | None = True
+     previous_response_id: str | None = None
+     prompt: ResponsePrompt | None = None
+     reasoning: Reasoning | None = None
+     service_tier: Literal["auto", "default", "flex", "scale", "priority"] = "auto"
+     store: bool | None = True
+     stream: bool | None = False
+     temperature: float | None = None
+     text: ResponseTextConfig | None = None
+     tool_choice: ToolChoice = "auto"
+     tools: list[Tool] = Field(default_factory=list)
+     top_logprobs: int | None = 0
+     top_p: float | None = None
+     top_k: int | None = None
+     truncation: Literal["auto", "disabled"] | None = "disabled"
+     user: str | None = None
+
+     # --8<-- [start:responses-extra-params]
+     request_id: str = Field(
+         default_factory=lambda: f"resp_{random_uuid()}",
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the response."
+         ),
+     )
+     mm_processor_kwargs: dict[str, Any] | None = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."
+         ),
+     )
+     cache_salt: str | None = Field(
+         default=None,
+         description=(
+             "If specified, the prefix cache will be salted with the provided "
+             "string to prevent an attacker from guessing prompts in multi-user "
+             "environments. The salt should be random, protected from "
+             "access by 3rd parties, and long enough to be "
+             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+             "to 256 bit)."
+         ),
+     )
+
+     enable_response_messages: bool = Field(
+         default=False,
+         description=(
+             "Dictates whether or not to return messages as part of the "
+             "response object. Currently supported only for "
+             "non-background requests and gpt-oss."
+         ),
+     )
+     # similar to input_messages / output_messages in ResponsesResponse
+     # we take in previous_input_messages (i.e., in harmony format)
+     # this cannot be used in conjunction with previous_response_id
+     # TODO: consider supporting non-harmony messages as well
+     previous_input_messages: list[OpenAIHarmonyMessage | dict] | None = None
+     # --8<-- [end:responses-extra-params]
+
+     _DEFAULT_SAMPLING_PARAMS = {
+         "temperature": 1.0,
+         "top_p": 1.0,
+         "top_k": 0,
+     }
+
+     def to_sampling_params(
+         self,
+         default_max_tokens: int,
+         default_sampling_params: dict | None = None,
+     ) -> SamplingParams:
+         if self.max_output_tokens is None:
+             max_tokens = default_max_tokens
+         else:
+             max_tokens = min(self.max_output_tokens, default_max_tokens)
+
+         default_sampling_params = default_sampling_params or {}
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+             )
+         if (top_p := self.top_p) is None:
+             top_p = default_sampling_params.get(
+                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
+             )
+         if (top_k := self.top_k) is None:
+             top_k = default_sampling_params.get(
+                 "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
+             )
+         stop_token_ids = default_sampling_params.get("stop_token_ids")
+
+         # Structured output
+         structured_outputs = None
+         if self.text is not None and self.text.format is not None:
+             response_format = self.text.format
+             if (
+                 response_format.type == "json_schema"
+                 and response_format.schema_ is not None
+             ):
+                 structured_outputs = StructuredOutputsParams(
+                     json=response_format.schema_
+                 )
+             elif response_format.type == "json_object":
+                 raise NotImplementedError("json_object is not supported")
+
+         # TODO: add more parameters
+         return SamplingParams.from_optional(
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             max_tokens=max_tokens,
+             logprobs=self.top_logprobs if self.is_include_output_logprobs() else None,
+             stop_token_ids=stop_token_ids,
+             output_kind=(
+                 RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY
+             ),
+             structured_outputs=structured_outputs,
+             logit_bias=self.logit_bias,
+         )
+
+     def is_include_output_logprobs(self) -> bool:
+         """Check if the request includes output logprobs."""
+         if self.include is None:
+             return False
+         return (
+             isinstance(self.include, list)
+             and "message.output_text.logprobs" in self.include
+         )
+
+     @model_validator(mode="before")
+     def validate_background(cls, data):
+         if not data.get("background"):
+             return data
+         if not data.get("store", True):
+             raise ValueError("background can only be used when `store` is true")
+         return data
+
+     @model_validator(mode="before")
+     def validate_prompt(cls, data):
+         if data.get("prompt") is not None:
+             raise ValueError("prompt template is not supported")
+         return data
+
+     @model_validator(mode="before")
+     def check_cache_salt_support(cls, data):
+         if data.get("cache_salt") is not None and (
+             not isinstance(data["cache_salt"], str) or not data["cache_salt"]
+         ):
+             raise ValueError(
+                 "Parameter 'cache_salt' must be a non-empty string if provided."
+             )
+         return data
+
+     @model_validator(mode="before")
+     def function_call_parsing(cls, data):
+         """Parse function_call dictionaries into ResponseFunctionToolCall objects.
+         This ensures Pydantic can properly resolve union types in the input field.
+         Function calls provided as dicts are converted to ResponseFunctionToolCall
+         objects before validation, while invalid structures are left for Pydantic
+         to reject with appropriate error messages.
+         """
+
+         input_data = data.get("input")
+
+         # Early return for None, strings, or bytes
+         # (strings are iterable but shouldn't be processed)
+         if input_data is None or isinstance(input_data, (str, bytes)):
+             return data
+
+         # Convert iterators (like ValidatorIterator) to list
+         if not isinstance(input_data, list):
+             try:
+                 input_data = list(input_data)
+             except TypeError:
+                 # Not iterable, leave as-is for Pydantic to handle
+                 return data
+
+         processed_input = []
+         for item in input_data:
+             if isinstance(item, dict) and item.get("type") == "function_call":
+                 try:
+                     processed_input.append(ResponseFunctionToolCall(**item))
+                 except ValidationError:
+                     # Let Pydantic handle validation for malformed function calls
+                     logger.debug(
+                         "Failed to parse function_call to ResponseFunctionToolCall, "
+                         "leaving for Pydantic validation"
+                     )
+                     processed_input.append(item)
+             else:
+                 processed_input.append(item)
+
+         data["input"] = processed_input
+         return data
+
+
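Note: a sketch of the fallback chain in `to_sampling_params` above, assuming a server-side default dict: the request value wins, then the server's default_sampling_params, then the class-level _DEFAULT_SAMPLING_PARAMS.

    req = ResponsesRequest(input="hello", max_output_tokens=64)
    params = req.to_sampling_params(
        default_max_tokens=128,
        default_sampling_params={"temperature": 0.2},
    )
    # temperature=0.2 (server default), top_p=1.0 (class fallback),
    # max_tokens=min(64, 128)=64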
+ class ChatCompletionRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/chat/create
+     messages: list[ChatCompletionMessageParam]
+     model: str | None = None
+     frequency_penalty: float | None = 0.0
+     logit_bias: dict[str, float] | None = None
+     logprobs: bool | None = False
+     top_logprobs: int | None = 0
+     max_tokens: int | None = Field(
+         default=None,
+         deprecated="max_tokens is deprecated in favor of "
+         "the max_completion_tokens field",
+     )
+     max_completion_tokens: int | None = None
+     n: int | None = 1
+     presence_penalty: float | None = 0.0
+     response_format: AnyResponseFormat | None = None
+     seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+     stop: str | list[str] | None = []
+     stream: bool | None = False
+     stream_options: StreamOptions | None = None
+     temperature: float | None = None
+     top_p: float | None = None
+     tools: list[ChatCompletionToolsParam] | None = None
+     tool_choice: (
+         Literal["none"]
+         | Literal["auto"]
+         | Literal["required"]
+         | ChatCompletionNamedToolChoiceParam
+         | None
+     ) = "none"
+     reasoning_effort: Literal["low", "medium", "high"] | None = None
+     include_reasoning: bool = True
+     parallel_tool_calls: bool | None = True
+
+     # NOTE this will be ignored by vLLM
+     user: str | None = None
+
+     # --8<-- [start:chat-completion-sampling-params]
+     use_beam_search: bool = False
+     top_k: int | None = None
+     min_p: float | None = None
+     repetition_penalty: float | None = None
+     length_penalty: float = 1.0
+     stop_token_ids: list[int] | None = []
+     include_stop_str_in_output: bool = False
+     ignore_eos: bool = False
+     min_tokens: int = 0
+     skip_special_tokens: bool = True
+     spaces_between_special_tokens: bool = True
+     truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
+     prompt_logprobs: int | None = None
+     allowed_token_ids: list[int] | None = None
+     bad_words: list[str] = Field(default_factory=list)
+     # --8<-- [end:chat-completion-sampling-params]
+
+     # --8<-- [start:chat-completion-extra-params]
+     echo: bool = Field(
+         default=False,
+         description=(
+             "If true, the new message will be prepended with the last message "
+             "if they belong to the same role."
+         ),
+     )
+     add_generation_prompt: bool = Field(
+         default=True,
+         description=(
+             "If true, the generation prompt will be added to the chat template. "
+             "This is a parameter used by chat template in tokenizer config of the "
+             "model."
+         ),
+     )
+     continue_final_message: bool = Field(
+         default=False,
+         description=(
+             "If this is set, the chat will be formatted so that the final "
+             "message in the chat is open-ended, without any EOS tokens. The "
+             "model will continue this message rather than starting a new one. "
+             'This allows you to "prefill" part of the model\'s response for it. '
+             "Cannot be used at the same time as `add_generation_prompt`."
+         ),
+     )
+     add_special_tokens: bool = Field(
+         default=False,
+         description=(
+             "If true, special tokens (e.g. BOS) will be added to the prompt "
+             "on top of what is added by the chat template. "
+             "For most models, the chat template takes care of adding the "
+             "special tokens so this should be set to false (as is the "
+             "default)."
+         ),
+     )
+     documents: list[dict[str, str]] | None = Field(
+         default=None,
+         description=(
+             "A list of dicts representing documents that will be accessible to "
+             "the model if it is performing RAG (retrieval-augmented generation)."
+             " If the template does not support RAG, this argument will have no "
+             "effect. We recommend that each document should be a dict containing "
+             '"title" and "text" keys.'
+         ),
+     )
+     chat_template: str | None = Field(
+         default=None,
+         description=(
+             "A Jinja template to use for this conversion. "
+             "As of transformers v4.44, default chat template is no longer "
+             "allowed, so you must provide a chat template if the tokenizer "
+             "does not define one."
+         ),
+     )
+     chat_template_kwargs: dict[str, Any] | None = Field(
+         default=None,
+         description=(
+             "Additional keyword args to pass to the template renderer. "
+             "Will be accessible by the chat template."
+         ),
+     )
+     mm_processor_kwargs: dict[str, Any] | None = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     structured_outputs: StructuredOutputsParams | None = Field(
+         default=None,
+         description="Additional kwargs for structured outputs",
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."
+         ),
+     )
+     request_id: str = Field(
+         default_factory=random_uuid,
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the response."
+         ),
+     )
+     logits_processors: LogitsProcessors | None = Field(
+         default=None,
+         description=(
+             "A list of either qualified names of logits processors, or "
+             "constructor objects, to apply when sampling. A constructor is "
+             "a JSON object with a required 'qualname' field specifying the "
+             "qualified name of the processor class/factory, and optional "
+             "'args' and 'kwargs' fields containing positional and keyword "
+             "arguments. For example: {'qualname': "
+             "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
+             "{'param': 'value'}}."
+         ),
+     )
+     return_tokens_as_token_ids: bool | None = Field(
+         default=None,
+         description=(
+             "If specified with 'logprobs', tokens are represented "
+             "as strings of the form 'token_id:{token_id}' so that tokens "
+             "that are not JSON-encodable can be identified."
+         ),
+     )
+     return_token_ids: bool | None = Field(
+         default=None,
+         description=(
+             "If specified, the result will include token IDs alongside the "
+             "generated text. In streaming mode, prompt_token_ids is included "
+             "only in the first chunk, and token_ids contains the delta tokens "
+             "for each chunk. This is useful for debugging or when you "
+             "need to map generated text back to input tokens."
+         ),
+     )
+     cache_salt: str | None = Field(
+         default=None,
+         description=(
+             "If specified, the prefix cache will be salted with the provided "
+             "string to prevent an attacker from guessing prompts in multi-user "
+             "environments. The salt should be random, protected from "
+             "access by 3rd parties, and long enough to be "
+             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+             "to 256 bit)."
+         ),
+     )
+     kv_transfer_params: dict[str, Any] | None = Field(
+         default=None,
+         description="KVTransfer parameters used for disaggregated serving.",
+     )
+
+     vllm_xargs: dict[str, str | int | float | list[str | int | float]] | None = Field(
+         default=None,
+         description=(
+             "Additional request parameters with (list of) string or "
+             "numeric values, used by custom extensions."
+         ),
+     )
+
+     # --8<-- [end:chat-completion-extra-params]
+
+     # Default sampling parameters for chat completion requests
+     _DEFAULT_SAMPLING_PARAMS: dict = {
+         "repetition_penalty": 1.0,
+         "temperature": 1.0,
+         "top_p": 1.0,
+         "top_k": 0,
+         "min_p": 0.0,
+     }
+
+     def to_beam_search_params(
+         self, max_tokens: int, default_sampling_params: dict
+     ) -> BeamSearchParams:
+         n = self.n if self.n is not None else 1
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+             )
+
+         return BeamSearchParams(
+             beam_width=n,
+             max_tokens=max_tokens,
+             ignore_eos=self.ignore_eos,
+             temperature=temperature,
+             length_penalty=self.length_penalty,
+             include_stop_str_in_output=self.include_stop_str_in_output,
+         )
+
+     def to_sampling_params(
+         self,
+         max_tokens: int,
+         logits_processor_pattern: str | None,
+         default_sampling_params: dict,
+     ) -> SamplingParams:
+         # Default parameters
+         if (repetition_penalty := self.repetition_penalty) is None:
+             repetition_penalty = default_sampling_params.get(
+                 "repetition_penalty",
+                 self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+             )
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+             )
+         if (top_p := self.top_p) is None:
+             top_p = default_sampling_params.get(
+                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
+             )
+         if (top_k := self.top_k) is None:
+             top_k = default_sampling_params.get(
+                 "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
+             )
+         if (min_p := self.min_p) is None:
+             min_p = default_sampling_params.get(
+                 "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
+             )
+
+         prompt_logprobs = self.prompt_logprobs
+         if prompt_logprobs is None and self.echo:
+             prompt_logprobs = self.top_logprobs
+
+         response_format = self.response_format
+         if response_format is not None:
+             # If structured outputs wasn't already enabled,
+             # we must enable it for these features to work
+             if self.structured_outputs is None:
+                 self.structured_outputs = StructuredOutputsParams()
+
+             # Set structured output params for response format
+             if response_format.type == "json_object":
+                 self.structured_outputs.json_object = True
+             elif response_format.type == "json_schema":
+                 json_schema = response_format.json_schema
+                 assert json_schema is not None
+                 self.structured_outputs.json = json_schema.json_schema
+             elif response_format.type == "structural_tag":
+                 structural_tag = response_format
+                 assert structural_tag is not None and isinstance(
+                     structural_tag,
+                     (
+                         LegacyStructuralTagResponseFormat,
+                         StructuralTagResponseFormat,
+                     ),
+                 )
+                 s_tag_obj = structural_tag.model_dump(by_alias=True)
+                 self.structured_outputs.structural_tag = json.dumps(s_tag_obj)
+
+         extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
+         if self.kv_transfer_params:
+             # Pass in kv_transfer_params via extra_args
+             extra_args["kv_transfer_params"] = self.kv_transfer_params
+         return SamplingParams.from_optional(
+             n=self.n,
+             presence_penalty=self.presence_penalty,
+             frequency_penalty=self.frequency_penalty,
+             repetition_penalty=repetition_penalty,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             min_p=min_p,
+             seed=self.seed,
+             stop=self.stop,
+             stop_token_ids=self.stop_token_ids,
+             logprobs=self.top_logprobs if self.logprobs else None,
+             prompt_logprobs=prompt_logprobs,
+             ignore_eos=self.ignore_eos,
+             max_tokens=max_tokens,
+             min_tokens=self.min_tokens,
+             skip_special_tokens=self.skip_special_tokens,
+             spaces_between_special_tokens=self.spaces_between_special_tokens,
+             logits_processors=get_logits_processors(
+                 self.logits_processors, logits_processor_pattern
+             ),
+             include_stop_str_in_output=self.include_stop_str_in_output,
+             truncate_prompt_tokens=self.truncate_prompt_tokens,
+             output_kind=RequestOutputKind.DELTA
+             if self.stream
+             else RequestOutputKind.FINAL_ONLY,
+             structured_outputs=self.structured_outputs,
+             logit_bias=self.logit_bias,
+             bad_words=self.bad_words,
+             allowed_token_ids=self.allowed_token_ids,
+             extra_args=extra_args or None,
+         )
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_stream_options(cls, data):
+         if data.get("stream_options") and not data.get("stream"):
+             raise ValueError("Stream options can only be defined when `stream=True`.")
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_logprobs(cls, data):
+         if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
+             if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
+                 raise ValueError(
+                     "`prompt_logprobs` are not available when `stream=True`."
+                 )
+
+             if prompt_logprobs < 0 and prompt_logprobs != -1:
+                 raise ValueError("`prompt_logprobs` must be non-negative or -1.")
+         if (top_logprobs := data.get("top_logprobs")) is not None:
+             if top_logprobs < 0 and top_logprobs != -1:
+                 raise ValueError("`top_logprobs` must be non-negative or -1.")
+
+             if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"):
+                 raise ValueError(
+                     "When using `top_logprobs`, `logprobs` must be set to true."
+                 )
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_structured_outputs_count(cls, data):
+         if isinstance(data, ValueError):
+             raise data
+
+         if data.get("structured_outputs", None) is None:
+             return data
+
+         structured_outputs_kwargs = data["structured_outputs"]
+         count = sum(
+             structured_outputs_kwargs.get(k) is not None
+             for k in ("json", "regex", "choice")
+         )
+         # you can only use one kind of constraints for structured outputs
+         if count > 1:
+             raise ValueError(
+                 "You can only use one kind of constraints for structured "
+                 "outputs ('json', 'regex' or 'choice')."
+             )
+         # you can only either use structured outputs or tools, not both
+         if count >= 1 and data.get("tool_choice", "none") not in (
+             "none",
+             "auto",
+             "required",
+         ):
+             raise ValueError(
+                 "You can only either use constraints for structured outputs "
+                 "or tools, not both."
+             )
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_tool_usage(cls, data):
+         # if "tool_choice" is not specified but tools are provided,
+         # default to "auto" tool_choice
+         if "tool_choice" not in data and data.get("tools"):
+             data["tool_choice"] = "auto"
+
+         # if "tool_choice" is "none" -- no validation is needed for tools
+         if "tool_choice" in data and data["tool_choice"] == "none":
+             return data
+
+         # if "tool_choice" is specified -- validation
+         if "tool_choice" in data and data["tool_choice"] is not None:
+             # ensure that if "tool choice" is specified, tools are present
+             if "tools" not in data or data["tools"] is None:
+                 raise ValueError("When using `tool_choice`, `tools` must be set.")
+
+             # make sure that tool choice is either a named tool
+             # OR that it's set to "auto" or "required"
+             if data["tool_choice"] not in ["auto", "required"] and not isinstance(
+                 data["tool_choice"], dict
+             ):
+                 raise ValueError(
+                     f"Invalid value for `tool_choice`: {data['tool_choice']}! "
+                     'Only named tools, "none", "auto" or "required" '
+                     "are supported."
+                 )
+
+             # if tool_choice is "required" but the "tools" list is empty,
+             # override the data to behave like "none" to align with
+             # OpenAI's behavior.
+             if (
+                 data["tool_choice"] == "required"
+                 and isinstance(data["tools"], list)
+                 and len(data["tools"]) == 0
+             ):
+                 data["tool_choice"] = "none"
+                 del data["tools"]
+                 return data
+
+             # ensure that if "tool_choice" is specified as an object,
+             # it matches a valid tool
+             correct_usage_message = (
+                 'Correct usage: `{"type": "function",'
+                 ' "function": {"name": "my_function"}}`'
+             )
+             if isinstance(data["tool_choice"], dict):
+                 valid_tool = False
+                 function = data["tool_choice"].get("function")
+                 if not isinstance(function, dict):
+                     raise ValueError(
+                         f"Invalid value for `function`: `{function}` in "
+                         f"`tool_choice`! {correct_usage_message}"
+                     )
+                 if "name" not in function:
+                     raise ValueError(
+                         f"Expected field `name` in `function` in "
+                         f"`tool_choice`! {correct_usage_message}"
+                     )
+                 function_name = function["name"]
+                 if not isinstance(function_name, str) or len(function_name) == 0:
+                     raise ValueError(
+                         f"Invalid `name` in `function`: `{function_name}`"
+                         f" in `tool_choice`! {correct_usage_message}"
+                     )
+                 for tool in data["tools"]:
+                     if tool["function"]["name"] == function_name:
+                         valid_tool = True
+                         break
+                 if not valid_tool:
+                     raise ValueError(
+                         "The tool specified in `tool_choice` does not match any"
+                         " of the specified `tools`"
+                     )
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_generation_prompt(cls, data):
+         if data.get("continue_final_message") and data.get("add_generation_prompt"):
+             raise ValueError(
+                 "Cannot set both `continue_final_message` and "
+                 "`add_generation_prompt` to True."
+             )
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_cache_salt_support(cls, data):
+         if data.get("cache_salt") is not None and (
+             not isinstance(data["cache_salt"], str) or not data["cache_salt"]
+         ):
+             raise ValueError(
+                 "Parameter 'cache_salt' must be a non-empty string if provided."
+             )
+         return data
+
+
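Note: two sketches of the `check_tool_usage` validator above; `get_weather` is a hypothetical tool name.

    # tool_choice omitted but tools present -> defaulted to "auto"
    ChatCompletionRequest(
        messages=[{"role": "user", "content": "hi"}],
        tools=[{"type": "function", "function": {"name": "get_weather"}}],
    )

    # named tool_choice without tools -> raises ValueError
    ChatCompletionRequest(
        messages=[{"role": "user", "content": "hi"}],
        tool_choice={"type": "function", "function": {"name": "get_weather"}},
    )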
+ class CompletionRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/completions/create
+     model: str | None = None
+     prompt: list[int] | list[list[int]] | str | list[str] | None = None
+     echo: bool | None = False
+     frequency_penalty: float | None = 0.0
+     logit_bias: dict[str, float] | None = None
+     logprobs: int | None = None
+     max_tokens: int | None = 16
+     n: int = 1
+     presence_penalty: float | None = 0.0
+     seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+     stop: str | list[str] | None = []
+     stream: bool | None = False
+     stream_options: StreamOptions | None = None
+     suffix: str | None = None
+     temperature: float | None = None
+     top_p: float | None = None
+     user: str | None = None
+
+     # --8<-- [start:completion-sampling-params]
+     use_beam_search: bool = False
+     top_k: int | None = None
+     min_p: float | None = None
+     repetition_penalty: float | None = None
+     length_penalty: float = 1.0
+     stop_token_ids: list[int] | None = []
+     include_stop_str_in_output: bool = False
+     ignore_eos: bool = False
+     min_tokens: int = 0
+     skip_special_tokens: bool = True
+     spaces_between_special_tokens: bool = True
+     truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
+     allowed_token_ids: list[int] | None = None
+     prompt_logprobs: int | None = None
+     # --8<-- [end:completion-sampling-params]
+
+     # --8<-- [start:completion-extra-params]
+     prompt_embeds: bytes | list[bytes] | None = None
+     add_special_tokens: bool = Field(
+         default=True,
+         description=(
+             "If true (the default), special tokens (e.g. BOS) will be added to "
+             "the prompt."
+         ),
+     )
+     response_format: AnyResponseFormat | None = Field(
+         default=None,
+         description=(
+             "Similar to chat completion, this parameter specifies the format "
+             "of output. Only {'type': 'json_object'}, {'type': 'json_schema'}"
+             ", {'type': 'structural_tag'}, or {'type': 'text'} is supported."
+         ),
+     )
+     structured_outputs: StructuredOutputsParams | None = Field(
+         default=None,
+         description="Additional kwargs for structured outputs",
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."
+         ),
+     )
+     request_id: str = Field(
+         default_factory=random_uuid,
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the response."
+         ),
+     )
+     logits_processors: LogitsProcessors | None = Field(
+         default=None,
+         description=(
+             "A list of either qualified names of logits processors, or "
+             "constructor objects, to apply when sampling. A constructor is "
+             "a JSON object with a required 'qualname' field specifying the "
+             "qualified name of the processor class/factory, and optional "
+             "'args' and 'kwargs' fields containing positional and keyword "
+             "arguments. For example: {'qualname': "
+             "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
+             "{'param': 'value'}}."
+         ),
+     )
+
+     return_tokens_as_token_ids: bool | None = Field(
+         default=None,
+         description=(
+             "If specified with 'logprobs', tokens are represented "
+             "as strings of the form 'token_id:{token_id}' so that tokens "
+             "that are not JSON-encodable can be identified."
+         ),
+     )
+     return_token_ids: bool | None = Field(
+         default=None,
+         description=(
+             "If specified, the result will include token IDs alongside the "
+             "generated text. In streaming mode, prompt_token_ids is included "
+             "only in the first chunk, and token_ids contains the delta tokens "
+             "for each chunk. This is useful for debugging or when you "
+             "need to map generated text back to input tokens."
+         ),
+     )
+
+     cache_salt: str | None = Field(
+         default=None,
+         description=(
+             "If specified, the prefix cache will be salted with the provided "
+             "string to prevent an attacker from guessing prompts in multi-user "
+             "environments. The salt should be random, protected from "
+             "access by 3rd parties, and long enough to be "
+             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+             "to 256 bit)."
+         ),
+     )
+
+     kv_transfer_params: dict[str, Any] | None = Field(
+         default=None,
+         description="KVTransfer parameters used for disaggregated serving.",
+     )
+
+     vllm_xargs: dict[str, str | int | float] | None = Field(
+         default=None,
+         description=(
+             "Additional request parameters with string or "
+             "numeric values, used by custom extensions."
+         ),
+     )
+
+     # --8<-- [end:completion-extra-params]
+
+     # Default sampling parameters for completion requests
+     _DEFAULT_SAMPLING_PARAMS: dict = {
+         "repetition_penalty": 1.0,
+         "temperature": 1.0,
+         "top_p": 1.0,
+         "top_k": 0,
+         "min_p": 0.0,
+     }
+
+     def to_beam_search_params(
+         self,
+         max_tokens: int,
+         default_sampling_params: dict | None = None,
+     ) -> BeamSearchParams:
+         if default_sampling_params is None:
+             default_sampling_params = {}
+         n = self.n if self.n is not None else 1
+
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get("temperature", 1.0)
+
+         return BeamSearchParams(
+             beam_width=n,
+             max_tokens=max_tokens,
+             ignore_eos=self.ignore_eos,
+             temperature=temperature,
+             length_penalty=self.length_penalty,
+             include_stop_str_in_output=self.include_stop_str_in_output,
+         )
+
+     def to_sampling_params(
+         self,
+         max_tokens: int,
+         logits_processor_pattern: str | None,
+         default_sampling_params: dict | None = None,
1180
+ ) -> SamplingParams:
1181
+ if default_sampling_params is None:
1182
+ default_sampling_params = {}
1183
+
1184
+ # Default parameters
1185
+ if (repetition_penalty := self.repetition_penalty) is None:
1186
+ repetition_penalty = default_sampling_params.get(
1187
+ "repetition_penalty",
1188
+ self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
1189
+ )
1190
+ if (temperature := self.temperature) is None:
1191
+ temperature = default_sampling_params.get(
1192
+ "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
1193
+ )
1194
+ if (top_p := self.top_p) is None:
1195
+ top_p = default_sampling_params.get(
1196
+ "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
1197
+ )
1198
+ if (top_k := self.top_k) is None:
1199
+ top_k = default_sampling_params.get(
1200
+ "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
1201
+ )
1202
+ if (min_p := self.min_p) is None:
1203
+ min_p = default_sampling_params.get(
1204
+ "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
1205
+ )
1206
+
1207
+ prompt_logprobs = self.prompt_logprobs
1208
+ if prompt_logprobs is None and self.echo:
1209
+ prompt_logprobs = self.logprobs
1210
+
1211
+ echo_without_generation = self.echo and self.max_tokens == 0
1212
+
1213
+ response_format = self.response_format
1214
+ if response_format is not None:
1215
+ # If structured outputs wasn't already enabled,
1216
+ # we must enable it for these features to work
1217
+ if self.structured_outputs is None:
1218
+ self.structured_outputs = StructuredOutputsParams()
1219
+
1220
+ # Set structured output params for response format
1221
+ if response_format.type == "json_object":
1222
+ self.structured_outputs.json_object = True
1223
+ elif response_format.type == "json_schema":
1224
+ json_schema = response_format.json_schema
1225
+ assert json_schema is not None
1226
+ self.structured_outputs.json = json_schema.json_schema
1227
+ elif response_format.type == "structural_tag":
1228
+ structural_tag = response_format
1229
+ assert structural_tag is not None and isinstance(
1230
+ structural_tag,
1231
+ (
1232
+ LegacyStructuralTagResponseFormat,
1233
+ StructuralTagResponseFormat,
1234
+ ),
1235
+ )
1236
+ s_tag_obj = structural_tag.model_dump(by_alias=True)
1237
+ self.structured_outputs.structural_tag = json.dumps(s_tag_obj)
1238
+
1239
+ extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
1240
+ if self.kv_transfer_params:
1241
+ # Pass in kv_transfer_params via extra_args
1242
+ extra_args["kv_transfer_params"] = self.kv_transfer_params
1243
+ return SamplingParams.from_optional(
1244
+ n=self.n,
1245
+ presence_penalty=self.presence_penalty,
1246
+ frequency_penalty=self.frequency_penalty,
1247
+ repetition_penalty=repetition_penalty,
1248
+ temperature=temperature,
1249
+ top_p=top_p,
1250
+ top_k=top_k,
1251
+ min_p=min_p,
1252
+ seed=self.seed,
1253
+ stop=self.stop,
1254
+ stop_token_ids=self.stop_token_ids,
1255
+ logprobs=self.logprobs,
1256
+ ignore_eos=self.ignore_eos,
1257
+ max_tokens=max_tokens if not echo_without_generation else 1,
1258
+ min_tokens=self.min_tokens,
1259
+ prompt_logprobs=prompt_logprobs,
1260
+ skip_special_tokens=self.skip_special_tokens,
1261
+ spaces_between_special_tokens=self.spaces_between_special_tokens,
1262
+ include_stop_str_in_output=self.include_stop_str_in_output,
1263
+ logits_processors=get_logits_processors(
1264
+ self.logits_processors, logits_processor_pattern
1265
+ ),
1266
+ truncate_prompt_tokens=self.truncate_prompt_tokens,
1267
+ output_kind=RequestOutputKind.DELTA
1268
+ if self.stream
1269
+ else RequestOutputKind.FINAL_ONLY,
1270
+ structured_outputs=self.structured_outputs,
1271
+ logit_bias=self.logit_bias,
1272
+ allowed_token_ids=self.allowed_token_ids,
1273
+ extra_args=extra_args or None,
1274
+ )
1275
+
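A hedged usage sketch of the conversion helper above. It assumes vLLM is installed and that this module is importable as `vllm.entrypoints.openai.protocol`; keyword names follow the definitions in this file, so treat it as illustrative rather than canonical:

```python
# Hedged usage sketch: request values beat server defaults, which beat the
# class-level _DEFAULT_SAMPLING_PARAMS fallbacks.
from vllm.entrypoints.openai.protocol import CompletionRequest

req = CompletionRequest(prompt="Hello", max_tokens=8)  # temperature/top_p unset

params = req.to_sampling_params(
    max_tokens=req.max_tokens,
    logits_processor_pattern=None,
    default_sampling_params={"temperature": 0.7},  # server-level default
)
print(params.temperature)  # 0.7: the server default, not the 1.0 fallback
```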
1276
+     @model_validator(mode="before")
+     @classmethod
+     def check_structured_outputs_count(cls, data):
+         if data.get("structured_outputs", None) is None:
+             return data
+
+         structured_outputs_kwargs = data["structured_outputs"]
+         count = sum(
+             structured_outputs_kwargs.get(k) is not None
+             for k in ("json", "regex", "choice")
+         )
+         if count > 1:
+             raise ValueError(
+                 "You can only use one kind of constraint for structured "
+                 "outputs ('json', 'regex' or 'choice')."
+             )
+         return data
+
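A standalone sketch of the "at most one constraint" rule checked above (illustrative only; the real `StructuredOutputsParams` model lives elsewhere in vLLM):

```python
# Count how many mutually exclusive structured-output constraints were set.
def count_constraints(structured_outputs: dict) -> int:
    return sum(
        structured_outputs.get(k) is not None for k in ("json", "regex", "choice")
    )

assert count_constraints({"json": {"type": "object"}}) == 1       # accepted
assert count_constraints({"json": {}, "regex": r"\d+"}) == 2      # would be rejected
```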
1294
+     @model_validator(mode="before")
+     @classmethod
+     def check_logprobs(cls, data):
+         if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
+             if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
+                 raise ValueError(
+                     "`prompt_logprobs` are not available when `stream=True`."
+                 )
+
+             if prompt_logprobs < 0 and prompt_logprobs != -1:
+                 raise ValueError("`prompt_logprobs` must be a non-negative value or -1.")
+         if (logprobs := data.get("logprobs")) is not None and logprobs < 0:
+             raise ValueError("`logprobs` must be a non-negative value.")
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_stream_options(cls, data):
+         if data.get("stream_options") and not data.get("stream"):
+             raise ValueError("Stream options can only be defined when `stream=True`.")
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_prompt_and_prompt_embeds(cls, data):
+         prompt = data.get("prompt")
+         prompt_embeds = data.get("prompt_embeds")
+
+         prompt_is_empty = prompt is None or (isinstance(prompt, str) and prompt == "")
+         embeds_is_empty = prompt_embeds is None or (
+             isinstance(prompt_embeds, list) and len(prompt_embeds) == 0
+         )
+
+         if prompt_is_empty and embeds_is_empty:
+             raise ValueError(
+                 "Either prompt or prompt_embeds must be provided and non-empty."
+             )
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_cache_salt_support(cls, data):
+         if data.get("cache_salt") is not None and (
+             not isinstance(data["cache_salt"], str) or not data["cache_salt"]
+         ):
+             raise ValueError(
+                 "Parameter 'cache_salt' must be a non-empty string if provided."
+             )
+         return data
+
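A standalone sketch mirroring the `check_logprobs` rules above (no vLLM imports; the logic follows the validator as written):

```python
# Illustrative re-statement of the prompt_logprobs rules.
def check_logprobs(data: dict) -> dict:
    if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
        if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
            raise ValueError("`prompt_logprobs` are not available when `stream=True`.")
        if prompt_logprobs < 0 and prompt_logprobs != -1:
            raise ValueError("`prompt_logprobs` must be a non-negative value or -1.")
    return data

check_logprobs({"prompt_logprobs": 5})   # ok: top-5 prompt logprobs
check_logprobs({"prompt_logprobs": -1})  # ok: -1 is the accepted special value
# check_logprobs({"stream": True, "prompt_logprobs": 5})  # raises
```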
1347
+
+ class CompletionLogProbs(OpenAIBaseModel):
+     text_offset: list[int] = Field(default_factory=list)
+     token_logprobs: list[float | None] = Field(default_factory=list)
+     tokens: list[str] = Field(default_factory=list)
+     top_logprobs: list[dict[str, float] | None] = Field(default_factory=list)
+
+
+ class CompletionResponseChoice(OpenAIBaseModel):
+     index: int
+     text: str
+     logprobs: CompletionLogProbs | None = None
+     finish_reason: str | None = None
+     stop_reason: int | str | None = Field(
+         default=None,
+         description=(
+             "The stop string or token id that caused the completion "
+             "to stop, None if the completion finished for some other reason "
+             "including encountering the EOS token"
+         ),
+     )
+     token_ids: list[int] | None = None  # For response
+     prompt_logprobs: list[dict[int, Logprob] | None] | None = None
+     prompt_token_ids: list[int] | None = None  # For prompt
+
+
+ class CompletionResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+     object: Literal["text_completion"] = "text_completion"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     choices: list[CompletionResponseChoice]
+     service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
+     system_fingerprint: str | None = None
+     usage: UsageInfo
+
+     # vLLM-specific fields that are not in the OpenAI spec
+     kv_transfer_params: dict[str, Any] | None = Field(
+         default=None, description="KVTransfer parameters."
+     )
+
+
+ class CompletionResponseStreamChoice(OpenAIBaseModel):
+     index: int
+     text: str
+     logprobs: CompletionLogProbs | None = None
+     finish_reason: str | None = None
+     stop_reason: int | str | None = Field(
+         default=None,
+         description=(
+             "The stop string or token id that caused the completion "
+             "to stop, None if the completion finished for some other reason "
+             "including encountering the EOS token"
+         ),
+     )
+     # not part of the OpenAI spec but for tracing the tokens;
+     # prompt tokens are put into the choice to align with CompletionResponseChoice
+     prompt_token_ids: list[int] | None = None
+     token_ids: list[int] | None = None
+
+
+ class CompletionStreamResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+     object: str = "text_completion"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     choices: list[CompletionResponseStreamChoice]
+     usage: UsageInfo | None = Field(default=None)
+
+
+ class FunctionCall(OpenAIBaseModel):
+     name: str
+     arguments: str
+
+
+ class ToolCall(OpenAIBaseModel):
+     id: str = Field(default_factory=make_tool_call_id)
+     type: Literal["function"] = "function"
+     function: FunctionCall
+
+
+ class DeltaFunctionCall(BaseModel):
+     name: str | None = None
+     arguments: str | None = None
+
+
+ # a tool call delta where everything is optional
+ class DeltaToolCall(OpenAIBaseModel):
+     id: str | None = None
+     type: Literal["function"] | None = None
+     index: int
+     function: DeltaFunctionCall | None = None
+
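A sketch of how a client would typically accumulate the streaming tool-call deltas modeled above (illustrative; the dict shapes mirror `DeltaToolCall`/`DeltaFunctionCall`, where only `arguments` fragments need concatenation):

```python
# Merge a sequence of tool-call delta dicts into one complete call.
def merge_tool_call_deltas(deltas: list[dict]) -> dict:
    merged: dict = {"id": None, "name": None, "arguments": ""}
    for d in deltas:
        if d.get("id"):
            merged["id"] = d["id"]
        fn = d.get("function") or {}
        if fn.get("name"):
            merged["name"] = fn["name"]
        merged["arguments"] += fn.get("arguments") or ""  # concatenate fragments
    return merged

chunks = [
    {"id": "call_1", "function": {"name": "get_weather", "arguments": '{"ci'}},
    {"function": {"arguments": 'ty": "Paris"}'}},
]
assert merge_tool_call_deltas(chunks)["arguments"] == '{"city": "Paris"}'
```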
1440
+
+ class ExtractedToolCallInformation(BaseModel):
+     # indicate if tools were called
+     tools_called: bool
+
+     # extracted tool calls
+     tool_calls: list[ToolCall]
+
+     # content - per the OpenAI spec, content AND tool calls are only rarely
+     # returned together, but some models will do this intentionally
+     content: str | None = None
+
+
+ class ChatMessage(OpenAIBaseModel):
+     role: str
+     content: str | None = None
+     refusal: str | None = None
+     annotations: OpenAIAnnotation | None = None
+     audio: OpenAIChatCompletionAudio | None = None
+     function_call: FunctionCall | None = None
+     tool_calls: list[ToolCall] = Field(default_factory=list)
+
+     # vLLM-specific fields that are not in the OpenAI spec
+     reasoning: str | None = None
+     reasoning_content: str | None = None
+     """Deprecated: use `reasoning` instead."""
+
+     @model_validator(mode="after")
+     def handle_deprecated_reasoning_content(self):
+         """Copy reasoning to reasoning_content for backward compatibility."""
+         self.reasoning_content = self.reasoning
+         return self
+
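A minimal sketch of the `mode="after"` mirroring pattern used by `ChatMessage` (and by `DeltaMessage` further down), standalone with illustrative field names:

```python
from pydantic import BaseModel, model_validator

class Message(BaseModel):
    reasoning: str | None = None
    reasoning_content: str | None = None  # deprecated mirror of `reasoning`

    @model_validator(mode="after")
    def handle_deprecated_reasoning_content(self):
        # "after" validators run on the constructed model, so they can
        # rewrite one field in terms of another, already-validated field.
        self.reasoning_content = self.reasoning
        return self

assert Message(reasoning="step 1").reasoning_content == "step 1"
```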
1473
+
+ class ChatCompletionLogProb(OpenAIBaseModel):
+     token: str
+     logprob: float = -9999.0
+     bytes: list[int] | None = None
+
+
+ class ChatCompletionLogProbsContent(ChatCompletionLogProb):
+     # Workaround: redefine the field_names cache so that it's not
+     # shared with the super class.
+     field_names: ClassVar[set[str] | None] = None
+     top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)
+
+
+ class ChatCompletionLogProbs(OpenAIBaseModel):
+     content: list[ChatCompletionLogProbsContent] | None = None
+
+
+ class ChatCompletionResponseChoice(OpenAIBaseModel):
+     index: int
+     message: ChatMessage
+     logprobs: ChatCompletionLogProbs | None = None
+     # per the OpenAI spec this is the default
+     finish_reason: str | None = "stop"
+     # not part of the OpenAI spec but included in vLLM for legacy reasons
+     stop_reason: int | str | None = None
+     # not part of the OpenAI spec but useful for tracing the tokens
+     # in agent scenarios
+     token_ids: list[int] | None = None
+
+
+ class ChatCompletionResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+     object: Literal["chat.completion"] = "chat.completion"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     choices: list[ChatCompletionResponseChoice]
+     service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
+     system_fingerprint: str | None = None
+     usage: UsageInfo
+
+     # vLLM-specific fields that are not in the OpenAI spec
+     prompt_logprobs: list[dict[int, Logprob] | None] | None = None
+     prompt_token_ids: list[int] | None = None
+     kv_transfer_params: dict[str, Any] | None = Field(
+         default=None, description="KVTransfer parameters."
+     )
+
+
+ class DeltaMessage(OpenAIBaseModel):
+     role: str | None = None
+     content: str | None = None
+     reasoning: str | None = None
+     reasoning_content: str | None = None
+     """Deprecated: use `reasoning` instead."""
+     tool_calls: list[DeltaToolCall] = Field(default_factory=list)
+
+     @model_validator(mode="after")
+     def handle_deprecated_reasoning_content(self):
+         """Copy reasoning to reasoning_content for backward compatibility."""
+         self.reasoning_content = self.reasoning
+         return self
+
+
+ class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
+     index: int
+     delta: DeltaMessage
+     logprobs: ChatCompletionLogProbs | None = None
+     finish_reason: str | None = None
+     stop_reason: int | str | None = None
+     # not part of the OpenAI spec but for tracing the tokens
+     token_ids: list[int] | None = None
+
+
+ class ChatCompletionStreamResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+     object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     choices: list[ChatCompletionResponseStreamChoice]
+     usage: UsageInfo | None = Field(default=None)
+     # not part of the OpenAI spec but for tracing the tokens
+     prompt_token_ids: list[int] | None = None
+
+
+ class TranscriptionResponseStreamChoice(OpenAIBaseModel):
+     delta: DeltaMessage
+     finish_reason: str | None = None
+     stop_reason: int | str | None = None
+
+
+ class TranscriptionStreamResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"trsc-{random_uuid()}")
+     object: Literal["transcription.chunk"] = "transcription.chunk"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     choices: list[TranscriptionResponseStreamChoice]
+     usage: UsageInfo | None = Field(default=None)
+
+
+ class InputTokensDetails(OpenAIBaseModel):
+     cached_tokens: int
+     input_tokens_per_turn: list[int] = Field(default_factory=list)
+     cached_tokens_per_turn: list[int] = Field(default_factory=list)
+
+
+ class OutputTokensDetails(OpenAIBaseModel):
+     reasoning_tokens: int = 0
+     tool_output_tokens: int = 0
+     output_tokens_per_turn: list[int] = Field(default_factory=list)
+     tool_output_tokens_per_turn: list[int] = Field(default_factory=list)
+
+
+ class ResponseUsage(OpenAIBaseModel):
+     input_tokens: int
+     input_tokens_details: InputTokensDetails
+     output_tokens: int
+     output_tokens_details: OutputTokensDetails
+     total_tokens: int
+
+
+ def serialize_message(msg):
+     """
+     Serializes a single message.
+     """
+     if isinstance(msg, dict):
+         return msg
+     elif hasattr(msg, "to_dict"):
+         return msg.to_dict()
+     else:
+         # fall back to a pydantic dump
+         return msg.model_dump_json()
+
+
+ def serialize_messages(msgs):
+     """
+     Serializes multiple messages.
+     """
+     return [serialize_message(msg) for msg in msgs] if msgs else None
+
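A quick illustration of `serialize_message`'s three branches. This is hedged: it assumes this module is importable as `vllm.entrypoints.openai.protocol`, and `PlainMessage` is a hypothetical stand-in for any pydantic model without a `to_dict()` hook:

```python
from pydantic import BaseModel
from vllm.entrypoints.openai.protocol import serialize_message

class PlainMessage(BaseModel):  # hypothetical stand-in model
    text: str

assert serialize_message({"role": "user"}) == {"role": "user"}  # dicts pass through
# Objects exposing to_dict() (e.g. harmony messages) use that hook; anything
# else falls back to pydantic's compact JSON dump:
assert serialize_message(PlainMessage(text="hi")) == '{"text":"hi"}'
```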
1613
+
+ class ResponseRawMessageAndToken(OpenAIBaseModel):
+     """Class to show the raw message.
+     If message / tokens diverge, tokens are the source of truth."""
+
+     message: str
+     tokens: list[int]
+     type: Literal["raw_message_tokens"] = "raw_message_tokens"
+
+
+ ResponseInputOutputMessage: TypeAlias = (
+     list[ChatCompletionMessageParam] | list[ResponseRawMessageAndToken]
+ )
+
+
+ class ResponsesResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
+     created_at: int = Field(default_factory=lambda: int(time.time()))
+     # error: Optional[ResponseError] = None
+     incomplete_details: IncompleteDetails | None = None
+     instructions: str | None = None
+     metadata: Metadata | None = None
+     model: str
+     object: Literal["response"] = "response"
+     output: list[ResponseOutputItem]
+     parallel_tool_calls: bool
+     temperature: float
+     tool_choice: ToolChoice
+     tools: list[Tool]
+     top_p: float
+     background: bool
+     max_output_tokens: int
+     max_tool_calls: int | None = None
+     previous_response_id: str | None = None
+     prompt: ResponsePrompt | None = None
+     reasoning: Reasoning | None = None
+     service_tier: Literal["auto", "default", "flex", "scale", "priority"]
+     status: ResponseStatus
+     text: ResponseTextConfig | None = None
+     top_logprobs: int | None = None
+     truncation: Literal["auto", "disabled"]
+     usage: ResponseUsage | None = None
+     user: str | None = None
+
+     # --8<-- [start:responses-extra-params]
+     # These are populated when enable_response_messages is set to True
+     # NOTE: custom serialization is needed;
+     # see serialize_input_messages and serialize_output_messages
+     input_messages: ResponseInputOutputMessage | None = None
+     output_messages: ResponseInputOutputMessage | None = None
+     # --8<-- [end:responses-extra-params]
+
+     # NOTE: OpenAI harmony doesn't serialize TextContent properly.
+     # TODO: this fixes TextContent, but tools etc. still need verification.
+     # https://github.com/openai/harmony/issues/78
+     @field_serializer("output_messages", when_used="json")
+     def serialize_output_messages(self, msgs, _info):
+         return serialize_messages(msgs)
+
+     # NOTE: OpenAI harmony doesn't serialize TextContent properly; this fixes it.
+     # https://github.com/openai/harmony/issues/78
+     @field_serializer("input_messages", when_used="json")
+     def serialize_input_messages(self, msgs, _info):
+         return serialize_messages(msgs)
+
+     @classmethod
+     def from_request(
+         cls,
+         request: ResponsesRequest,
+         sampling_params: SamplingParams,
+         model_name: str,
+         created_time: int,
+         output: list[ResponseOutputItem],
+         status: ResponseStatus,
+         usage: ResponseUsage | None = None,
+         input_messages: ResponseInputOutputMessage | None = None,
+         output_messages: ResponseInputOutputMessage | None = None,
+     ) -> "ResponsesResponse":
+         incomplete_details: IncompleteDetails | None = None
+         if status == "incomplete":
+             incomplete_details = IncompleteDetails(reason="max_output_tokens")
+         # TODO: implement the other reason for incomplete_details,
+         # which is content_filter
+         # incomplete_details = IncompleteDetails(reason='content_filter')
+         return cls(
+             id=request.request_id,
+             created_at=created_time,
+             incomplete_details=incomplete_details,
+             instructions=request.instructions,
+             metadata=request.metadata,
+             model=model_name,
+             output=output,
+             input_messages=input_messages,
+             output_messages=output_messages,
+             parallel_tool_calls=request.parallel_tool_calls,
+             temperature=sampling_params.temperature,
+             tool_choice=request.tool_choice,
+             tools=request.tools,
+             top_p=sampling_params.top_p,
+             background=request.background,
+             max_output_tokens=sampling_params.max_tokens,
+             max_tool_calls=request.max_tool_calls,
+             previous_response_id=request.previous_response_id,
+             prompt=request.prompt,
+             reasoning=request.reasoning,
+             service_tier=request.service_tier,
+             status=status,
+             text=request.text,
+             top_logprobs=sampling_params.logprobs,
+             truncation=request.truncation,
+             user=request.user,
+             usage=usage,
+         )
+
+
+ # TODO: this code can be removed once
+ # https://github.com/openai/openai-python/issues/2634 has been resolved
+ class ResponseReasoningPartDoneEvent(OpenAIBaseModel):
+     content_index: int
+     """The index of the content part that is done."""
+
+     item_id: str
+     """The ID of the output item that the content part was added to."""
+
+     output_index: int
+     """The index of the output item that the content part was added to."""
+
+     part: ResponseReasoningTextContent
+     """The content part that is done."""
+
+     sequence_number: int
+     """The sequence number of this event."""
+
+     type: Literal["response.reasoning_part.done"]
+     """The type of the event. Always `response.reasoning_part.done`."""
+
+
+ # TODO: this code can be removed once
+ # https://github.com/openai/openai-python/issues/2634 has been resolved
+ class ResponseReasoningPartAddedEvent(OpenAIBaseModel):
+     content_index: int
+     """The index of the content part that was added."""
+
+     item_id: str
+     """The ID of the output item that the content part was added to."""
+
+     output_index: int
+     """The index of the output item that the content part was added to."""
+
+     part: ResponseReasoningTextContent
+     """The content part that was added."""
+
+     sequence_number: int
+     """The sequence number of this event."""
+
+     type: Literal["response.reasoning_part.added"]
+     """The type of the event. Always `response.reasoning_part.added`."""
+
+
+ # vLLM Streaming Events
+ # Note: we override the response type with the vLLM ResponsesResponse type
+ class ResponseCompletedEvent(OpenAIResponseCompletedEvent):
+     response: ResponsesResponse  # type: ignore[override]
+
+
+ class ResponseCreatedEvent(OpenAIResponseCreatedEvent):
+     response: ResponsesResponse  # type: ignore[override]
+
+
+ class ResponseInProgressEvent(OpenAIResponseInProgressEvent):
+     response: ResponsesResponse  # type: ignore[override]
+
+
+ StreamingResponsesResponse: TypeAlias = (
+     ResponseCreatedEvent
+     | ResponseInProgressEvent
+     | ResponseCompletedEvent
+     | ResponseOutputItemAddedEvent
+     | ResponseOutputItemDoneEvent
+     | ResponseContentPartAddedEvent
+     | ResponseContentPartDoneEvent
+     | ResponseReasoningTextDeltaEvent
+     | ResponseReasoningTextDoneEvent
+     | ResponseReasoningPartAddedEvent
+     | ResponseReasoningPartDoneEvent
+     | ResponseCodeInterpreterCallInProgressEvent
+     | ResponseCodeInterpreterCallCodeDeltaEvent
+     | ResponseWebSearchCallInProgressEvent
+     | ResponseWebSearchCallSearchingEvent
+     | ResponseWebSearchCallCompletedEvent
+     | ResponseCodeInterpreterCallCodeDoneEvent
+     | ResponseCodeInterpreterCallInterpretingEvent
+     | ResponseCodeInterpreterCallCompletedEvent
+     | ResponseMcpCallArgumentsDeltaEvent
+     | ResponseMcpCallArgumentsDoneEvent
+     | ResponseMcpCallInProgressEvent
+     | ResponseMcpCallCompletedEvent
+ )
+
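Since every member of the `StreamingResponsesResponse` union carries a distinct `type` literal, consumers usually dispatch on that field. A hedged sketch (illustrative handler; the event type strings follow the Literal fields defined above and the OpenAI Responses event names):

```python
# Dispatch streaming Responses events on their `type` discriminator.
def handle_event(event) -> None:
    match event.type:
        case "response.created":
            print("stream opened")
        case "response.reasoning_part.added":
            print("reasoning part", event.content_index, "started")
        case "response.completed":
            print("final status:", event.response.status)
        case _:
            pass  # text deltas, tool-call events, etc.
```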
1812
+
+ class TokenizeCompletionRequest(OpenAIBaseModel):
+     model: str | None = None
+     prompt: str
+
+     add_special_tokens: bool = Field(
+         default=True,
+         description=(
+             "If true (the default), special tokens (e.g. BOS) will be added to "
+             "the prompt."
+         ),
+     )
+     return_token_strs: bool | None = Field(
+         default=False,
+         description=(
+             "If true, also return the token strings corresponding to the token ids."
+         ),
+     )
+
+
+ class TokenizeChatRequest(OpenAIBaseModel):
+     model: str | None = None
+     messages: list[ChatCompletionMessageParam]
+
+     add_generation_prompt: bool = Field(
+         default=True,
+         description=(
+             "If true, the generation prompt will be added to the chat template. "
+             "This is a parameter used by the chat template in the tokenizer "
+             "config of the model."
+         ),
+     )
+     return_token_strs: bool | None = Field(
+         default=False,
+         description=(
+             "If true, also return the token strings corresponding to the token ids."
+         ),
+     )
+     continue_final_message: bool = Field(
+         default=False,
+         description=(
+             "If this is set, the chat will be formatted so that the final "
+             "message in the chat is open-ended, without any EOS tokens. The "
+             "model will continue this message rather than starting a new one. "
+             'This allows you to "prefill" part of the model\'s response for it. '
+             "Cannot be used at the same time as `add_generation_prompt`."
+         ),
+     )
+     add_special_tokens: bool = Field(
+         default=False,
+         description=(
+             "If true, special tokens (e.g. BOS) will be added to the prompt "
+             "on top of what is added by the chat template. "
+             "For most models, the chat template takes care of adding the "
+             "special tokens, so this should be set to false (as is the "
+             "default)."
+         ),
+     )
+     chat_template: str | None = Field(
+         default=None,
+         description=(
+             "A Jinja template to use for this conversion. "
+             "As of transformers v4.44, a default chat template is no longer "
+             "allowed, so you must provide a chat template if the tokenizer "
+             "does not define one."
+         ),
+     )
+     chat_template_kwargs: dict[str, Any] | None = Field(
+         default=None,
+         description=(
+             "Additional keyword args to pass to the template renderer. "
+             "Will be accessible by the chat template."
+         ),
+     )
+     mm_processor_kwargs: dict[str, Any] | None = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     tools: list[ChatCompletionToolsParam] | None = Field(
+         default=None,
+         description=("A list of tools the model may call."),
+     )
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_generation_prompt(cls, data):
+         if data.get("continue_final_message") and data.get("add_generation_prompt"):
+             raise ValueError(
+                 "Cannot set both `continue_final_message` and "
+                 "`add_generation_prompt` to True."
+             )
+         return data
+
+
+ TokenizeRequest: TypeAlias = TokenizeCompletionRequest | TokenizeChatRequest
+
+
+ class TokenizeResponse(OpenAIBaseModel):
+     count: int
+     max_model_len: int
+     tokens: list[int]
+     token_strs: list[str] | None = None
+
+
+ class DetokenizeRequest(OpenAIBaseModel):
+     model: str | None = None
+     tokens: list[int]
+
+
+ class DetokenizeResponse(OpenAIBaseModel):
+     prompt: str
+
+
+ class TokenizerInfoResponse(OpenAIBaseModel):
+     """
+     Response containing the tokenizer configuration,
+     equivalent to tokenizer_config.json.
+     """
+
+     model_config = ConfigDict(extra="allow")
+     tokenizer_class: str
+
+
+ class LoadLoRAAdapterRequest(BaseModel):
+     lora_name: str
+     lora_path: str
+
+
+ class UnloadLoRAAdapterRequest(BaseModel):
+     lora_name: str
+     lora_int_id: int | None = Field(default=None)
+
+
+ ## Protocols for Audio
+ AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt"]
+
+
+ class TranscriptionRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/audio/createTranscription
+
+     file: UploadFile
+     """
+     The audio file object (not file name) to transcribe, in one of these
+     formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+     """
+
+     model: str | None = None
+     """ID of the model to use."""
+
+     language: str | None = None
+     """The language of the input audio.
+
+     Supplying the input language in
+     [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
+     will improve accuracy and latency.
+     """
+
+     prompt: str = Field(default="")
+     """An optional text to guide the model's style or continue a previous audio
+     segment.
+
+     The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+     should match the audio language.
+     """
+
+     response_format: AudioResponseFormat = Field(default="json")
+     """
+     The format of the output, in one of these options: `json`, `text`, `srt`,
+     `verbose_json`, or `vtt`.
+     """
+
+     ## TODO (varun): support auto-increasing the temperature when it is set
+     ## to 0, until certain thresholds are met
+
+     timestamp_granularities: list[Literal["word", "segment"]] = Field(
+         alias="timestamp_granularities[]", default=[]
+     )
+     """The timestamp granularities to populate for this transcription.
+
+     `response_format` must be set to `verbose_json` to use timestamp
+     granularities. Either or both of these options are supported: `word`, or
+     `segment`. Note: There is no additional latency for segment timestamps,
+     but generating word timestamps incurs additional latency.
+     """
+
+     stream: bool | None = False
+     """When set, it will enable output to be streamed in a similar fashion
+     to the Chat Completion endpoint.
+     """
+     # --8<-- [start:transcription-extra-params]
+     # Flattened stream options to simplify form data.
+     stream_include_usage: bool | None = False
+     stream_continuous_usage_stats: bool | None = False
+
+     vllm_xargs: dict[str, str | int | float] | None = Field(
+         default=None,
+         description=(
+             "Additional request parameters with string or "
+             "numeric values, used by custom extensions."
+         ),
+     )
+     # --8<-- [end:transcription-extra-params]
+
+     to_language: str | None = None
+     """The language of the output audio we transcribe to.
+
+     Please note that this is not currently used by supported models, but it
+     is a placeholder for future use, matching the translation API.
+     """
+
+     # --8<-- [start:transcription-sampling-params]
+     temperature: float = Field(default=0.0)
+     """The sampling temperature, between 0 and 1.
+
+     Higher values like 0.8 will make the output more random, while lower values
+     like 0.2 will make it more focused / deterministic. If set to 0, the model
+     will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
+     to automatically increase the temperature until certain thresholds are hit.
+     """
+
+     top_p: float | None = None
+     """Enables nucleus (top-p) sampling, where tokens are selected from the
+     smallest possible set whose cumulative probability exceeds `p`.
+     """
+
+     top_k: int | None = None
+     """Limits sampling to the `k` most probable tokens at each step."""
+
+     min_p: float | None = None
+     """Filters out tokens with a probability lower than `min_p`, ensuring a
+     minimum likelihood threshold during sampling.
+     """
+
+     seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+     """The seed to use for sampling."""
+
+     frequency_penalty: float | None = 0.0
+     """The frequency penalty to use for sampling."""
+
+     repetition_penalty: float | None = None
+     """The repetition penalty to use for sampling."""
+
+     presence_penalty: float | None = 0.0
+     """The presence penalty to use for sampling."""
+     # --8<-- [end:transcription-sampling-params]
+
+     # Default sampling parameters for transcription requests.
+     _DEFAULT_SAMPLING_PARAMS: dict = {
+         "repetition_penalty": 1.0,
+         "temperature": 1.0,
+         "top_p": 1.0,
+         "top_k": 0,
+         "min_p": 0.0,
+     }
+
+     def to_sampling_params(
+         self, default_max_tokens: int, default_sampling_params: dict | None = None
+     ) -> SamplingParams:
+         max_tokens = default_max_tokens
+
+         if default_sampling_params is None:
+             default_sampling_params = {}
+
+         # Default parameters
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+             )
+         if (top_p := self.top_p) is None:
+             top_p = default_sampling_params.get(
+                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
+             )
+         if (top_k := self.top_k) is None:
+             top_k = default_sampling_params.get(
+                 "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
+             )
+         if (min_p := self.min_p) is None:
+             min_p = default_sampling_params.get(
+                 "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
+             )
+
+         if (repetition_penalty := self.repetition_penalty) is None:
+             repetition_penalty = default_sampling_params.get(
+                 "repetition_penalty",
+                 self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+             )
+
+         return SamplingParams.from_optional(
+             temperature=temperature,
+             max_tokens=max_tokens,
+             seed=self.seed,
+             top_p=top_p,
+             top_k=top_k,
+             min_p=min_p,
+             frequency_penalty=self.frequency_penalty,
+             repetition_penalty=repetition_penalty,
+             presence_penalty=self.presence_penalty,
+             output_kind=RequestOutputKind.DELTA
+             if self.stream
+             else RequestOutputKind.FINAL_ONLY,
+             extra_args=self.vllm_xargs,
+         )
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_transcription_request(cls, data):
+         if isinstance(data.get("file"), str):
+             raise HTTPException(
+                 status_code=HTTPStatus.UNPROCESSABLE_ENTITY,
+                 detail="Expected 'file' to be a file-like object, not 'str'.",
+             )
+
+         stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
+         stream = data.get("stream", False)
+         if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
+             raise ValueError("Stream options can only be defined when `stream=True`.")
+
+         return data
+
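A standalone sketch of the form-field alias used by `timestamp_granularities` above: clients send the bracketed form key, and pydantic populates the Python-named field via the alias (model and data are illustrative):

```python
from typing import Literal
from pydantic import BaseModel, Field

class GranularityForm(BaseModel):
    timestamp_granularities: list[Literal["word", "segment"]] = Field(
        alias="timestamp_granularities[]", default=[]
    )

# Validation accepts the alias key, as submitted in multipart form data.
form = GranularityForm.model_validate({"timestamp_granularities[]": ["word"]})
assert form.timestamp_granularities == ["word"]
```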
2132
+
+ # Transcription response objects
+ class TranscriptionUsageAudio(OpenAIBaseModel):
+     type: Literal["duration"] = "duration"
+     seconds: int
+
+
+ class TranscriptionResponse(OpenAIBaseModel):
+     text: str
+     """The transcribed text."""
+     usage: TranscriptionUsageAudio
+
+
+ class TranscriptionWord(OpenAIBaseModel):
+     end: float
+     """End time of the word in seconds."""
+
+     start: float
+     """Start time of the word in seconds."""
+
+     word: str
+     """The text content of the word."""
+
+
+ class TranscriptionSegment(OpenAIBaseModel):
+     id: int
+     """Unique identifier of the segment."""
+
+     avg_logprob: float | None = None
+     """Average logprob of the segment.
+
+     If the value is lower than -1, consider the logprobs failed.
+     """
+
+     compression_ratio: float | None = None
+     """Compression ratio of the segment.
+
+     If the value is greater than 2.4, consider the compression failed.
+     """
+
+     end: float
+     """End time of the segment in seconds."""
+
+     no_speech_prob: float | None = None
+     """Probability of no speech in the segment.
+
+     If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
+     this segment silent.
+     """
+
+     seek: int
+     """Seek offset of the segment."""
+
+     start: float
+     """Start time of the segment in seconds."""
+
+     temperature: float
+     """Temperature parameter used for generating the segment."""
+
+     text: str
+     """Text content of the segment."""
+
+     tokens: list[int]
+     """Array of token IDs for the text content."""
+
+
+ class TranscriptionResponseVerbose(OpenAIBaseModel):
+     duration: str
+     """The duration of the input audio."""
+
+     language: str
+     """The language of the input audio."""
+
+     text: str
+     """The transcribed text."""
+
+     segments: list[TranscriptionSegment] | None = None
+     """Segments of the transcribed text and their corresponding details."""
+
+     words: list[TranscriptionWord] | None = None
+     """Extracted words and their corresponding timestamps."""
+
+
+ TranscriptionResponseVariant: TypeAlias = (
+     TranscriptionResponse | TranscriptionResponseVerbose
+ )
+
+
+ class TranslationResponseStreamChoice(OpenAIBaseModel):
+     delta: DeltaMessage
+     finish_reason: str | None = None
+     stop_reason: int | str | None = None
+
+
+ class TranslationStreamResponse(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"trsl-{random_uuid()}")
+     object: Literal["translation.chunk"] = "translation.chunk"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     model: str
+     choices: list[TranslationResponseStreamChoice]
+     usage: UsageInfo | None = Field(default=None)
+
+
+ class TranslationRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/audio/createTranslation
+
+     file: UploadFile
+     """
+     The audio file object (not file name) to translate, in one of these
+     formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+     """
+
+     model: str | None = None
+     """ID of the model to use."""
+
+     prompt: str = Field(default="")
+     """An optional text to guide the model's style or continue a previous audio
+     segment.
+
+     The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+     should match the audio language.
+     """
+
+     response_format: AudioResponseFormat = Field(default="json")
+     """
+     The format of the output, in one of these options: `json`, `text`, `srt`,
+     `verbose_json`, or `vtt`.
+     """
+
+     # TODO: support additional sampling parameters
+     # --8<-- [start:translation-sampling-params]
+     seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+     """The seed to use for sampling."""
+
+     temperature: float = Field(default=0.0)
+     """The sampling temperature, between 0 and 1.
+
+     Higher values like 0.8 will make the output more random, while lower values
+     like 0.2 will make it more focused / deterministic. If set to 0, the model
+     will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
+     to automatically increase the temperature until certain thresholds are hit.
+     """
+     # --8<-- [end:translation-sampling-params]
+
+     # --8<-- [start:translation-extra-params]
+     language: str | None = None
+     """The language of the input audio we translate from.
+
+     Supplying the input language in
+     [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
+     will improve accuracy.
+     """
+
+     to_language: str | None = None
+     """The language of the input audio we translate to.
+
+     Please note that this is not supported by all models; refer to the specific
+     model documentation for more details.
+     For instance, Whisper only supports `to_language=en`.
+     """
+
+     stream: bool | None = False
+     """Custom field not present in the original OpenAI definition. When set,
+     it will enable output to be streamed in a similar fashion to the Chat
+     Completion endpoint.
+     """
+     # Flattened stream options to simplify form data.
+     stream_include_usage: bool | None = False
+     stream_continuous_usage_stats: bool | None = False
+     # --8<-- [end:translation-extra-params]
+
+     # Default sampling parameters for translation requests.
+     _DEFAULT_SAMPLING_PARAMS: dict = {
+         "temperature": 0,
+     }
+
+     def to_sampling_params(
+         self, default_max_tokens: int, default_sampling_params: dict | None = None
+     ) -> SamplingParams:
+         max_tokens = default_max_tokens
+
+         if default_sampling_params is None:
+             default_sampling_params = {}
+         # Default parameters
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+             )
+
+         return SamplingParams.from_optional(
+             temperature=temperature,
+             max_tokens=max_tokens,
+             seed=self.seed,
+             output_kind=RequestOutputKind.DELTA
+             if self.stream
+             else RequestOutputKind.FINAL_ONLY,
+         )
+
+     @model_validator(mode="before")
+     @classmethod
+     def validate_stream_options(cls, data):
+         stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
+         stream = data.get("stream", False)
+         if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
+             raise ValueError("Stream options can only be defined when `stream=True`.")
+
+         return data
+
+
+ # Translation response objects
+ class TranslationResponse(OpenAIBaseModel):
+     text: str
+     """The translated text."""
+
+
+ class TranslationWord(OpenAIBaseModel):
+     end: float
+     """End time of the word in seconds."""
+
+     start: float
+     """Start time of the word in seconds."""
+
+     word: str
+     """The text content of the word."""
+
+
+ class TranslationSegment(OpenAIBaseModel):
+     id: int
+     """Unique identifier of the segment."""
+
+     avg_logprob: float | None = None
+     """Average logprob of the segment.
+
+     If the value is lower than -1, consider the logprobs failed.
+     """
+
+     compression_ratio: float | None = None
+     """Compression ratio of the segment.
+
+     If the value is greater than 2.4, consider the compression failed.
+     """
+
+     end: float
+     """End time of the segment in seconds."""
+
+     no_speech_prob: float | None = None
+     """Probability of no speech in the segment.
+
+     If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
+     this segment silent.
+     """
+
+     seek: int
+     """Seek offset of the segment."""
+
+     start: float
+     """Start time of the segment in seconds."""
+
+     temperature: float
+     """Temperature parameter used for generating the segment."""
+
+     text: str
+     """Text content of the segment."""
+
+     tokens: list[int]
+     """Array of token IDs for the text content."""
+
+
+ class TranslationResponseVerbose(OpenAIBaseModel):
+     duration: str
+     """The duration of the input audio."""
+
+     language: str
+     """The language of the input audio."""
+
+     text: str
+     """The translated text."""
+
+     segments: list[TranslationSegment] | None = None
+     """Segments of the translated text and their corresponding details."""
+
+     words: list[TranslationWord] | None = None
+     """Extracted words and their corresponding timestamps."""
+
+
+ TranslationResponseVariant: TypeAlias = TranslationResponse | TranslationResponseVerbose
+
+
+ ####### Tokens IN <> Tokens OUT #######
+ class GenerateRequest(BaseModel):
+     request_id: str = Field(
+         default_factory=random_uuid,
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the response."
+         ),
+     )
+     token_ids: list[int]
+     """The token ids to generate text from."""
+
+     # features: MultiModalFeatureSpec
+     # TODO (NickLucche): implement once the Renderer work is completed
+     features: str | None = None
+     """The processed MM inputs for the model."""
+
+     sampling_params: SamplingParams
+     """The sampling parameters for the model."""
+
+     model: str | None = None
+
+     stream: bool | None = False
+     stream_options: StreamOptions | None = None
+     cache_salt: str | None = Field(
+         default=None,
+         description=(
+             "If specified, the prefix cache will be salted with the provided "
+             "string to prevent an attacker from guessing prompts in multi-user "
+             "environments. The salt should be random, protected from "
+             "access by 3rd parties, and long enough to be "
+             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+             "to 256 bits)."
+         ),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."
+         ),
+     )
+     kv_transfer_params: dict[str, Any] | None = Field(
+         default=None,
+         description="KVTransfer parameters used for disaggregated serving.",
+     )
+
+
+ class GenerateResponseChoice(BaseModel):
+     index: int
+     logprobs: ChatCompletionLogProbs | None = None
+     # per the OpenAI spec this is the default
+     finish_reason: str | None = "stop"
+     token_ids: list[int] | None = None
+
+
+ class GenerateResponse(BaseModel):
+     request_id: str = Field(
+         default_factory=random_uuid,
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the response."
+         ),
+     )
+     choices: list[GenerateResponseChoice]
+
+     prompt_logprobs: list[dict[int, Logprob] | None] | None = None
+
+     kv_transfer_params: dict[str, Any] | None = Field(
+         default=None,
+         description="KVTransfer parameters used for disaggregated serving.",
+     )
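
A hedged usage sketch for the tokens-in/tokens-out protocol above. It assumes vLLM is installed, that `SamplingParams` is importable from the top-level `vllm` package, and that this module's path is `vllm.entrypoints.openai.protocol`; the example token ids are placeholders:

```python
# Build a GenerateRequest from a pre-tokenized prompt (field names follow
# the GenerateRequest definition above).
from vllm import SamplingParams
from vllm.entrypoints.openai.protocol import GenerateRequest

req = GenerateRequest(
    token_ids=[1, 3923, 374, 279],               # placeholder prompt token ids
    sampling_params=SamplingParams(max_tokens=16),
)
print(req.request_id)  # auto-generated uuid when the caller does not set one
```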