vllm-cpu-avx512bf16-0.14.0-cp313-cp313-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1712)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +225 -0
  3. vllm/_aiter_ops.py +1511 -0
  4. vllm/_bc_linter.py +54 -0
  5. vllm/_custom_ops.py +3206 -0
  6. vllm/_ipex_ops.py +445 -0
  7. vllm/_version.py +34 -0
  8. vllm/assets/__init__.py +0 -0
  9. vllm/assets/audio.py +43 -0
  10. vllm/assets/base.py +40 -0
  11. vllm/assets/image.py +62 -0
  12. vllm/assets/video.py +149 -0
  13. vllm/attention/__init__.py +0 -0
  14. vllm/attention/layer.py +913 -0
  15. vllm/attention/utils/__init__.py +0 -0
  16. vllm/attention/utils/kv_sharing_utils.py +33 -0
  17. vllm/attention/utils/kv_transfer_utils.py +60 -0
  18. vllm/beam_search.py +88 -0
  19. vllm/benchmarks/__init__.py +0 -0
  20. vllm/benchmarks/datasets.py +3277 -0
  21. vllm/benchmarks/latency.py +172 -0
  22. vllm/benchmarks/lib/__init__.py +3 -0
  23. vllm/benchmarks/lib/endpoint_request_func.py +777 -0
  24. vllm/benchmarks/lib/ready_checker.py +72 -0
  25. vllm/benchmarks/lib/utils.py +79 -0
  26. vllm/benchmarks/mm_processor.py +363 -0
  27. vllm/benchmarks/serve.py +1761 -0
  28. vllm/benchmarks/startup.py +321 -0
  29. vllm/benchmarks/sweep/__init__.py +0 -0
  30. vllm/benchmarks/sweep/cli.py +41 -0
  31. vllm/benchmarks/sweep/param_sweep.py +159 -0
  32. vllm/benchmarks/sweep/plot.py +675 -0
  33. vllm/benchmarks/sweep/plot_pareto.py +393 -0
  34. vllm/benchmarks/sweep/serve.py +450 -0
  35. vllm/benchmarks/sweep/serve_sla.py +459 -0
  36. vllm/benchmarks/sweep/server.py +114 -0
  37. vllm/benchmarks/sweep/sla_sweep.py +138 -0
  38. vllm/benchmarks/sweep/utils.py +4 -0
  39. vllm/benchmarks/throughput.py +946 -0
  40. vllm/collect_env.py +857 -0
  41. vllm/compilation/__init__.py +0 -0
  42. vllm/compilation/activation_quant_fusion.py +214 -0
  43. vllm/compilation/backends.py +840 -0
  44. vllm/compilation/base_static_graph.py +57 -0
  45. vllm/compilation/caching.py +196 -0
  46. vllm/compilation/collective_fusion.py +1224 -0
  47. vllm/compilation/compiler_interface.py +639 -0
  48. vllm/compilation/counter.py +50 -0
  49. vllm/compilation/cuda_graph.py +309 -0
  50. vllm/compilation/decorators.py +662 -0
  51. vllm/compilation/fix_functionalization.py +266 -0
  52. vllm/compilation/fusion.py +570 -0
  53. vllm/compilation/fusion_attn.py +363 -0
  54. vllm/compilation/fx_utils.py +92 -0
  55. vllm/compilation/inductor_pass.py +145 -0
  56. vllm/compilation/matcher_utils.py +454 -0
  57. vllm/compilation/monitor.py +62 -0
  58. vllm/compilation/noop_elimination.py +130 -0
  59. vllm/compilation/partition_rules.py +75 -0
  60. vllm/compilation/pass_manager.py +164 -0
  61. vllm/compilation/piecewise_backend.py +191 -0
  62. vllm/compilation/post_cleanup.py +21 -0
  63. vllm/compilation/qk_norm_rope_fusion.py +244 -0
  64. vllm/compilation/rocm_aiter_fusion.py +401 -0
  65. vllm/compilation/sequence_parallelism.py +368 -0
  66. vllm/compilation/torch25_custom_graph_pass.py +44 -0
  67. vllm/compilation/vllm_inductor_pass.py +180 -0
  68. vllm/compilation/wrapper.py +329 -0
  69. vllm/config/__init__.py +112 -0
  70. vllm/config/attention.py +114 -0
  71. vllm/config/cache.py +233 -0
  72. vllm/config/compilation.py +1149 -0
  73. vllm/config/device.py +75 -0
  74. vllm/config/ec_transfer.py +110 -0
  75. vllm/config/kv_events.py +56 -0
  76. vllm/config/kv_transfer.py +119 -0
  77. vllm/config/load.py +124 -0
  78. vllm/config/lora.py +102 -0
  79. vllm/config/model.py +2026 -0
  80. vllm/config/model_arch.py +57 -0
  81. vllm/config/multimodal.py +247 -0
  82. vllm/config/observability.py +157 -0
  83. vllm/config/parallel.py +703 -0
  84. vllm/config/pooler.py +188 -0
  85. vllm/config/profiler.py +199 -0
  86. vllm/config/scheduler.py +298 -0
  87. vllm/config/speculative.py +656 -0
  88. vllm/config/speech_to_text.py +39 -0
  89. vllm/config/structured_outputs.py +78 -0
  90. vllm/config/utils.py +374 -0
  91. vllm/config/vllm.py +1487 -0
  92. vllm/connections.py +189 -0
  93. vllm/device_allocator/__init__.py +0 -0
  94. vllm/device_allocator/cumem.py +301 -0
  95. vllm/distributed/__init__.py +6 -0
  96. vllm/distributed/communication_op.py +43 -0
  97. vllm/distributed/device_communicators/__init__.py +0 -0
  98. vllm/distributed/device_communicators/all2all.py +509 -0
  99. vllm/distributed/device_communicators/all_reduce_utils.py +344 -0
  100. vllm/distributed/device_communicators/base_device_communicator.py +303 -0
  101. vllm/distributed/device_communicators/cpu_communicator.py +209 -0
  102. vllm/distributed/device_communicators/cuda_communicator.py +346 -0
  103. vllm/distributed/device_communicators/cuda_wrapper.py +190 -0
  104. vllm/distributed/device_communicators/custom_all_reduce.py +326 -0
  105. vllm/distributed/device_communicators/mnnvl_compat.py +27 -0
  106. vllm/distributed/device_communicators/pynccl.py +386 -0
  107. vllm/distributed/device_communicators/pynccl_allocator.py +191 -0
  108. vllm/distributed/device_communicators/pynccl_wrapper.py +567 -0
  109. vllm/distributed/device_communicators/quick_all_reduce.py +290 -0
  110. vllm/distributed/device_communicators/ray_communicator.py +259 -0
  111. vllm/distributed/device_communicators/shm_broadcast.py +778 -0
  112. vllm/distributed/device_communicators/shm_object_storage.py +697 -0
  113. vllm/distributed/device_communicators/symm_mem.py +156 -0
  114. vllm/distributed/device_communicators/xpu_communicator.py +98 -0
  115. vllm/distributed/ec_transfer/__init__.py +14 -0
  116. vllm/distributed/ec_transfer/ec_connector/__init__.py +0 -0
  117. vllm/distributed/ec_transfer/ec_connector/base.py +247 -0
  118. vllm/distributed/ec_transfer/ec_connector/example_connector.py +201 -0
  119. vllm/distributed/ec_transfer/ec_connector/factory.py +85 -0
  120. vllm/distributed/ec_transfer/ec_transfer_state.py +42 -0
  121. vllm/distributed/eplb/__init__.py +3 -0
  122. vllm/distributed/eplb/async_worker.py +115 -0
  123. vllm/distributed/eplb/eplb_state.py +1192 -0
  124. vllm/distributed/eplb/policy/__init__.py +19 -0
  125. vllm/distributed/eplb/policy/abstract.py +43 -0
  126. vllm/distributed/eplb/policy/default.py +376 -0
  127. vllm/distributed/eplb/rebalance_execute.py +699 -0
  128. vllm/distributed/kv_events.py +505 -0
  129. vllm/distributed/kv_transfer/README.md +29 -0
  130. vllm/distributed/kv_transfer/__init__.py +20 -0
  131. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  132. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  133. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  134. vllm/distributed/kv_transfer/kv_connector/factory.py +203 -0
  135. vllm/distributed/kv_transfer/kv_connector/utils.py +459 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +19 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/base.py +607 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py +419 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py +450 -0
  140. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +344 -0
  141. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py +18 -0
  142. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +395 -0
  143. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py +211 -0
  144. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +1431 -0
  145. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +941 -0
  146. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +186 -0
  147. vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py +916 -0
  148. vllm/distributed/kv_transfer/kv_connector/v1/moriio/__init__.py +0 -0
  149. vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py +321 -0
  150. vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py +1515 -0
  151. vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py +609 -0
  152. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +477 -0
  153. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +2688 -0
  154. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +557 -0
  155. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  156. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +531 -0
  157. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +632 -0
  158. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +273 -0
  159. vllm/distributed/kv_transfer/kv_transfer_state.py +78 -0
  160. vllm/distributed/parallel_state.py +1809 -0
  161. vllm/distributed/utils.py +545 -0
  162. vllm/engine/__init__.py +0 -0
  163. vllm/engine/arg_utils.py +2137 -0
  164. vllm/engine/async_llm_engine.py +6 -0
  165. vllm/engine/llm_engine.py +6 -0
  166. vllm/engine/protocol.py +194 -0
  167. vllm/entrypoints/__init__.py +0 -0
  168. vllm/entrypoints/anthropic/__init__.py +0 -0
  169. vllm/entrypoints/anthropic/protocol.py +162 -0
  170. vllm/entrypoints/anthropic/serving_messages.py +468 -0
  171. vllm/entrypoints/api_server.py +186 -0
  172. vllm/entrypoints/chat_utils.py +1912 -0
  173. vllm/entrypoints/cli/__init__.py +19 -0
  174. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  175. vllm/entrypoints/cli/benchmark/base.py +25 -0
  176. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  177. vllm/entrypoints/cli/benchmark/main.py +57 -0
  178. vllm/entrypoints/cli/benchmark/mm_processor.py +21 -0
  179. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  180. vllm/entrypoints/cli/benchmark/startup.py +21 -0
  181. vllm/entrypoints/cli/benchmark/sweep.py +21 -0
  182. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  183. vllm/entrypoints/cli/collect_env.py +38 -0
  184. vllm/entrypoints/cli/main.py +79 -0
  185. vllm/entrypoints/cli/openai.py +260 -0
  186. vllm/entrypoints/cli/run_batch.py +68 -0
  187. vllm/entrypoints/cli/serve.py +253 -0
  188. vllm/entrypoints/cli/types.py +29 -0
  189. vllm/entrypoints/constants.py +12 -0
  190. vllm/entrypoints/context.py +898 -0
  191. vllm/entrypoints/grpc_server.py +531 -0
  192. vllm/entrypoints/launcher.py +175 -0
  193. vllm/entrypoints/llm.py +1807 -0
  194. vllm/entrypoints/logger.py +86 -0
  195. vllm/entrypoints/openai/__init__.py +0 -0
  196. vllm/entrypoints/openai/api_server.py +1390 -0
  197. vllm/entrypoints/openai/cli_args.py +320 -0
  198. vllm/entrypoints/openai/orca_metrics.py +120 -0
  199. vllm/entrypoints/openai/parser/__init__.py +0 -0
  200. vllm/entrypoints/openai/parser/harmony_utils.py +820 -0
  201. vllm/entrypoints/openai/parser/responses_parser.py +176 -0
  202. vllm/entrypoints/openai/protocol.py +2566 -0
  203. vllm/entrypoints/openai/run_batch.py +635 -0
  204. vllm/entrypoints/openai/serving_chat.py +1897 -0
  205. vllm/entrypoints/openai/serving_chat_stream_harmony.py +101 -0
  206. vllm/entrypoints/openai/serving_completion.py +740 -0
  207. vllm/entrypoints/openai/serving_engine.py +1612 -0
  208. vllm/entrypoints/openai/serving_models.py +309 -0
  209. vllm/entrypoints/openai/serving_responses.py +2552 -0
  210. vllm/entrypoints/openai/serving_transcription.py +168 -0
  211. vllm/entrypoints/openai/speech_to_text.py +711 -0
  212. vllm/entrypoints/openai/utils.py +49 -0
  213. vllm/entrypoints/pooling/__init__.py +16 -0
  214. vllm/entrypoints/pooling/classify/__init__.py +0 -0
  215. vllm/entrypoints/pooling/classify/api_router.py +48 -0
  216. vllm/entrypoints/pooling/classify/protocol.py +181 -0
  217. vllm/entrypoints/pooling/classify/serving.py +233 -0
  218. vllm/entrypoints/pooling/embed/__init__.py +0 -0
  219. vllm/entrypoints/pooling/embed/api_router.py +65 -0
  220. vllm/entrypoints/pooling/embed/conftest.py +28 -0
  221. vllm/entrypoints/pooling/embed/protocol.py +217 -0
  222. vllm/entrypoints/pooling/embed/serving.py +684 -0
  223. vllm/entrypoints/pooling/pooling/__init__.py +0 -0
  224. vllm/entrypoints/pooling/pooling/api_router.py +62 -0
  225. vllm/entrypoints/pooling/pooling/protocol.py +146 -0
  226. vllm/entrypoints/pooling/pooling/serving.py +354 -0
  227. vllm/entrypoints/pooling/score/__init__.py +0 -0
  228. vllm/entrypoints/pooling/score/api_router.py +147 -0
  229. vllm/entrypoints/pooling/score/protocol.py +146 -0
  230. vllm/entrypoints/pooling/score/serving.py +511 -0
  231. vllm/entrypoints/renderer.py +411 -0
  232. vllm/entrypoints/responses_utils.py +218 -0
  233. vllm/entrypoints/sagemaker/__init__.py +4 -0
  234. vllm/entrypoints/sagemaker/routes.py +118 -0
  235. vllm/entrypoints/score_utils.py +271 -0
  236. vllm/entrypoints/serve/__init__.py +94 -0
  237. vllm/entrypoints/serve/cache/__init__.py +0 -0
  238. vllm/entrypoints/serve/cache/api_router.py +61 -0
  239. vllm/entrypoints/serve/disagg/__init__.py +0 -0
  240. vllm/entrypoints/serve/disagg/api_router.py +109 -0
  241. vllm/entrypoints/serve/disagg/protocol.py +90 -0
  242. vllm/entrypoints/serve/disagg/serving.py +285 -0
  243. vllm/entrypoints/serve/elastic_ep/__init__.py +0 -0
  244. vllm/entrypoints/serve/elastic_ep/api_router.py +96 -0
  245. vllm/entrypoints/serve/elastic_ep/middleware.py +49 -0
  246. vllm/entrypoints/serve/instrumentator/__init__.py +0 -0
  247. vllm/entrypoints/serve/instrumentator/health.py +33 -0
  248. vllm/entrypoints/serve/instrumentator/metrics.py +45 -0
  249. vllm/entrypoints/serve/instrumentator/offline_docs.py +50 -0
  250. vllm/entrypoints/serve/instrumentator/server_info.py +56 -0
  251. vllm/entrypoints/serve/instrumentator/static/swagger-ui-bundle.js +2 -0
  252. vllm/entrypoints/serve/instrumentator/static/swagger-ui.css +3 -0
  253. vllm/entrypoints/serve/lora/__init__.py +0 -0
  254. vllm/entrypoints/serve/lora/api_router.py +70 -0
  255. vllm/entrypoints/serve/profile/__init__.py +0 -0
  256. vllm/entrypoints/serve/profile/api_router.py +46 -0
  257. vllm/entrypoints/serve/rlhf/__init__.py +0 -0
  258. vllm/entrypoints/serve/rlhf/api_router.py +102 -0
  259. vllm/entrypoints/serve/rpc/__init__.py +0 -0
  260. vllm/entrypoints/serve/rpc/api_router.py +61 -0
  261. vllm/entrypoints/serve/sleep/__init__.py +0 -0
  262. vllm/entrypoints/serve/sleep/api_router.py +56 -0
  263. vllm/entrypoints/serve/tokenize/__init__.py +0 -0
  264. vllm/entrypoints/serve/tokenize/api_router.py +112 -0
  265. vllm/entrypoints/serve/tokenize/serving.py +204 -0
  266. vllm/entrypoints/ssl.py +78 -0
  267. vllm/entrypoints/tool.py +187 -0
  268. vllm/entrypoints/tool_server.py +234 -0
  269. vllm/entrypoints/utils.py +336 -0
  270. vllm/env_override.py +402 -0
  271. vllm/envs.py +1791 -0
  272. vllm/exceptions.py +36 -0
  273. vllm/forward_context.py +375 -0
  274. vllm/grpc/__init__.py +17 -0
  275. vllm/grpc/compile_protos.py +94 -0
  276. vllm/grpc/vllm_engine.proto +195 -0
  277. vllm/grpc/vllm_engine_pb2.py +77 -0
  278. vllm/grpc/vllm_engine_pb2.pyi +213 -0
  279. vllm/grpc/vllm_engine_pb2_grpc.py +330 -0
  280. vllm/inputs/__init__.py +44 -0
  281. vllm/inputs/data.py +359 -0
  282. vllm/inputs/parse.py +147 -0
  283. vllm/inputs/preprocess.py +716 -0
  284. vllm/logger.py +303 -0
  285. vllm/logging_utils/__init__.py +13 -0
  286. vllm/logging_utils/dump_input.py +83 -0
  287. vllm/logging_utils/formatter.py +127 -0
  288. vllm/logging_utils/lazy.py +20 -0
  289. vllm/logging_utils/log_time.py +34 -0
  290. vllm/logits_process.py +121 -0
  291. vllm/logprobs.py +206 -0
  292. vllm/lora/__init__.py +0 -0
  293. vllm/lora/layers/__init__.py +43 -0
  294. vllm/lora/layers/base.py +66 -0
  295. vllm/lora/layers/base_linear.py +172 -0
  296. vllm/lora/layers/column_parallel_linear.py +577 -0
  297. vllm/lora/layers/fused_moe.py +739 -0
  298. vllm/lora/layers/logits_processor.py +203 -0
  299. vllm/lora/layers/replicated_linear.py +70 -0
  300. vllm/lora/layers/row_parallel_linear.py +176 -0
  301. vllm/lora/layers/utils.py +115 -0
  302. vllm/lora/layers/vocal_parallel_embedding.py +140 -0
  303. vllm/lora/lora_model.py +221 -0
  304. vllm/lora/lora_weights.py +227 -0
  305. vllm/lora/model_manager.py +858 -0
  306. vllm/lora/ops/__init__.py +0 -0
  307. vllm/lora/ops/ipex_ops/__init__.py +6 -0
  308. vllm/lora/ops/ipex_ops/lora_ops.py +57 -0
  309. vllm/lora/ops/torch_ops/__init__.py +20 -0
  310. vllm/lora/ops/torch_ops/lora_ops.py +128 -0
  311. vllm/lora/ops/triton_ops/README_TUNING.md +60 -0
  312. vllm/lora/ops/triton_ops/__init__.py +21 -0
  313. vllm/lora/ops/triton_ops/fused_moe_lora_op.py +677 -0
  314. vllm/lora/ops/triton_ops/kernel_utils.py +340 -0
  315. vllm/lora/ops/triton_ops/lora_expand_op.py +310 -0
  316. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +154 -0
  317. vllm/lora/ops/triton_ops/lora_shrink_op.py +287 -0
  318. vllm/lora/ops/triton_ops/utils.py +313 -0
  319. vllm/lora/peft_helper.py +128 -0
  320. vllm/lora/punica_wrapper/__init__.py +10 -0
  321. vllm/lora/punica_wrapper/punica_base.py +493 -0
  322. vllm/lora/punica_wrapper/punica_cpu.py +351 -0
  323. vllm/lora/punica_wrapper/punica_gpu.py +413 -0
  324. vllm/lora/punica_wrapper/punica_selector.py +21 -0
  325. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  326. vllm/lora/punica_wrapper/utils.py +150 -0
  327. vllm/lora/request.py +60 -0
  328. vllm/lora/resolver.py +88 -0
  329. vllm/lora/utils.py +281 -0
  330. vllm/lora/worker_manager.py +278 -0
  331. vllm/model_executor/__init__.py +9 -0
  332. vllm/model_executor/custom_op.py +203 -0
  333. vllm/model_executor/layers/__init__.py +0 -0
  334. vllm/model_executor/layers/activation.py +628 -0
  335. vllm/model_executor/layers/attention/__init__.py +0 -0
  336. vllm/model_executor/layers/attention/chunked_local_attention.py +130 -0
  337. vllm/model_executor/layers/attention/cross_attention.py +182 -0
  338. vllm/model_executor/layers/attention/encoder_only_attention.py +103 -0
  339. vllm/model_executor/layers/attention/mm_encoder_attention.py +234 -0
  340. vllm/model_executor/layers/attention/static_sink_attention.py +254 -0
  341. vllm/model_executor/layers/attention_layer_base.py +34 -0
  342. vllm/model_executor/layers/batch_invariant.py +1063 -0
  343. vllm/model_executor/layers/conv.py +262 -0
  344. vllm/model_executor/layers/fla/__init__.py +8 -0
  345. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  346. vllm/model_executor/layers/fla/ops/chunk.py +240 -0
  347. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +344 -0
  348. vllm/model_executor/layers/fla/ops/chunk_o.py +183 -0
  349. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +154 -0
  350. vllm/model_executor/layers/fla/ops/cumsum.py +280 -0
  351. vllm/model_executor/layers/fla/ops/fused_recurrent.py +390 -0
  352. vllm/model_executor/layers/fla/ops/index.py +41 -0
  353. vllm/model_executor/layers/fla/ops/kda.py +1351 -0
  354. vllm/model_executor/layers/fla/ops/l2norm.py +146 -0
  355. vllm/model_executor/layers/fla/ops/layernorm_guard.py +396 -0
  356. vllm/model_executor/layers/fla/ops/op.py +60 -0
  357. vllm/model_executor/layers/fla/ops/solve_tril.py +556 -0
  358. vllm/model_executor/layers/fla/ops/utils.py +194 -0
  359. vllm/model_executor/layers/fla/ops/wy_fast.py +158 -0
  360. vllm/model_executor/layers/fused_moe/__init__.py +120 -0
  361. vllm/model_executor/layers/fused_moe/all2all_utils.py +173 -0
  362. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +411 -0
  363. vllm/model_executor/layers/fused_moe/config.py +1111 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json +147 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json +213 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200.json +147 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json +147 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=129,N=704,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json +201 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json +147 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +147 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json +164 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json +147 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=160,N=768,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json +147 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json +147 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json +147 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json +147 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  559. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  560. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  561. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  562. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  563. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  564. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  565. vllm/model_executor/layers/fused_moe/configs/E=64,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  566. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  567. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  568. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  569. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  570. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  571. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  572. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  573. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  574. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  575. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  576. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  577. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  578. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  579. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  580. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  581. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  582. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  583. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  584. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  585. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  586. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  587. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  588. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  589. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  590. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  591. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  592. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  593. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  594. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  595. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  596. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  597. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  598. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  599. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  600. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  601. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  602. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  603. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  604. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  605. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  606. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  607. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  608. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  609. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  610. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  611. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  612. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  613. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  614. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  615. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  616. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  617. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  618. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  619. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  620. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  621. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  622. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  623. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  624. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  625. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  626. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  627. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  628. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  629. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  630. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  631. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  632. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  633. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  634. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  635. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  636. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  637. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  638. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  639. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  640. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  641. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  642. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  643. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  644. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  645. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  646. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  647. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  648. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  649. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  650. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  651. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +444 -0
  652. vllm/model_executor/layers/fused_moe/cutlass_moe.py +1086 -0
  653. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +364 -0
  654. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +427 -0
  655. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +420 -0
  656. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +436 -0
  657. vllm/model_executor/layers/fused_moe/fallback.py +127 -0
  658. vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py +338 -0
  659. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +310 -0
  660. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +371 -0
  661. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +192 -0
  662. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1018 -0
  663. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +824 -0
  664. vllm/model_executor/layers/fused_moe/fused_moe.py +2638 -0
  665. vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +119 -0
  666. vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +117 -0
  667. vllm/model_executor/layers/fused_moe/fused_moe_router.py +40 -0
  668. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +531 -0
  669. vllm/model_executor/layers/fused_moe/layer.py +2169 -0
  670. vllm/model_executor/layers/fused_moe/modular_kernel.py +1251 -0
  671. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +192 -0
  672. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +229 -0
  673. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  674. vllm/model_executor/layers/fused_moe/oracle/__init__.py +2 -0
  675. vllm/model_executor/layers/fused_moe/oracle/fp8.py +358 -0
  676. vllm/model_executor/layers/fused_moe/oracle/nvfp4.py +280 -0
  677. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +362 -0
  678. vllm/model_executor/layers/fused_moe/prepare_finalize.py +87 -0
  679. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +347 -0
  680. vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +324 -0
  681. vllm/model_executor/layers/fused_moe/routing_simulator.py +310 -0
  682. vllm/model_executor/layers/fused_moe/shared_fused_moe.py +96 -0
  683. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +171 -0
  684. vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py +78 -0
  685. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +75 -0
  686. vllm/model_executor/layers/fused_moe/trtllm_moe.py +144 -0
  687. vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +403 -0
  688. vllm/model_executor/layers/fused_moe/utils.py +382 -0
  689. vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py +189 -0
  690. vllm/model_executor/layers/kda.py +442 -0
  691. vllm/model_executor/layers/layernorm.py +451 -0
  692. vllm/model_executor/layers/lightning_attn.py +735 -0
  693. vllm/model_executor/layers/linear.py +1478 -0
  694. vllm/model_executor/layers/logits_processor.py +109 -0
  695. vllm/model_executor/layers/mamba/__init__.py +0 -0
  696. vllm/model_executor/layers/mamba/abstract.py +68 -0
  697. vllm/model_executor/layers/mamba/linear_attn.py +410 -0
  698. vllm/model_executor/layers/mamba/mamba_mixer.py +541 -0
  699. vllm/model_executor/layers/mamba/mamba_mixer2.py +936 -0
  700. vllm/model_executor/layers/mamba/mamba_utils.py +225 -0
  701. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  702. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1240 -0
  703. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +172 -0
  704. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +586 -0
  705. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +211 -0
  706. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +456 -0
  707. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +700 -0
  708. vllm/model_executor/layers/mamba/ops/ssd_combined.py +230 -0
  709. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +157 -0
  710. vllm/model_executor/layers/mamba/short_conv.py +254 -0
  711. vllm/model_executor/layers/mla.py +179 -0
  712. vllm/model_executor/layers/pooler/__init__.py +5 -0
  713. vllm/model_executor/layers/pooler/abstract.py +39 -0
  714. vllm/model_executor/layers/pooler/activations.py +162 -0
  715. vllm/model_executor/layers/pooler/common.py +32 -0
  716. vllm/model_executor/layers/pooler/seqwise/__init__.py +45 -0
  717. vllm/model_executor/layers/pooler/seqwise/heads.py +151 -0
  718. vllm/model_executor/layers/pooler/seqwise/methods.py +93 -0
  719. vllm/model_executor/layers/pooler/seqwise/poolers.py +127 -0
  720. vllm/model_executor/layers/pooler/special.py +128 -0
  721. vllm/model_executor/layers/pooler/tokwise/__init__.py +39 -0
  722. vllm/model_executor/layers/pooler/tokwise/heads.py +133 -0
  723. vllm/model_executor/layers/pooler/tokwise/methods.py +122 -0
  724. vllm/model_executor/layers/pooler/tokwise/poolers.py +127 -0
  725. vllm/model_executor/layers/quantization/__init__.py +195 -0
  726. vllm/model_executor/layers/quantization/auto_round.py +454 -0
  727. vllm/model_executor/layers/quantization/awq.py +277 -0
  728. vllm/model_executor/layers/quantization/awq_marlin.py +795 -0
  729. vllm/model_executor/layers/quantization/awq_triton.py +337 -0
  730. vllm/model_executor/layers/quantization/base_config.py +170 -0
  731. vllm/model_executor/layers/quantization/bitblas.py +502 -0
  732. vllm/model_executor/layers/quantization/bitsandbytes.py +631 -0
  733. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +3 -0
  734. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +982 -0
  735. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2368 -0
  736. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +37 -0
  737. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +392 -0
  738. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  739. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +176 -0
  740. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_mxfp4.py +106 -0
  741. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +124 -0
  742. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +218 -0
  743. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +176 -0
  744. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +153 -0
  745. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +138 -0
  746. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +203 -0
  747. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +125 -0
  748. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +230 -0
  749. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  750. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +260 -0
  751. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +173 -0
  752. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  753. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +64 -0
  754. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  755. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +224 -0
  756. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  757. vllm/model_executor/layers/quantization/cpu_wna16.py +299 -0
  758. vllm/model_executor/layers/quantization/deepspeedfp.py +218 -0
  759. vllm/model_executor/layers/quantization/experts_int8.py +209 -0
  760. vllm/model_executor/layers/quantization/fbgemm_fp8.py +195 -0
  761. vllm/model_executor/layers/quantization/fp8.py +1224 -0
  762. vllm/model_executor/layers/quantization/fp_quant.py +420 -0
  763. vllm/model_executor/layers/quantization/gguf.py +682 -0
  764. vllm/model_executor/layers/quantization/gptq.py +393 -0
  765. vllm/model_executor/layers/quantization/gptq_bitblas.py +482 -0
  766. vllm/model_executor/layers/quantization/gptq_marlin.py +934 -0
  767. vllm/model_executor/layers/quantization/gptq_marlin_24.py +320 -0
  768. vllm/model_executor/layers/quantization/hqq_marlin.py +372 -0
  769. vllm/model_executor/layers/quantization/inc.py +65 -0
  770. vllm/model_executor/layers/quantization/input_quant_fp8.py +212 -0
  771. vllm/model_executor/layers/quantization/ipex_quant.py +403 -0
  772. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  773. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +94 -0
  774. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +113 -0
  775. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  776. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +323 -0
  777. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +98 -0
  778. vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py +126 -0
  779. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +130 -0
  780. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +111 -0
  781. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +168 -0
  782. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +159 -0
  783. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +200 -0
  784. vllm/model_executor/layers/quantization/kernels/mixed_precision/xpu.py +97 -0
  785. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +76 -0
  786. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +77 -0
  787. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +128 -0
  788. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +220 -0
  789. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +147 -0
  790. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +88 -0
  791. vllm/model_executor/layers/quantization/kv_cache.py +153 -0
  792. vllm/model_executor/layers/quantization/modelopt.py +1665 -0
  793. vllm/model_executor/layers/quantization/moe_wna16.py +518 -0
  794. vllm/model_executor/layers/quantization/mxfp4.py +1145 -0
  795. vllm/model_executor/layers/quantization/petit.py +319 -0
  796. vllm/model_executor/layers/quantization/ptpc_fp8.py +140 -0
  797. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  798. vllm/model_executor/layers/quantization/quark/quark.py +570 -0
  799. vllm/model_executor/layers/quantization/quark/quark_moe.py +797 -0
  800. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  801. vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py +343 -0
  802. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  803. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +179 -0
  804. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +139 -0
  805. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  806. vllm/model_executor/layers/quantization/qutlass_utils.py +185 -0
  807. vllm/model_executor/layers/quantization/rtn.py +626 -0
  808. vllm/model_executor/layers/quantization/schema.py +90 -0
  809. vllm/model_executor/layers/quantization/torchao.py +380 -0
  810. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  811. vllm/model_executor/layers/quantization/utils/allspark_utils.py +67 -0
  812. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +229 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  902. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  903. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  904. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  905. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  906. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  907. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  908. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  909. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  910. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  911. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  912. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  913. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  914. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  915. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  916. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  917. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  918. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  919. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  920. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  921. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  922. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  923. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  924. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  925. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  926. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  927. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  928. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  929. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  930. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  931. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  932. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  933. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  934. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  935. vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  936. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  937. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  938. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  939. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  940. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  941. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  942. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  943. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  944. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  945. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  946. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  947. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  948. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  949. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  950. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  951. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  952. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  953. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  954. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  955. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  956. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  957. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  958. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  959. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  960. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  961. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  962. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  963. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  964. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  965. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  966. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  967. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  968. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  969. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  970. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  971. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  972. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  973. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  974. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  975. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  976. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  977. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  978. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  979. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  980. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  981. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  982. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  983. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  984. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  985. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  986. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  987. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  988. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  989. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  990. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  991. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  992. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  993. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  994. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  995. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  996. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  997. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  998. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  999. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1000. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1001. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1002. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1003. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  1004. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1005. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  1006. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1007. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1008. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1009. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1010. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1011. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  1012. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1013. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  1014. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1015. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1016. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1017. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1018. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  1019. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1020. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  1021. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1022. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1023. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1024. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1025. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1026. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1027. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  1028. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +514 -0
  1029. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +370 -0
  1030. vllm/model_executor/layers/quantization/utils/fp8_utils.py +1658 -0
  1031. vllm/model_executor/layers/quantization/utils/gptq_utils.py +158 -0
  1032. vllm/model_executor/layers/quantization/utils/int8_utils.py +477 -0
  1033. vllm/model_executor/layers/quantization/utils/layer_utils.py +41 -0
  1034. vllm/model_executor/layers/quantization/utils/machete_utils.py +56 -0
  1035. vllm/model_executor/layers/quantization/utils/marlin_utils.py +720 -0
  1036. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +565 -0
  1037. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +378 -0
  1038. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +219 -0
  1039. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +467 -0
  1040. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +189 -0
  1041. vllm/model_executor/layers/quantization/utils/mxfp6_utils.py +142 -0
  1042. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +24 -0
  1043. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +142 -0
  1044. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +67 -0
  1045. vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py +51 -0
  1046. vllm/model_executor/layers/quantization/utils/petit_utils.py +124 -0
  1047. vllm/model_executor/layers/quantization/utils/quant_utils.py +767 -0
  1048. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +519 -0
  1049. vllm/model_executor/layers/resampler.py +283 -0
  1050. vllm/model_executor/layers/rotary_embedding/__init__.py +291 -0
  1051. vllm/model_executor/layers/rotary_embedding/base.py +282 -0
  1052. vllm/model_executor/layers/rotary_embedding/common.py +289 -0
  1053. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +184 -0
  1054. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +218 -0
  1055. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +43 -0
  1056. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +68 -0
  1057. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +82 -0
  1058. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  1059. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  1060. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +83 -0
  1061. vllm/model_executor/layers/rotary_embedding/mrope.py +412 -0
  1062. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +47 -0
  1063. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +159 -0
  1064. vllm/model_executor/layers/rotary_embedding/xdrope.py +160 -0
  1065. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +84 -0
  1066. vllm/model_executor/layers/utils.py +251 -0
  1067. vllm/model_executor/layers/vocab_parallel_embedding.py +564 -0
  1068. vllm/model_executor/model_loader/__init__.py +150 -0
  1069. vllm/model_executor/model_loader/base_loader.py +71 -0
  1070. vllm/model_executor/model_loader/bitsandbytes_loader.py +821 -0
  1071. vllm/model_executor/model_loader/default_loader.py +304 -0
  1072. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  1073. vllm/model_executor/model_loader/gguf_loader.py +371 -0
  1074. vllm/model_executor/model_loader/online_quantization.py +275 -0
  1075. vllm/model_executor/model_loader/runai_streamer_loader.py +115 -0
  1076. vllm/model_executor/model_loader/sharded_state_loader.py +214 -0
  1077. vllm/model_executor/model_loader/tensorizer.py +793 -0
  1078. vllm/model_executor/model_loader/tensorizer_loader.py +151 -0
  1079. vllm/model_executor/model_loader/utils.py +299 -0
  1080. vllm/model_executor/model_loader/weight_utils.py +1183 -0
  1081. vllm/model_executor/models/__init__.py +44 -0
  1082. vllm/model_executor/models/adapters.py +592 -0
  1083. vllm/model_executor/models/afmoe.py +697 -0
  1084. vllm/model_executor/models/aimv2.py +248 -0
  1085. vllm/model_executor/models/apertus.py +567 -0
  1086. vllm/model_executor/models/arcee.py +428 -0
  1087. vllm/model_executor/models/arctic.py +633 -0
  1088. vllm/model_executor/models/aria.py +663 -0
  1089. vllm/model_executor/models/audioflamingo3.py +639 -0
  1090. vllm/model_executor/models/aya_vision.py +448 -0
  1091. vllm/model_executor/models/bagel.py +591 -0
  1092. vllm/model_executor/models/baichuan.py +493 -0
  1093. vllm/model_executor/models/bailing_moe.py +643 -0
  1094. vllm/model_executor/models/bamba.py +511 -0
  1095. vllm/model_executor/models/bee.py +157 -0
  1096. vllm/model_executor/models/bert.py +911 -0
  1097. vllm/model_executor/models/bert_with_rope.py +729 -0
  1098. vllm/model_executor/models/blip.py +350 -0
  1099. vllm/model_executor/models/blip2.py +736 -0
  1100. vllm/model_executor/models/bloom.py +390 -0
  1101. vllm/model_executor/models/chameleon.py +1095 -0
  1102. vllm/model_executor/models/chatglm.py +502 -0
  1103. vllm/model_executor/models/clip.py +1045 -0
  1104. vllm/model_executor/models/cohere2_vision.py +470 -0
  1105. vllm/model_executor/models/commandr.py +469 -0
  1106. vllm/model_executor/models/config.py +571 -0
  1107. vllm/model_executor/models/dbrx.py +484 -0
  1108. vllm/model_executor/models/deepencoder.py +679 -0
  1109. vllm/model_executor/models/deepseek_eagle.py +253 -0
  1110. vllm/model_executor/models/deepseek_mtp.py +447 -0
  1111. vllm/model_executor/models/deepseek_ocr.py +601 -0
  1112. vllm/model_executor/models/deepseek_v2.py +1727 -0
  1113. vllm/model_executor/models/deepseek_vl2.py +642 -0
  1114. vllm/model_executor/models/dots1.py +566 -0
  1115. vllm/model_executor/models/dots_ocr.py +830 -0
  1116. vllm/model_executor/models/ernie45.py +53 -0
  1117. vllm/model_executor/models/ernie45_moe.py +755 -0
  1118. vllm/model_executor/models/ernie45_vl.py +1702 -0
  1119. vllm/model_executor/models/ernie45_vl_moe.py +801 -0
  1120. vllm/model_executor/models/ernie_mtp.py +278 -0
  1121. vllm/model_executor/models/exaone.py +524 -0
  1122. vllm/model_executor/models/exaone4.py +518 -0
  1123. vllm/model_executor/models/exaone_moe.py +579 -0
  1124. vllm/model_executor/models/exaone_moe_mtp.py +255 -0
  1125. vllm/model_executor/models/fairseq2_llama.py +154 -0
  1126. vllm/model_executor/models/falcon.py +543 -0
  1127. vllm/model_executor/models/falcon_h1.py +675 -0
  1128. vllm/model_executor/models/flex_olmo.py +155 -0
  1129. vllm/model_executor/models/fuyu.py +371 -0
  1130. vllm/model_executor/models/gemma.py +425 -0
  1131. vllm/model_executor/models/gemma2.py +435 -0
  1132. vllm/model_executor/models/gemma3.py +520 -0
  1133. vllm/model_executor/models/gemma3_mm.py +664 -0
  1134. vllm/model_executor/models/gemma3n.py +1166 -0
  1135. vllm/model_executor/models/gemma3n_audio_utils.py +57 -0
  1136. vllm/model_executor/models/gemma3n_mm.py +820 -0
  1137. vllm/model_executor/models/glm.py +24 -0
  1138. vllm/model_executor/models/glm4.py +295 -0
  1139. vllm/model_executor/models/glm4_1v.py +1823 -0
  1140. vllm/model_executor/models/glm4_moe.py +725 -0
  1141. vllm/model_executor/models/glm4_moe_mtp.py +365 -0
  1142. vllm/model_executor/models/glm4v.py +783 -0
  1143. vllm/model_executor/models/glmasr.py +1154 -0
  1144. vllm/model_executor/models/glmasr_utils.py +188 -0
  1145. vllm/model_executor/models/gpt2.py +385 -0
  1146. vllm/model_executor/models/gpt_bigcode.py +339 -0
  1147. vllm/model_executor/models/gpt_j.py +346 -0
  1148. vllm/model_executor/models/gpt_neox.py +340 -0
  1149. vllm/model_executor/models/gpt_oss.py +745 -0
  1150. vllm/model_executor/models/granite.py +475 -0
  1151. vllm/model_executor/models/granite_speech.py +919 -0
  1152. vllm/model_executor/models/granitemoe.py +561 -0
  1153. vllm/model_executor/models/granitemoehybrid.py +703 -0
  1154. vllm/model_executor/models/granitemoeshared.py +328 -0
  1155. vllm/model_executor/models/gritlm.py +242 -0
  1156. vllm/model_executor/models/grok1.py +803 -0
  1157. vllm/model_executor/models/h2ovl.py +554 -0
  1158. vllm/model_executor/models/hunyuan_v1.py +1042 -0
  1159. vllm/model_executor/models/hunyuan_vision.py +1034 -0
  1160. vllm/model_executor/models/hyperclovax_vision.py +1163 -0
  1161. vllm/model_executor/models/idefics2_vision_model.py +427 -0
  1162. vllm/model_executor/models/idefics3.py +734 -0
  1163. vllm/model_executor/models/interfaces.py +1180 -0
  1164. vllm/model_executor/models/interfaces_base.py +252 -0
  1165. vllm/model_executor/models/intern_vit.py +454 -0
  1166. vllm/model_executor/models/internlm2.py +451 -0
  1167. vllm/model_executor/models/internlm2_ve.py +139 -0
  1168. vllm/model_executor/models/interns1.py +828 -0
  1169. vllm/model_executor/models/interns1_vit.py +433 -0
  1170. vllm/model_executor/models/internvl.py +1436 -0
  1171. vllm/model_executor/models/iquest_loopcoder.py +595 -0
  1172. vllm/model_executor/models/isaac.py +1503 -0
  1173. vllm/model_executor/models/jais.py +397 -0
  1174. vllm/model_executor/models/jais2.py +508 -0
  1175. vllm/model_executor/models/jamba.py +599 -0
  1176. vllm/model_executor/models/jina_vl.py +145 -0
  1177. vllm/model_executor/models/kanana_v.py +756 -0
  1178. vllm/model_executor/models/keye.py +1709 -0
  1179. vllm/model_executor/models/keye_vl1_5.py +726 -0
  1180. vllm/model_executor/models/kimi_linear.py +659 -0
  1181. vllm/model_executor/models/kimi_vl.py +577 -0
  1182. vllm/model_executor/models/lfm2.py +515 -0
  1183. vllm/model_executor/models/lfm2_moe.py +746 -0
  1184. vllm/model_executor/models/lfm2_vl.py +732 -0
  1185. vllm/model_executor/models/lightonocr.py +197 -0
  1186. vllm/model_executor/models/llama.py +724 -0
  1187. vllm/model_executor/models/llama4.py +860 -0
  1188. vllm/model_executor/models/llama4_eagle.py +225 -0
  1189. vllm/model_executor/models/llama_eagle.py +213 -0
  1190. vllm/model_executor/models/llama_eagle3.py +375 -0
  1191. vllm/model_executor/models/llava.py +879 -0
  1192. vllm/model_executor/models/llava_next.py +583 -0
  1193. vllm/model_executor/models/llava_next_video.py +467 -0
  1194. vllm/model_executor/models/llava_onevision.py +922 -0
  1195. vllm/model_executor/models/longcat_flash.py +767 -0
  1196. vllm/model_executor/models/longcat_flash_mtp.py +348 -0
  1197. vllm/model_executor/models/mamba.py +276 -0
  1198. vllm/model_executor/models/mamba2.py +288 -0
  1199. vllm/model_executor/models/medusa.py +179 -0
  1200. vllm/model_executor/models/midashenglm.py +826 -0
  1201. vllm/model_executor/models/mimo.py +188 -0
  1202. vllm/model_executor/models/mimo_mtp.py +294 -0
  1203. vllm/model_executor/models/mimo_v2_flash.py +718 -0
  1204. vllm/model_executor/models/minicpm.py +660 -0
  1205. vllm/model_executor/models/minicpm3.py +233 -0
  1206. vllm/model_executor/models/minicpm_eagle.py +386 -0
  1207. vllm/model_executor/models/minicpmo.py +768 -0
  1208. vllm/model_executor/models/minicpmv.py +1742 -0
  1209. vllm/model_executor/models/minimax_m2.py +552 -0
  1210. vllm/model_executor/models/minimax_text_01.py +1008 -0
  1211. vllm/model_executor/models/minimax_vl_01.py +395 -0
  1212. vllm/model_executor/models/mistral3.py +638 -0
  1213. vllm/model_executor/models/mistral_large_3.py +63 -0
  1214. vllm/model_executor/models/mistral_large_3_eagle.py +137 -0
  1215. vllm/model_executor/models/mixtral.py +599 -0
  1216. vllm/model_executor/models/mllama4.py +1170 -0
  1217. vllm/model_executor/models/mlp_speculator.py +235 -0
  1218. vllm/model_executor/models/modernbert.py +458 -0
  1219. vllm/model_executor/models/module_mapping.py +74 -0
  1220. vllm/model_executor/models/molmo.py +1592 -0
  1221. vllm/model_executor/models/moonvit.py +601 -0
  1222. vllm/model_executor/models/mpt.py +335 -0
  1223. vllm/model_executor/models/nano_nemotron_vl.py +1725 -0
  1224. vllm/model_executor/models/nemotron.py +499 -0
  1225. vllm/model_executor/models/nemotron_h.py +902 -0
  1226. vllm/model_executor/models/nemotron_nas.py +474 -0
  1227. vllm/model_executor/models/nemotron_parse.py +958 -0
  1228. vllm/model_executor/models/nemotron_vl.py +651 -0
  1229. vllm/model_executor/models/nvlm_d.py +216 -0
  1230. vllm/model_executor/models/olmo.py +412 -0
  1231. vllm/model_executor/models/olmo2.py +454 -0
  1232. vllm/model_executor/models/olmoe.py +498 -0
  1233. vllm/model_executor/models/opencua.py +262 -0
  1234. vllm/model_executor/models/openpangu.py +1378 -0
  1235. vllm/model_executor/models/openpangu_mtp.py +265 -0
  1236. vllm/model_executor/models/opt.py +426 -0
  1237. vllm/model_executor/models/orion.py +365 -0
  1238. vllm/model_executor/models/ouro.py +507 -0
  1239. vllm/model_executor/models/ovis.py +557 -0
  1240. vllm/model_executor/models/ovis2_5.py +661 -0
  1241. vllm/model_executor/models/paddleocr_vl.py +1261 -0
  1242. vllm/model_executor/models/paligemma.py +429 -0
  1243. vllm/model_executor/models/persimmon.py +373 -0
  1244. vllm/model_executor/models/phi.py +363 -0
  1245. vllm/model_executor/models/phi3.py +18 -0
  1246. vllm/model_executor/models/phi3v.py +729 -0
  1247. vllm/model_executor/models/phi4mm.py +1250 -0
  1248. vllm/model_executor/models/phi4mm_audio.py +1296 -0
  1249. vllm/model_executor/models/phi4mm_utils.py +1907 -0
  1250. vllm/model_executor/models/phimoe.py +671 -0
  1251. vllm/model_executor/models/pixtral.py +1437 -0
  1252. vllm/model_executor/models/plamo2.py +993 -0
  1253. vllm/model_executor/models/plamo3.py +437 -0
  1254. vllm/model_executor/models/qwen.py +377 -0
  1255. vllm/model_executor/models/qwen2.py +600 -0
  1256. vllm/model_executor/models/qwen2_5_omni_thinker.py +1200 -0
  1257. vllm/model_executor/models/qwen2_5_vl.py +1598 -0
  1258. vllm/model_executor/models/qwen2_audio.py +478 -0
  1259. vllm/model_executor/models/qwen2_moe.py +604 -0
  1260. vllm/model_executor/models/qwen2_rm.py +120 -0
  1261. vllm/model_executor/models/qwen2_vl.py +1588 -0
  1262. vllm/model_executor/models/qwen3.py +331 -0
  1263. vllm/model_executor/models/qwen3_moe.py +752 -0
  1264. vllm/model_executor/models/qwen3_next.py +1410 -0
  1265. vllm/model_executor/models/qwen3_next_mtp.py +293 -0
  1266. vllm/model_executor/models/qwen3_omni_moe_thinker.py +1814 -0
  1267. vllm/model_executor/models/qwen3_vl.py +2120 -0
  1268. vllm/model_executor/models/qwen3_vl_moe.py +474 -0
  1269. vllm/model_executor/models/qwen_vl.py +821 -0
  1270. vllm/model_executor/models/radio.py +573 -0
  1271. vllm/model_executor/models/registry.py +1218 -0
  1272. vllm/model_executor/models/roberta.py +239 -0
  1273. vllm/model_executor/models/rvl.py +107 -0
  1274. vllm/model_executor/models/seed_oss.py +492 -0
  1275. vllm/model_executor/models/siglip.py +1259 -0
  1276. vllm/model_executor/models/siglip2.py +495 -0
  1277. vllm/model_executor/models/siglip2navit.py +660 -0
  1278. vllm/model_executor/models/skyworkr1v.py +951 -0
  1279. vllm/model_executor/models/smolvlm.py +38 -0
  1280. vllm/model_executor/models/solar.py +484 -0
  1281. vllm/model_executor/models/stablelm.py +354 -0
  1282. vllm/model_executor/models/starcoder2.py +365 -0
  1283. vllm/model_executor/models/step3_text.py +554 -0
  1284. vllm/model_executor/models/step3_vl.py +1147 -0
  1285. vllm/model_executor/models/swin.py +500 -0
  1286. vllm/model_executor/models/tarsier.py +624 -0
  1287. vllm/model_executor/models/telechat2.py +153 -0
  1288. vllm/model_executor/models/teleflm.py +78 -0
  1289. vllm/model_executor/models/terratorch.py +318 -0
  1290. vllm/model_executor/models/transformers/__init__.py +127 -0
  1291. vllm/model_executor/models/transformers/base.py +523 -0
  1292. vllm/model_executor/models/transformers/causal.py +65 -0
  1293. vllm/model_executor/models/transformers/legacy.py +90 -0
  1294. vllm/model_executor/models/transformers/moe.py +329 -0
  1295. vllm/model_executor/models/transformers/multimodal.py +441 -0
  1296. vllm/model_executor/models/transformers/pooling.py +102 -0
  1297. vllm/model_executor/models/transformers/utils.py +253 -0
  1298. vllm/model_executor/models/ultravox.py +786 -0
  1299. vllm/model_executor/models/utils.py +832 -0
  1300. vllm/model_executor/models/vision.py +546 -0
  1301. vllm/model_executor/models/voxtral.py +867 -0
  1302. vllm/model_executor/models/voxtral_streaming.py +304 -0
  1303. vllm/model_executor/models/whisper.py +993 -0
  1304. vllm/model_executor/models/whisper_utils.py +299 -0
  1305. vllm/model_executor/models/zamba2.py +986 -0
  1306. vllm/model_executor/parameter.py +642 -0
  1307. vllm/model_executor/utils.py +113 -0
  1308. vllm/model_executor/warmup/__init__.py +0 -0
  1309. vllm/model_executor/warmup/deep_gemm_warmup.py +371 -0
  1310. vllm/model_executor/warmup/kernel_warmup.py +97 -0
  1311. vllm/model_inspection.py +136 -0
  1312. vllm/multimodal/__init__.py +38 -0
  1313. vllm/multimodal/audio.py +287 -0
  1314. vllm/multimodal/base.py +60 -0
  1315. vllm/multimodal/cache.py +829 -0
  1316. vllm/multimodal/evs.py +294 -0
  1317. vllm/multimodal/hasher.py +123 -0
  1318. vllm/multimodal/image.py +155 -0
  1319. vllm/multimodal/inputs.py +1027 -0
  1320. vllm/multimodal/parse.py +674 -0
  1321. vllm/multimodal/processing.py +2469 -0
  1322. vllm/multimodal/profiling.py +351 -0
  1323. vllm/multimodal/registry.py +375 -0
  1324. vllm/multimodal/utils.py +550 -0
  1325. vllm/multimodal/video.py +512 -0
  1326. vllm/outputs.py +347 -0
  1327. vllm/platforms/__init__.py +277 -0
  1328. vllm/platforms/cpu.py +423 -0
  1329. vllm/platforms/cuda.py +618 -0
  1330. vllm/platforms/interface.py +707 -0
  1331. vllm/platforms/rocm.py +586 -0
  1332. vllm/platforms/tpu.py +20 -0
  1333. vllm/platforms/xpu.py +262 -0
  1334. vllm/plugins/__init__.py +81 -0
  1335. vllm/plugins/io_processors/__init__.py +68 -0
  1336. vllm/plugins/io_processors/interface.py +77 -0
  1337. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1338. vllm/plugins/lora_resolvers/filesystem_resolver.py +52 -0
  1339. vllm/pooling_params.py +229 -0
  1340. vllm/profiler/__init__.py +0 -0
  1341. vllm/profiler/layerwise_profile.py +392 -0
  1342. vllm/profiler/utils.py +151 -0
  1343. vllm/profiler/wrapper.py +241 -0
  1344. vllm/py.typed +2 -0
  1345. vllm/ray/__init__.py +0 -0
  1346. vllm/ray/lazy_utils.py +30 -0
  1347. vllm/ray/ray_env.py +79 -0
  1348. vllm/reasoning/__init__.py +96 -0
  1349. vllm/reasoning/abs_reasoning_parsers.py +318 -0
  1350. vllm/reasoning/basic_parsers.py +175 -0
  1351. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1352. vllm/reasoning/deepseek_v3_reasoning_parser.py +69 -0
  1353. vllm/reasoning/ernie45_reasoning_parser.py +165 -0
  1354. vllm/reasoning/glm4_moe_reasoning_parser.py +13 -0
  1355. vllm/reasoning/gptoss_reasoning_parser.py +173 -0
  1356. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1357. vllm/reasoning/holo2_reasoning_parser.py +89 -0
  1358. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +237 -0
  1359. vllm/reasoning/identity_reasoning_parser.py +63 -0
  1360. vllm/reasoning/minimax_m2_reasoning_parser.py +110 -0
  1361. vllm/reasoning/mistral_reasoning_parser.py +154 -0
  1362. vllm/reasoning/olmo3_reasoning_parser.py +302 -0
  1363. vllm/reasoning/qwen3_reasoning_parser.py +67 -0
  1364. vllm/reasoning/seedoss_reasoning_parser.py +27 -0
  1365. vllm/reasoning/step3_reasoning_parser.py +113 -0
  1366. vllm/sampling_params.py +629 -0
  1367. vllm/scalar_type.py +355 -0
  1368. vllm/scripts.py +17 -0
  1369. vllm/sequence.py +64 -0
  1370. vllm/tasks.py +13 -0
  1371. vllm/third_party/__init__.py +0 -0
  1372. vllm/third_party/pynvml.py +6140 -0
  1373. vllm/tokenizers/__init__.py +18 -0
  1374. vllm/tokenizers/deepseek_v32.py +187 -0
  1375. vllm/tokenizers/deepseek_v32_encoding.py +463 -0
  1376. vllm/tokenizers/detokenizer_utils.py +198 -0
  1377. vllm/tokenizers/grok2.py +443 -0
  1378. vllm/tokenizers/hf.py +119 -0
  1379. vllm/tokenizers/mistral.py +543 -0
  1380. vllm/tokenizers/protocol.py +123 -0
  1381. vllm/tokenizers/registry.py +238 -0
  1382. vllm/tool_parsers/__init__.py +158 -0
  1383. vllm/tool_parsers/abstract_tool_parser.py +274 -0
  1384. vllm/tool_parsers/deepseekv31_tool_parser.py +388 -0
  1385. vllm/tool_parsers/deepseekv32_tool_parser.py +591 -0
  1386. vllm/tool_parsers/deepseekv3_tool_parser.py +390 -0
  1387. vllm/tool_parsers/ernie45_tool_parser.py +210 -0
  1388. vllm/tool_parsers/functiongemma_tool_parser.py +321 -0
  1389. vllm/tool_parsers/gigachat3_tool_parser.py +190 -0
  1390. vllm/tool_parsers/glm47_moe_tool_parser.py +23 -0
  1391. vllm/tool_parsers/glm4_moe_tool_parser.py +215 -0
  1392. vllm/tool_parsers/granite_20b_fc_tool_parser.py +273 -0
  1393. vllm/tool_parsers/granite_tool_parser.py +253 -0
  1394. vllm/tool_parsers/hermes_tool_parser.py +495 -0
  1395. vllm/tool_parsers/hunyuan_a13b_tool_parser.py +420 -0
  1396. vllm/tool_parsers/internlm2_tool_parser.py +227 -0
  1397. vllm/tool_parsers/jamba_tool_parser.py +323 -0
  1398. vllm/tool_parsers/kimi_k2_tool_parser.py +598 -0
  1399. vllm/tool_parsers/llama4_pythonic_tool_parser.py +341 -0
  1400. vllm/tool_parsers/llama_tool_parser.py +324 -0
  1401. vllm/tool_parsers/longcat_tool_parser.py +37 -0
  1402. vllm/tool_parsers/minimax_m2_tool_parser.py +776 -0
  1403. vllm/tool_parsers/minimax_tool_parser.py +849 -0
  1404. vllm/tool_parsers/mistral_tool_parser.py +612 -0
  1405. vllm/tool_parsers/olmo3_tool_parser.py +366 -0
  1406. vllm/tool_parsers/openai_tool_parser.py +111 -0
  1407. vllm/tool_parsers/phi4mini_tool_parser.py +120 -0
  1408. vllm/tool_parsers/pythonic_tool_parser.py +332 -0
  1409. vllm/tool_parsers/qwen3coder_tool_parser.py +781 -0
  1410. vllm/tool_parsers/qwen3xml_tool_parser.py +1316 -0
  1411. vllm/tool_parsers/seed_oss_tool_parser.py +744 -0
  1412. vllm/tool_parsers/step3_tool_parser.py +303 -0
  1413. vllm/tool_parsers/utils.py +229 -0
  1414. vllm/tool_parsers/xlam_tool_parser.py +556 -0
  1415. vllm/tracing.py +135 -0
  1416. vllm/transformers_utils/__init__.py +26 -0
  1417. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1418. vllm/transformers_utils/chat_templates/registry.py +73 -0
  1419. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1420. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1421. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1422. vllm/transformers_utils/chat_templates/template_deepseek_ocr.jinja +14 -0
  1423. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1424. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1425. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1426. vllm/transformers_utils/config.py +1169 -0
  1427. vllm/transformers_utils/config_parser_base.py +20 -0
  1428. vllm/transformers_utils/configs/__init__.py +106 -0
  1429. vllm/transformers_utils/configs/afmoe.py +87 -0
  1430. vllm/transformers_utils/configs/arctic.py +216 -0
  1431. vllm/transformers_utils/configs/bagel.py +53 -0
  1432. vllm/transformers_utils/configs/chatglm.py +75 -0
  1433. vllm/transformers_utils/configs/deepseek_vl2.py +126 -0
  1434. vllm/transformers_utils/configs/dotsocr.py +71 -0
  1435. vllm/transformers_utils/configs/eagle.py +90 -0
  1436. vllm/transformers_utils/configs/falcon.py +89 -0
  1437. vllm/transformers_utils/configs/flex_olmo.py +82 -0
  1438. vllm/transformers_utils/configs/hunyuan_vl.py +322 -0
  1439. vllm/transformers_utils/configs/isaac.py +100 -0
  1440. vllm/transformers_utils/configs/jais.py +243 -0
  1441. vllm/transformers_utils/configs/kimi_linear.py +148 -0
  1442. vllm/transformers_utils/configs/kimi_vl.py +38 -0
  1443. vllm/transformers_utils/configs/lfm2_moe.py +163 -0
  1444. vllm/transformers_utils/configs/medusa.py +65 -0
  1445. vllm/transformers_utils/configs/midashenglm.py +103 -0
  1446. vllm/transformers_utils/configs/mistral.py +263 -0
  1447. vllm/transformers_utils/configs/mlp_speculator.py +69 -0
  1448. vllm/transformers_utils/configs/moonvit.py +33 -0
  1449. vllm/transformers_utils/configs/nemotron.py +220 -0
  1450. vllm/transformers_utils/configs/nemotron_h.py +284 -0
  1451. vllm/transformers_utils/configs/olmo3.py +83 -0
  1452. vllm/transformers_utils/configs/ovis.py +182 -0
  1453. vllm/transformers_utils/configs/qwen3_next.py +277 -0
  1454. vllm/transformers_utils/configs/radio.py +98 -0
  1455. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1456. vllm/transformers_utils/configs/speculators/algos.py +38 -0
  1457. vllm/transformers_utils/configs/speculators/base.py +114 -0
  1458. vllm/transformers_utils/configs/step3_vl.py +178 -0
  1459. vllm/transformers_utils/configs/tarsier2.py +24 -0
  1460. vllm/transformers_utils/configs/ultravox.py +120 -0
  1461. vllm/transformers_utils/dynamic_module.py +70 -0
  1462. vllm/transformers_utils/gguf_utils.py +280 -0
  1463. vllm/transformers_utils/model_arch_config_convertor.py +402 -0
  1464. vllm/transformers_utils/processor.py +424 -0
  1465. vllm/transformers_utils/processors/__init__.py +25 -0
  1466. vllm/transformers_utils/processors/bagel.py +78 -0
  1467. vllm/transformers_utils/processors/deepseek_ocr.py +438 -0
  1468. vllm/transformers_utils/processors/deepseek_vl2.py +406 -0
  1469. vllm/transformers_utils/processors/hunyuan_vl.py +233 -0
  1470. vllm/transformers_utils/processors/hunyuan_vl_image.py +477 -0
  1471. vllm/transformers_utils/processors/ovis.py +453 -0
  1472. vllm/transformers_utils/processors/ovis2_5.py +468 -0
  1473. vllm/transformers_utils/repo_utils.py +287 -0
  1474. vllm/transformers_utils/runai_utils.py +102 -0
  1475. vllm/transformers_utils/s3_utils.py +95 -0
  1476. vllm/transformers_utils/tokenizer.py +19 -0
  1477. vllm/transformers_utils/utils.py +112 -0
  1478. vllm/triton_utils/__init__.py +20 -0
  1479. vllm/triton_utils/importing.py +103 -0
  1480. vllm/usage/__init__.py +0 -0
  1481. vllm/usage/usage_lib.py +278 -0
  1482. vllm/utils/__init__.py +36 -0
  1483. vllm/utils/argparse_utils.py +491 -0
  1484. vllm/utils/async_utils.py +310 -0
  1485. vllm/utils/cache.py +214 -0
  1486. vllm/utils/collection_utils.py +112 -0
  1487. vllm/utils/counter.py +45 -0
  1488. vllm/utils/deep_gemm.py +424 -0
  1489. vllm/utils/flashinfer.py +602 -0
  1490. vllm/utils/func_utils.py +236 -0
  1491. vllm/utils/gc_utils.py +151 -0
  1492. vllm/utils/hashing.py +117 -0
  1493. vllm/utils/import_utils.py +438 -0
  1494. vllm/utils/jsontree.py +158 -0
  1495. vllm/utils/math_utils.py +32 -0
  1496. vllm/utils/mem_constants.py +13 -0
  1497. vllm/utils/mem_utils.py +285 -0
  1498. vllm/utils/nccl.py +64 -0
  1499. vllm/utils/network_utils.py +331 -0
  1500. vllm/utils/nvtx_pytorch_hooks.py +286 -0
  1501. vllm/utils/platform_utils.py +59 -0
  1502. vllm/utils/profiling.py +56 -0
  1503. vllm/utils/registry.py +51 -0
  1504. vllm/utils/serial_utils.py +214 -0
  1505. vllm/utils/system_utils.py +296 -0
  1506. vllm/utils/tensor_schema.py +255 -0
  1507. vllm/utils/torch_utils.py +781 -0
  1508. vllm/v1/__init__.py +0 -0
  1509. vllm/v1/attention/__init__.py +0 -0
  1510. vllm/v1/attention/backend.py +736 -0
  1511. vllm/v1/attention/backends/__init__.py +0 -0
  1512. vllm/v1/attention/backends/cpu_attn.py +501 -0
  1513. vllm/v1/attention/backends/fa_utils.py +126 -0
  1514. vllm/v1/attention/backends/flash_attn.py +1092 -0
  1515. vllm/v1/attention/backends/flash_attn_diffkv.py +277 -0
  1516. vllm/v1/attention/backends/flashinfer.py +1713 -0
  1517. vllm/v1/attention/backends/flex_attention.py +1024 -0
  1518. vllm/v1/attention/backends/gdn_attn.py +382 -0
  1519. vllm/v1/attention/backends/linear_attn.py +77 -0
  1520. vllm/v1/attention/backends/mamba1_attn.py +28 -0
  1521. vllm/v1/attention/backends/mamba2_attn.py +256 -0
  1522. vllm/v1/attention/backends/mamba_attn.py +313 -0
  1523. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1524. vllm/v1/attention/backends/mla/aiter_triton_mla.py +66 -0
  1525. vllm/v1/attention/backends/mla/common.py +2156 -0
  1526. vllm/v1/attention/backends/mla/cutlass_mla.py +278 -0
  1527. vllm/v1/attention/backends/mla/flashattn_mla.py +348 -0
  1528. vllm/v1/attention/backends/mla/flashinfer_mla.py +175 -0
  1529. vllm/v1/attention/backends/mla/flashmla.py +321 -0
  1530. vllm/v1/attention/backends/mla/flashmla_sparse.py +1021 -0
  1531. vllm/v1/attention/backends/mla/indexer.py +345 -0
  1532. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +284 -0
  1533. vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py +321 -0
  1534. vllm/v1/attention/backends/mla/triton_mla.py +171 -0
  1535. vllm/v1/attention/backends/registry.py +258 -0
  1536. vllm/v1/attention/backends/rocm_aiter_fa.py +1000 -0
  1537. vllm/v1/attention/backends/rocm_aiter_unified_attn.py +206 -0
  1538. vllm/v1/attention/backends/rocm_attn.py +405 -0
  1539. vllm/v1/attention/backends/short_conv_attn.py +26 -0
  1540. vllm/v1/attention/backends/tree_attn.py +430 -0
  1541. vllm/v1/attention/backends/triton_attn.py +578 -0
  1542. vllm/v1/attention/backends/utils.py +978 -0
  1543. vllm/v1/attention/ops/__init__.py +0 -0
  1544. vllm/v1/attention/ops/chunked_prefill_paged_decode.py +459 -0
  1545. vllm/v1/attention/ops/common.py +469 -0
  1546. vllm/v1/attention/ops/flashmla.py +254 -0
  1547. vllm/v1/attention/ops/merge_attn_states.py +47 -0
  1548. vllm/v1/attention/ops/paged_attn.py +51 -0
  1549. vllm/v1/attention/ops/pallas_kv_cache_update.py +130 -0
  1550. vllm/v1/attention/ops/prefix_prefill.py +862 -0
  1551. vllm/v1/attention/ops/rocm_aiter_mla_sparse.py +210 -0
  1552. vllm/v1/attention/ops/triton_decode_attention.py +709 -0
  1553. vllm/v1/attention/ops/triton_merge_attn_states.py +116 -0
  1554. vllm/v1/attention/ops/triton_prefill_attention.py +272 -0
  1555. vllm/v1/attention/ops/triton_reshape_and_cache_flash.py +395 -0
  1556. vllm/v1/attention/ops/triton_unified_attention.py +1088 -0
  1557. vllm/v1/attention/ops/vit_attn_wrappers.py +185 -0
  1558. vllm/v1/attention/selector.py +145 -0
  1559. vllm/v1/core/__init__.py +0 -0
  1560. vllm/v1/core/block_pool.py +489 -0
  1561. vllm/v1/core/encoder_cache_manager.py +402 -0
  1562. vllm/v1/core/kv_cache_coordinator.py +560 -0
  1563. vllm/v1/core/kv_cache_manager.py +485 -0
  1564. vllm/v1/core/kv_cache_metrics.py +96 -0
  1565. vllm/v1/core/kv_cache_utils.py +1642 -0
  1566. vllm/v1/core/sched/__init__.py +0 -0
  1567. vllm/v1/core/sched/async_scheduler.py +66 -0
  1568. vllm/v1/core/sched/interface.py +205 -0
  1569. vllm/v1/core/sched/output.py +261 -0
  1570. vllm/v1/core/sched/request_queue.py +208 -0
  1571. vllm/v1/core/sched/scheduler.py +1936 -0
  1572. vllm/v1/core/sched/utils.py +64 -0
  1573. vllm/v1/core/single_type_kv_cache_manager.py +926 -0
  1574. vllm/v1/cudagraph_dispatcher.py +183 -0
  1575. vllm/v1/engine/__init__.py +224 -0
  1576. vllm/v1/engine/async_llm.py +874 -0
  1577. vllm/v1/engine/coordinator.py +396 -0
  1578. vllm/v1/engine/core.py +1614 -0
  1579. vllm/v1/engine/core_client.py +1422 -0
  1580. vllm/v1/engine/detokenizer.py +351 -0
  1581. vllm/v1/engine/exceptions.py +18 -0
  1582. vllm/v1/engine/input_processor.py +713 -0
  1583. vllm/v1/engine/llm_engine.py +415 -0
  1584. vllm/v1/engine/logprobs.py +245 -0
  1585. vllm/v1/engine/output_processor.py +715 -0
  1586. vllm/v1/engine/parallel_sampling.py +150 -0
  1587. vllm/v1/engine/utils.py +1086 -0
  1588. vllm/v1/executor/__init__.py +6 -0
  1589. vllm/v1/executor/abstract.py +352 -0
  1590. vllm/v1/executor/multiproc_executor.py +888 -0
  1591. vllm/v1/executor/ray_distributed_executor.py +8 -0
  1592. vllm/v1/executor/ray_executor.py +623 -0
  1593. vllm/v1/executor/ray_utils.py +468 -0
  1594. vllm/v1/executor/uniproc_executor.py +186 -0
  1595. vllm/v1/kv_cache_interface.py +485 -0
  1596. vllm/v1/kv_offload/__init__.py +0 -0
  1597. vllm/v1/kv_offload/abstract.py +161 -0
  1598. vllm/v1/kv_offload/arc_manager.py +237 -0
  1599. vllm/v1/kv_offload/backend.py +97 -0
  1600. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1601. vllm/v1/kv_offload/backends/cpu.py +62 -0
  1602. vllm/v1/kv_offload/cpu.py +109 -0
  1603. vllm/v1/kv_offload/factory.py +58 -0
  1604. vllm/v1/kv_offload/lru_manager.py +139 -0
  1605. vllm/v1/kv_offload/mediums.py +39 -0
  1606. vllm/v1/kv_offload/spec.py +70 -0
  1607. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1608. vllm/v1/kv_offload/worker/cpu_gpu.py +287 -0
  1609. vllm/v1/kv_offload/worker/worker.py +163 -0
  1610. vllm/v1/metrics/__init__.py +0 -0
  1611. vllm/v1/metrics/loggers.py +1320 -0
  1612. vllm/v1/metrics/perf.py +1244 -0
  1613. vllm/v1/metrics/prometheus.py +82 -0
  1614. vllm/v1/metrics/ray_wrappers.py +194 -0
  1615. vllm/v1/metrics/reader.py +257 -0
  1616. vllm/v1/metrics/stats.py +440 -0
  1617. vllm/v1/outputs.py +242 -0
  1618. vllm/v1/pool/__init__.py +0 -0
  1619. vllm/v1/pool/metadata.py +124 -0
  1620. vllm/v1/request.py +281 -0
  1621. vllm/v1/sample/__init__.py +0 -0
  1622. vllm/v1/sample/logits_processor/__init__.py +352 -0
  1623. vllm/v1/sample/logits_processor/builtin.py +278 -0
  1624. vllm/v1/sample/logits_processor/interface.py +106 -0
  1625. vllm/v1/sample/logits_processor/state.py +165 -0
  1626. vllm/v1/sample/metadata.py +44 -0
  1627. vllm/v1/sample/ops/__init__.py +0 -0
  1628. vllm/v1/sample/ops/bad_words.py +57 -0
  1629. vllm/v1/sample/ops/logprobs.py +25 -0
  1630. vllm/v1/sample/ops/penalties.py +57 -0
  1631. vllm/v1/sample/ops/topk_topp_sampler.py +388 -0
  1632. vllm/v1/sample/rejection_sampler.py +822 -0
  1633. vllm/v1/sample/sampler.py +319 -0
  1634. vllm/v1/sample/tpu/__init__.py +0 -0
  1635. vllm/v1/sample/tpu/metadata.py +120 -0
  1636. vllm/v1/sample/tpu/sampler.py +215 -0
  1637. vllm/v1/serial_utils.py +514 -0
  1638. vllm/v1/spec_decode/__init__.py +0 -0
  1639. vllm/v1/spec_decode/eagle.py +1346 -0
  1640. vllm/v1/spec_decode/medusa.py +73 -0
  1641. vllm/v1/spec_decode/metadata.py +66 -0
  1642. vllm/v1/spec_decode/metrics.py +225 -0
  1643. vllm/v1/spec_decode/ngram_proposer.py +281 -0
  1644. vllm/v1/spec_decode/suffix_decoding.py +95 -0
  1645. vllm/v1/spec_decode/utils.py +109 -0
  1646. vllm/v1/structured_output/__init__.py +337 -0
  1647. vllm/v1/structured_output/backend_guidance.py +291 -0
  1648. vllm/v1/structured_output/backend_lm_format_enforcer.py +177 -0
  1649. vllm/v1/structured_output/backend_outlines.py +324 -0
  1650. vllm/v1/structured_output/backend_types.py +136 -0
  1651. vllm/v1/structured_output/backend_xgrammar.py +378 -0
  1652. vllm/v1/structured_output/request.py +91 -0
  1653. vllm/v1/structured_output/utils.py +457 -0
  1654. vllm/v1/utils.py +466 -0
  1655. vllm/v1/worker/__init__.py +0 -0
  1656. vllm/v1/worker/block_table.py +343 -0
  1657. vllm/v1/worker/cp_utils.py +42 -0
  1658. vllm/v1/worker/cpu_model_runner.py +122 -0
  1659. vllm/v1/worker/cpu_worker.py +192 -0
  1660. vllm/v1/worker/dp_utils.py +240 -0
  1661. vllm/v1/worker/ec_connector_model_runner_mixin.py +85 -0
  1662. vllm/v1/worker/gpu/README.md +4 -0
  1663. vllm/v1/worker/gpu/__init__.py +0 -0
  1664. vllm/v1/worker/gpu/async_utils.py +98 -0
  1665. vllm/v1/worker/gpu/attn_utils.py +183 -0
  1666. vllm/v1/worker/gpu/block_table.py +222 -0
  1667. vllm/v1/worker/gpu/buffer_utils.py +224 -0
  1668. vllm/v1/worker/gpu/cudagraph_utils.py +264 -0
  1669. vllm/v1/worker/gpu/dp_utils.py +31 -0
  1670. vllm/v1/worker/gpu/input_batch.py +526 -0
  1671. vllm/v1/worker/gpu/metrics/__init__.py +0 -0
  1672. vllm/v1/worker/gpu/metrics/logits.py +42 -0
  1673. vllm/v1/worker/gpu/mm/__init__.py +0 -0
  1674. vllm/v1/worker/gpu/mm/mrope_utils.py +127 -0
  1675. vllm/v1/worker/gpu/model_runner.py +1005 -0
  1676. vllm/v1/worker/gpu/sample/__init__.py +0 -0
  1677. vllm/v1/worker/gpu/sample/gumbel.py +106 -0
  1678. vllm/v1/worker/gpu/sample/logit_bias.py +270 -0
  1679. vllm/v1/worker/gpu/sample/logprob.py +167 -0
  1680. vllm/v1/worker/gpu/sample/metadata.py +79 -0
  1681. vllm/v1/worker/gpu/sample/min_p.py +58 -0
  1682. vllm/v1/worker/gpu/sample/output.py +14 -0
  1683. vllm/v1/worker/gpu/sample/penalties.py +155 -0
  1684. vllm/v1/worker/gpu/sample/sampler.py +88 -0
  1685. vllm/v1/worker/gpu/spec_decode/__init__.py +18 -0
  1686. vllm/v1/worker/gpu/spec_decode/eagle.py +566 -0
  1687. vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py +115 -0
  1688. vllm/v1/worker/gpu/spec_decode/rejection_sample.py +71 -0
  1689. vllm/v1/worker/gpu/states.py +282 -0
  1690. vllm/v1/worker/gpu/structured_outputs.py +100 -0
  1691. vllm/v1/worker/gpu_input_batch.py +1030 -0
  1692. vllm/v1/worker/gpu_model_runner.py +5761 -0
  1693. vllm/v1/worker/gpu_ubatch_wrapper.py +475 -0
  1694. vllm/v1/worker/gpu_worker.py +968 -0
  1695. vllm/v1/worker/kv_connector_model_runner_mixin.py +300 -0
  1696. vllm/v1/worker/lora_model_runner_mixin.py +225 -0
  1697. vllm/v1/worker/tpu_input_batch.py +574 -0
  1698. vllm/v1/worker/tpu_worker.py +18 -0
  1699. vllm/v1/worker/ubatch_utils.py +112 -0
  1700. vllm/v1/worker/ubatching.py +242 -0
  1701. vllm/v1/worker/utils.py +400 -0
  1702. vllm/v1/worker/worker_base.py +372 -0
  1703. vllm/v1/worker/workspace.py +253 -0
  1704. vllm/v1/worker/xpu_model_runner.py +48 -0
  1705. vllm/v1/worker/xpu_worker.py +174 -0
  1706. vllm/version.py +39 -0
  1707. vllm/vllm_flash_attn/.gitkeep +0 -0
  1708. vllm_cpu_avx512bf16-0.14.0.dist-info/METADATA +348 -0
  1709. vllm_cpu_avx512bf16-0.14.0.dist-info/RECORD +1712 -0
  1710. vllm_cpu_avx512bf16-0.14.0.dist-info/WHEEL +5 -0
  1711. vllm_cpu_avx512bf16-0.14.0.dist-info/entry_points.txt +5 -0
  1712. vllm_cpu_avx512bf16-0.14.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2566 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+ # Adapted from
+ # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
+ import json
+ import time
+ from http import HTTPStatus
+ from typing import Annotated, Any, ClassVar, Literal, TypeAlias
+
+ import regex as re
+ import torch
+ from fastapi import HTTPException, UploadFile
+ from openai.types.chat.chat_completion_audio import (
+     ChatCompletionAudio as OpenAIChatCompletionAudio,
+ )
+ from openai.types.chat.chat_completion_message import Annotation as OpenAIAnnotation
+ from openai.types.responses import (
+     ResponseCodeInterpreterCallCodeDeltaEvent,
+     ResponseCodeInterpreterCallCodeDoneEvent,
+     ResponseCodeInterpreterCallCompletedEvent,
+     ResponseCodeInterpreterCallInProgressEvent,
+     ResponseCodeInterpreterCallInterpretingEvent,
+     ResponseContentPartAddedEvent,
+     ResponseContentPartDoneEvent,
+     ResponseFunctionToolCall,
+     ResponseInputItemParam,
+     ResponseMcpCallArgumentsDeltaEvent,
+     ResponseMcpCallArgumentsDoneEvent,
+     ResponseMcpCallCompletedEvent,
+     ResponseMcpCallInProgressEvent,
+     ResponseOutputItem,
+     ResponseOutputItemAddedEvent,
+     ResponseOutputItemDoneEvent,
+     ResponsePrompt,
+     ResponseReasoningTextDeltaEvent,
+     ResponseReasoningTextDoneEvent,
+     ResponseStatus,
+     ResponseWebSearchCallCompletedEvent,
+     ResponseWebSearchCallInProgressEvent,
+     ResponseWebSearchCallSearchingEvent,
+ )
+ from openai.types.responses import (
+     ResponseCompletedEvent as OpenAIResponseCompletedEvent,
+ )
+ from openai.types.responses import ResponseCreatedEvent as OpenAIResponseCreatedEvent
+ from openai.types.responses import (
+     ResponseInProgressEvent as OpenAIResponseInProgressEvent,
+ )
+ from openai.types.responses.response_reasoning_item import (
+     Content as ResponseReasoningTextContent,
+ )
+ from openai_harmony import Message as OpenAIHarmonyMessage
+
+ # Backward compatibility for OpenAI client versions
+ try:  # For older openai versions (< 1.100.0)
+     from openai.types.responses import ResponseTextConfig
+ except ImportError:  # For newer openai versions (>= 1.100.0)
+     from openai.types.responses import ResponseFormatTextConfig as ResponseTextConfig
+
+
+ from openai.types.responses.response import IncompleteDetails, ToolChoice
+ from openai.types.responses.tool import Tool
+ from openai.types.shared import Metadata, Reasoning
+ from pydantic import (
+     BaseModel,
+     ConfigDict,
+     Field,
+     ValidationError,
+     field_serializer,
+     model_validator,
+ )
+
+ from vllm.entrypoints.chat_utils import ChatCompletionMessageParam, make_tool_call_id
+ from vllm.exceptions import VLLMValidationError
+ from vllm.logger import init_logger
+ from vllm.logprobs import Logprob
+ from vllm.sampling_params import (
+     BeamSearchParams,
+     RequestOutputKind,
+     SamplingParams,
+     StructuredOutputsParams,
+ )
+ from vllm.utils import random_uuid
+ from vllm.utils.import_utils import resolve_obj_by_qualname
+
+ logger = init_logger(__name__)
+
+ _LONG_INFO = torch.iinfo(torch.long)
+
+
+ class OpenAIBaseModel(BaseModel):
+     # OpenAI API does allow extra fields
+     model_config = ConfigDict(extra="allow")
+
+     # Cache class field names
+     field_names: ClassVar[set[str] | None] = None
+
+     @model_validator(mode="wrap")
+     @classmethod
+     def __log_extra_fields__(cls, data, handler):
+         result = handler(data)
+         if not isinstance(data, dict):
+             return result
+         field_names = cls.field_names
+         if field_names is None:
+             # Get all class field names and their potential aliases
+             field_names = set()
+             for field_name, field in cls.model_fields.items():
+                 field_names.add(field_name)
+                 if alias := getattr(field, "alias", None):
+                     field_names.add(alias)
+             cls.field_names = field_names
+
+         # Compare against both field names and aliases
+         if any(k not in field_names for k in data):
+             logger.warning(
+                 "The following fields were present in the request but ignored: %s",
+                 data.keys() - field_names,
+             )
+         return result
+
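# Illustrative sketch (not part of the package source): because of
# extra="allow", unknown request fields pass validation and are merely
# logged by the wrap validator above. `DemoModel` and `made_up_field`
# are hypothetical names used only for this example.
class DemoModel(OpenAIBaseModel):
    id: str

demo = DemoModel(id="x", made_up_field=123)  # accepted; warning logged once
assert demo.model_extra == {"made_up_field": 123}  # kept via extra="allow"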
+
+ class ErrorInfo(OpenAIBaseModel):
+     message: str
+     type: str
+     param: str | None = None
+     code: int
+
+
+ class ErrorResponse(OpenAIBaseModel):
+     error: ErrorInfo
+
+
+ class ModelPermission(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
+     object: str = "model_permission"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     allow_create_engine: bool = False
+     allow_sampling: bool = True
+     allow_logprobs: bool = True
+     allow_search_indices: bool = False
+     allow_view: bool = True
+     allow_fine_tuning: bool = False
+     organization: str = "*"
+     group: str | None = None
+     is_blocking: bool = False
+
+
+ class ModelCard(OpenAIBaseModel):
+     id: str
+     object: str = "model"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     owned_by: str = "vllm"
+     root: str | None = None
+     parent: str | None = None
+     max_model_len: int | None = None
+     permission: list[ModelPermission] = Field(default_factory=list)
+
+
+ class ModelList(OpenAIBaseModel):
+     object: str = "list"
+     data: list[ModelCard] = Field(default_factory=list)
+
+
+ class PromptTokenUsageInfo(OpenAIBaseModel):
+     cached_tokens: int | None = None
+
+
+ class UsageInfo(OpenAIBaseModel):
+     prompt_tokens: int = 0
+     total_tokens: int = 0
+     completion_tokens: int | None = 0
+     prompt_tokens_details: PromptTokenUsageInfo | None = None
+
+
+ class RequestResponseMetadata(BaseModel):
+     request_id: str
+     final_usage_info: UsageInfo | None = None
+
+
+ class JsonSchemaResponseFormat(OpenAIBaseModel):
+     name: str
+     description: str | None = None
+     # "schema" is the field name in the OpenAI API, but that conflicts with
+     # pydantic, so use json_schema with an alias instead
+     json_schema: dict[str, Any] | None = Field(default=None, alias="schema")
+     strict: bool | None = None
+
+
+ class LegacyStructuralTag(OpenAIBaseModel):
+     begin: str
+     # "schema" is the field name, but that conflicts with pydantic, so use
+     # structural_tag_schema with an alias instead
+     structural_tag_schema: dict[str, Any] | None = Field(default=None, alias="schema")
+     end: str
+
+
+ class LegacyStructuralTagResponseFormat(OpenAIBaseModel):
+     type: Literal["structural_tag"]
+     structures: list[LegacyStructuralTag]
+     triggers: list[str]
+
+
+ class StructuralTagResponseFormat(OpenAIBaseModel):
+     type: Literal["structural_tag"]
+     format: Any
+
+
+ AnyStructuralTagResponseFormat: TypeAlias = (
+     LegacyStructuralTagResponseFormat | StructuralTagResponseFormat
+ )
+
+
+ class ResponseFormat(OpenAIBaseModel):
+     # type must be "json_schema", "json_object", or "text"
+     type: Literal["text", "json_object", "json_schema"]
+     json_schema: JsonSchemaResponseFormat | None = None
+
+
+ AnyResponseFormat: TypeAlias = (
+     ResponseFormat | StructuralTagResponseFormat | LegacyStructuralTagResponseFormat
+ )
+
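# Illustrative sketch (not part of the package source): in an OpenAI-style
# payload the key is "schema", which populates json_schema through its alias.
fmt = ResponseFormat(
    type="json_schema",
    json_schema=JsonSchemaResponseFormat.model_validate(
        {"name": "answer", "schema": {"type": "object"}}
    ),
)
assert fmt.json_schema.json_schema == {"type": "object"}  # filled via the alias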
+
+ class StreamOptions(OpenAIBaseModel):
+     include_usage: bool | None = True
+     continuous_usage_stats: bool | None = False
+
+
+ class FunctionDefinition(OpenAIBaseModel):
+     name: str
+     description: str | None = None
+     parameters: dict[str, Any] | None = None
+
+
+ class ChatCompletionToolsParam(OpenAIBaseModel):
+     type: Literal["function"] = "function"
+     function: FunctionDefinition
+
+
+ class ChatCompletionNamedFunction(OpenAIBaseModel):
+     name: str
+
+
+ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
+     function: ChatCompletionNamedFunction
+     type: Literal["function"] = "function"
+
+
+ # extra="forbid" is a workaround to have kwargs as a field,
+ # see https://github.com/pydantic/pydantic/issues/3125
+ class LogitsProcessorConstructor(BaseModel):
+     qualname: str
+     args: list[Any] | None = None
+     kwargs: dict[str, Any] | None = None
+
+     model_config = ConfigDict(extra="forbid")
+
+
+ LogitsProcessors = list[str | LogitsProcessorConstructor]
+
+
+ def get_logits_processors(
+     processors: LogitsProcessors | None, pattern: str | None
+ ) -> list[Any] | None:
+     if processors and pattern:
+         logits_processors = []
+         for processor in processors:
+             qualname = processor if isinstance(processor, str) else processor.qualname
+             if not re.match(pattern, qualname):
+                 raise ValueError(
+                     f"Logits processor '{qualname}' is not allowed by this "
+                     "server. See --logits-processor-pattern engine argument "
+                     "for more information."
+                 )
+             try:
+                 logits_processor = resolve_obj_by_qualname(qualname)
+             except Exception as e:
+                 raise ValueError(
+                     f"Logits processor '{qualname}' could not be resolved: {e}"
+                 ) from e
+             if isinstance(processor, LogitsProcessorConstructor):
+                 logits_processor = logits_processor(
+                     *processor.args or [], **processor.kwargs or {}
+                 )
+             logits_processors.append(logits_processor)
+         return logits_processors
+     elif processors:
+         raise ValueError(
+             "The `logits_processors` argument is not supported by this "
+             "server. See --logits-processor-pattern engine argument "
+             "for more information."
+         )
+     return None
+
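# Illustrative sketch (not part of the package source): `my_module.MyProcessor`
# is a hypothetical qualname. With a matching --logits-processor-pattern the
# constructor is resolved and instantiated with its args/kwargs; with
# pattern=None, any non-empty list raises ValueError.
ctor = LogitsProcessorConstructor(
    qualname="my_module.MyProcessor", args=[1], kwargs={"scale": 2.0}
)
# get_logits_processors([ctor], pattern=r"my_module\..*")  -> [MyProcessor(1, scale=2.0)]
# get_logits_processors([ctor], pattern=None)              -> ValueError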
+
+ ResponseInputOutputItem: TypeAlias = ResponseInputItemParam | ResponseOutputItem
+
+
+ class ResponsesRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/responses/create
+     background: bool | None = False
+     include: (
+         list[
+             Literal[
+                 "code_interpreter_call.outputs",
+                 "computer_call_output.output.image_url",
+                 "file_search_call.results",
+                 "message.input_image.image_url",
+                 "message.output_text.logprobs",
+                 "reasoning.encrypted_content",
+             ],
+         ]
+         | None
+     ) = None
+     input: str | list[ResponseInputOutputItem]
+     instructions: str | None = None
+     max_output_tokens: int | None = None
+     max_tool_calls: int | None = None
+     metadata: Metadata | None = None
+     model: str | None = None
+     logit_bias: dict[str, float] | None = None
+     parallel_tool_calls: bool | None = True
+     previous_response_id: str | None = None
+     prompt: ResponsePrompt | None = None
+     reasoning: Reasoning | None = None
+     service_tier: Literal["auto", "default", "flex", "scale", "priority"] = "auto"
+     store: bool | None = True
+     stream: bool | None = False
+     temperature: float | None = None
+     text: ResponseTextConfig | None = None
+     tool_choice: ToolChoice = "auto"
+     tools: list[Tool] = Field(default_factory=list)
+     top_logprobs: int | None = 0
+     top_p: float | None = None
+     top_k: int | None = None
+     truncation: Literal["auto", "disabled"] | None = "disabled"
+     user: str | None = None
+
+     # --8<-- [start:responses-extra-params]
+     request_id: str = Field(
+         default_factory=lambda: f"resp_{random_uuid()}",
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the response."
+         ),
+     )
+     mm_processor_kwargs: dict[str, Any] | None = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."
+         ),
+     )
+     cache_salt: str | None = Field(
+         default=None,
+         description=(
+             "If specified, the prefix cache will be salted with the provided "
+             "string to prevent an attacker from guessing prompts in multi-user "
+             "environments. The salt should be random, protected from "
+             "access by 3rd parties, and long enough to be "
+             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+             "to 256 bits)."
+         ),
+     )
+
+     enable_response_messages: bool = Field(
+         default=False,
+         description=(
+             "Dictates whether or not to return messages as part of the "
+             "response object. Currently only supported for non-background "
+             "requests with gpt-oss models."
+         ),
+     )
+     # similar to input_messages / output_messages in ResponsesResponse,
+     # we take in previous_input_messages (i.e., in harmony format);
+     # this cannot be used in conjunction with previous_response_id
+     # TODO: consider supporting non-harmony messages as well
+     previous_input_messages: list[OpenAIHarmonyMessage | dict] | None = None
+     # --8<-- [end:responses-extra-params]
+
+     _DEFAULT_SAMPLING_PARAMS = {
+         "temperature": 1.0,
+         "top_p": 1.0,
+         "top_k": 0,
+     }
+
+     def to_sampling_params(
+         self,
+         default_max_tokens: int,
+         default_sampling_params: dict | None = None,
+     ) -> SamplingParams:
+         if self.max_output_tokens is None:
+             max_tokens = default_max_tokens
+         else:
+             max_tokens = min(self.max_output_tokens, default_max_tokens)
+
+         default_sampling_params = default_sampling_params or {}
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+             )
+         if (top_p := self.top_p) is None:
+             top_p = default_sampling_params.get(
+                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
+             )
+         if (top_k := self.top_k) is None:
+             top_k = default_sampling_params.get(
+                 "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
+             )
+         stop_token_ids = default_sampling_params.get("stop_token_ids")
+
+         # Structured output
+         structured_outputs = None
+         if self.text is not None and self.text.format is not None:
+             response_format = self.text.format
+             if (
+                 response_format.type == "json_schema"
+                 and response_format.schema_ is not None
+             ):
+                 structured_outputs = StructuredOutputsParams(
+                     json=response_format.schema_
+                 )
+             elif response_format.type == "json_object":
+                 raise NotImplementedError("json_object is not supported")
+
+         # TODO: add more parameters
+         return SamplingParams.from_optional(
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             max_tokens=max_tokens,
+             logprobs=self.top_logprobs if self.is_include_output_logprobs() else None,
+             stop_token_ids=stop_token_ids,
+             output_kind=(
+                 RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY
+             ),
+             structured_outputs=structured_outputs,
+             logit_bias=self.logit_bias,
+             skip_clone=True,  # Created fresh per request, safe to skip clone
+         )
+
+     def is_include_output_logprobs(self) -> bool:
+         """Check if the request includes output logprobs."""
+         if self.include is None:
+             return False
+         return (
+             isinstance(self.include, list)
+             and "message.output_text.logprobs" in self.include
+         )
+
+     @model_validator(mode="before")
+     def validate_background(cls, data):
+         if not data.get("background"):
+             return data
+         if not data.get("store", True):
+             raise ValueError("background can only be used when `store` is true")
+         return data
+
+     @model_validator(mode="before")
+     def validate_prompt(cls, data):
+         if data.get("prompt") is not None:
+             raise VLLMValidationError(
+                 "prompt template is not supported", parameter="prompt"
+             )
+         return data
+
+     @model_validator(mode="before")
+     def check_cache_salt_support(cls, data):
+         if data.get("cache_salt") is not None and (
+             not isinstance(data["cache_salt"], str) or not data["cache_salt"]
+         ):
+             raise ValueError(
+                 "Parameter 'cache_salt' must be a non-empty string if provided."
+             )
+         return data
+
+     @model_validator(mode="before")
+     def function_call_parsing(cls, data):
+         """Parse function_call dictionaries into ResponseFunctionToolCall objects.
+
+         This ensures Pydantic can properly resolve union types in the input field.
+         Function calls provided as dicts are converted to ResponseFunctionToolCall
+         objects before validation, while invalid structures are left for Pydantic
+         to reject with appropriate error messages.
+         """
+
+         input_data = data.get("input")
+
+         # Early return for None, strings, or bytes
+         # (strings are iterable but shouldn't be processed)
+         if input_data is None or isinstance(input_data, (str, bytes)):
+             return data
+
+         # Convert iterators (like ValidatorIterator) to list
+         if not isinstance(input_data, list):
+             try:
+                 input_data = list(input_data)
+             except TypeError:
+                 # Not iterable, leave as-is for Pydantic to handle
+                 return data
+
+         processed_input = []
+         for item in input_data:
+             if isinstance(item, dict) and item.get("type") == "function_call":
+                 try:
+                     processed_input.append(ResponseFunctionToolCall(**item))
+                 except ValidationError:
+                     # Let Pydantic handle validation for malformed function calls
+                     logger.debug(
+                         "Failed to parse function_call to ResponseFunctionToolCall, "
+                         "leaving for Pydantic validation"
+                     )
+                     processed_input.append(item)
+             else:
+                 processed_input.append(item)
+
+         data["input"] = processed_input
+         return data
+
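# Illustrative sketch (not part of the package source): with no sampling
# fields set, to_sampling_params() falls back to _DEFAULT_SAMPLING_PARAMS
# (temperature=1.0, top_p=1.0, top_k=0) and clamps max_tokens to the
# server-side default passed in by the caller.
req = ResponsesRequest(input="Hello", max_output_tokens=512)
params = req.to_sampling_params(default_max_tokens=128)
assert params.max_tokens == 128  # min(512, 128)
assert params.temperature == 1.0  # class default, nothing was overridden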
+
+ class ChatCompletionRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/chat/create
+     messages: list[ChatCompletionMessageParam]
+     model: str | None = None
+     frequency_penalty: float | None = 0.0
+     logit_bias: dict[str, float] | None = None
+     logprobs: bool | None = False
+     top_logprobs: int | None = 0
+     max_tokens: int | None = Field(
+         default=None,
+         deprecated="max_tokens is deprecated in favor of "
+         "the max_completion_tokens field",
+     )
+     max_completion_tokens: int | None = None
+     n: int | None = 1
+     presence_penalty: float | None = 0.0
+     response_format: AnyResponseFormat | None = None
+     seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+     stop: str | list[str] | None = []
+     stream: bool | None = False
+     stream_options: StreamOptions | None = None
+     temperature: float | None = None
+     top_p: float | None = None
+     tools: list[ChatCompletionToolsParam] | None = None
+     tool_choice: (
+         Literal["none"]
+         | Literal["auto"]
+         | Literal["required"]
+         | ChatCompletionNamedToolChoiceParam
+         | None
+     ) = "none"
+     reasoning_effort: Literal["low", "medium", "high"] | None = None
+     include_reasoning: bool = True
+     parallel_tool_calls: bool | None = True
+
+     # NOTE: this will be ignored by vLLM
+     user: str | None = None
+
+     # --8<-- [start:chat-completion-sampling-params]
+     use_beam_search: bool = False
+     top_k: int | None = None
+     min_p: float | None = None
+     repetition_penalty: float | None = None
+     length_penalty: float = 1.0
+     stop_token_ids: list[int] | None = []
+     include_stop_str_in_output: bool = False
+     ignore_eos: bool = False
+     min_tokens: int = 0
+     skip_special_tokens: bool = True
+     spaces_between_special_tokens: bool = True
+     truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_LONG_INFO.max)] | None = (
+         None
+     )
+     prompt_logprobs: int | None = None
+     allowed_token_ids: list[int] | None = None
+     bad_words: list[str] = Field(default_factory=list)
+     # --8<-- [end:chat-completion-sampling-params]
+
+     # --8<-- [start:chat-completion-extra-params]
+     echo: bool = Field(
+         default=False,
+         description=(
+             "If true, the new message will be prepended with the last message "
+             "if they belong to the same role."
+         ),
+     )
+     add_generation_prompt: bool = Field(
+         default=True,
+         description=(
+             "If true, the generation prompt will be added to the chat template. "
+             "This is a parameter used by the chat template in the tokenizer "
+             "config of the model."
+         ),
+     )
+     continue_final_message: bool = Field(
+         default=False,
+         description=(
+             "If this is set, the chat will be formatted so that the final "
+             "message in the chat is open-ended, without any EOS tokens. The "
+             "model will continue this message rather than starting a new one. "
+             'This allows you to "prefill" part of the model\'s response for it. '
+             "Cannot be used at the same time as `add_generation_prompt`."
+         ),
+     )
+     add_special_tokens: bool = Field(
+         default=False,
+         description=(
+             "If true, special tokens (e.g. BOS) will be added to the prompt "
+             "on top of what is added by the chat template. "
+             "For most models, the chat template takes care of adding the "
+             "special tokens, so this should be set to false (as is the "
+             "default)."
+         ),
+     )
+     documents: list[dict[str, str]] | None = Field(
+         default=None,
+         description=(
+             "A list of dicts representing documents that will be accessible to "
+             "the model if it is performing RAG (retrieval-augmented generation). "
+             "If the template does not support RAG, this argument will have no "
+             "effect. We recommend that each document should be a dict containing "
+             '"title" and "text" keys.'
+         ),
+     )
+     chat_template: str | None = Field(
+         default=None,
+         description=(
+             "A Jinja template to use for this conversion. "
+             "As of transformers v4.44, the default chat template is no longer "
+             "allowed, so you must provide a chat template if the tokenizer "
+             "does not define one."
+         ),
+     )
+     chat_template_kwargs: dict[str, Any] | None = Field(
+         default=None,
+         description=(
+             "Additional keyword args to pass to the template renderer. "
+             "Will be accessible by the chat template."
+         ),
+     )
+     mm_processor_kwargs: dict[str, Any] | None = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     structured_outputs: StructuredOutputsParams | None = Field(
+         default=None,
+         description="Additional kwargs for structured outputs",
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."
+         ),
+     )
+     request_id: str = Field(
+         default_factory=random_uuid,
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the response."
+         ),
+     )
+     logits_processors: LogitsProcessors | None = Field(
+         default=None,
+         description=(
+             "A list of either qualified names of logits processors, or "
+             "constructor objects, to apply when sampling. A constructor is "
+             "a JSON object with a required 'qualname' field specifying the "
+             "qualified name of the processor class/factory, and optional "
+             "'args' and 'kwargs' fields containing positional and keyword "
+             "arguments. For example: {'qualname': "
+             "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
+             "{'param': 'value'}}."
+         ),
+     )
+     return_tokens_as_token_ids: bool | None = Field(
+         default=None,
+         description=(
+             "If specified with 'logprobs', tokens are represented "
+             "as strings of the form 'token_id:{token_id}' so that tokens "
+             "that are not JSON-encodable can be identified."
+         ),
+     )
+     return_token_ids: bool | None = Field(
+         default=None,
+         description=(
+             "If specified, the result will include token IDs alongside the "
+             "generated text. In streaming mode, prompt_token_ids is included "
+             "only in the first chunk, and token_ids contains the delta tokens "
+             "for each chunk. This is useful for debugging or when you "
+             "need to map generated text back to input tokens."
+         ),
+     )
+     cache_salt: str | None = Field(
+         default=None,
+         description=(
+             "If specified, the prefix cache will be salted with the provided "
+             "string to prevent an attacker from guessing prompts in multi-user "
+             "environments. The salt should be random, protected from "
+             "access by 3rd parties, and long enough to be "
+             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+             "to 256 bits)."
+         ),
+     )
+     kv_transfer_params: dict[str, Any] | None = Field(
+         default=None,
+         description="KVTransfer parameters used for disaggregated serving.",
+     )
+
+     vllm_xargs: dict[str, str | int | float | list[str | int | float]] | None = Field(
+         default=None,
+         description=(
+             "Additional request parameters with (list of) string or "
+             "numeric values, used by custom extensions."
+         ),
+     )
+
+     # --8<-- [end:chat-completion-extra-params]
+
+     # Default sampling parameters for chat completion requests
+     _DEFAULT_SAMPLING_PARAMS: dict = {
+         "repetition_penalty": 1.0,
+         "temperature": 1.0,
+         "top_p": 1.0,
+         "top_k": 0,
+         "min_p": 0.0,
+     }
+
+     def to_beam_search_params(
+         self, max_tokens: int, default_sampling_params: dict
+     ) -> BeamSearchParams:
+         n = self.n if self.n is not None else 1
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+             )
+
+         return BeamSearchParams(
+             beam_width=n,
+             max_tokens=max_tokens,
+             ignore_eos=self.ignore_eos,
+             temperature=temperature,
+             length_penalty=self.length_penalty,
+             include_stop_str_in_output=self.include_stop_str_in_output,
+         )
+
+     def to_sampling_params(
+         self,
+         max_tokens: int,
+         logits_processor_pattern: str | None,
+         default_sampling_params: dict,
+     ) -> SamplingParams:
+         # Default parameters
+         if (repetition_penalty := self.repetition_penalty) is None:
+             repetition_penalty = default_sampling_params.get(
+                 "repetition_penalty",
+                 self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+             )
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+             )
+         if (top_p := self.top_p) is None:
+             top_p = default_sampling_params.get(
+                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
+             )
+         if (top_k := self.top_k) is None:
+             top_k = default_sampling_params.get(
+                 "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
+             )
+         if (min_p := self.min_p) is None:
+             min_p = default_sampling_params.get(
+                 "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
+             )
+
+         prompt_logprobs = self.prompt_logprobs
+         if prompt_logprobs is None and self.echo:
+             prompt_logprobs = self.top_logprobs
+
+         response_format = self.response_format
+         if response_format is not None:
+             # If structured outputs wasn't already enabled,
+             # we must enable it for these features to work
+             if self.structured_outputs is None:
+                 self.structured_outputs = StructuredOutputsParams()
+
+             # Set structured output params for response format
+             if response_format.type == "json_object":
+                 self.structured_outputs.json_object = True
+             elif response_format.type == "json_schema":
+                 json_schema = response_format.json_schema
+                 assert json_schema is not None
+                 self.structured_outputs.json = json_schema.json_schema
+             elif response_format.type == "structural_tag":
+                 structural_tag = response_format
+                 assert structural_tag is not None and isinstance(
+                     structural_tag,
+                     (
+                         LegacyStructuralTagResponseFormat,
+                         StructuralTagResponseFormat,
+                     ),
+                 )
+                 s_tag_obj = structural_tag.model_dump(by_alias=True)
+                 self.structured_outputs.structural_tag = json.dumps(s_tag_obj)
+
+         extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
+         if self.kv_transfer_params:
+             # Pass in kv_transfer_params via extra_args
+             extra_args["kv_transfer_params"] = self.kv_transfer_params
+         return SamplingParams.from_optional(
+             n=self.n,
+             presence_penalty=self.presence_penalty,
+             frequency_penalty=self.frequency_penalty,
+             repetition_penalty=repetition_penalty,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             min_p=min_p,
+             seed=self.seed,
+             stop=self.stop,
+             stop_token_ids=self.stop_token_ids,
+             logprobs=self.top_logprobs if self.logprobs else None,
+             prompt_logprobs=prompt_logprobs,
+             ignore_eos=self.ignore_eos,
+             max_tokens=max_tokens,
+             min_tokens=self.min_tokens,
+             skip_special_tokens=self.skip_special_tokens,
+             spaces_between_special_tokens=self.spaces_between_special_tokens,
+             logits_processors=get_logits_processors(
+                 self.logits_processors, logits_processor_pattern
+             ),
+             include_stop_str_in_output=self.include_stop_str_in_output,
+             truncate_prompt_tokens=self.truncate_prompt_tokens,
+             output_kind=RequestOutputKind.DELTA
+             if self.stream
+             else RequestOutputKind.FINAL_ONLY,
+             structured_outputs=self.structured_outputs,
+             logit_bias=self.logit_bias,
+             bad_words=self.bad_words,
+             allowed_token_ids=self.allowed_token_ids,
+             extra_args=extra_args or None,
+             skip_clone=True,  # Created fresh per request, safe to skip clone
+         )
+
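    # Illustrative note (not part of the package source): with
    # {"echo": true, "logprobs": true, "top_logprobs": 2} and no explicit
    # prompt_logprobs, to_sampling_params() above reuses top_logprobs (2)
    # for prompt_logprobs, so prompt tokens also carry two logprobs each.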
+     @model_validator(mode="before")
+     @classmethod
+     def validate_stream_options(cls, data):
+         if data.get("stream_options") and not data.get("stream"):
+             raise VLLMValidationError(
+                 "Stream options can only be defined when `stream=True`.",
+                 parameter="stream_options",
+             )
+
+         return data
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_logprobs(cls, data):
+         if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
+             if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
+                 raise VLLMValidationError(
+                     "`prompt_logprobs` are not available when `stream=True`.",
+                     parameter="prompt_logprobs",
+                 )
+
+             if prompt_logprobs < 0 and prompt_logprobs != -1:
+                 raise VLLMValidationError(
+                     "`prompt_logprobs` must be a non-negative value or -1.",
+                     parameter="prompt_logprobs",
+                     value=prompt_logprobs,
+                 )
+         if (top_logprobs := data.get("top_logprobs")) is not None:
+             if top_logprobs < 0 and top_logprobs != -1:
+                 raise VLLMValidationError(
+                     "`top_logprobs` must be a non-negative value or -1.",
+                     parameter="top_logprobs",
+                     value=top_logprobs,
+                 )
+
+             if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"):
+                 raise VLLMValidationError(
+                     "when using `top_logprobs`, `logprobs` must be set to true.",
+                     parameter="top_logprobs",
+                 )
+
+         return data
+
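    # Illustrative examples (not part of the package source) of payloads
    # against check_logprobs above:
    #   {"logprobs": True, "top_logprobs": 5}   -> accepted
    #   {"top_logprobs": 5}                     -> rejected (logprobs not set)
    #   {"stream": True, "prompt_logprobs": 1}  -> rejected while streaming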
899
+ @model_validator(mode="before")
900
+ @classmethod
901
+ def check_structured_outputs_count(cls, data):
902
+ if isinstance(data, ValueError):
903
+ raise data
904
+
905
+ if data.get("structured_outputs", None) is None:
906
+ return data
907
+
908
+ structured_outputs_kwargs = data["structured_outputs"]
909
+ count = sum(
910
+ structured_outputs_kwargs.get(k) is not None
911
+ for k in ("json", "regex", "choice")
912
+ )
913
+ # you can only use one kind of constraints for structured outputs
914
+ if count > 1:
915
+ raise ValueError(
916
+ "You can only use one kind of constraints for structured "
917
+ "outputs ('json', 'regex' or 'choice')."
918
+ )
919
+ # you can only either use structured outputs or tools, not both
920
+ if count > 1 and data.get("tool_choice", "none") not in (
921
+ "none",
922
+ "auto",
923
+ "required",
924
+ ):
925
+ raise ValueError(
926
+ "You can only either use constraints for structured outputs "
927
+ "or tools, not both."
928
+ )
929
+ return data
930
+
931
+    @model_validator(mode="before")
+    @classmethod
+    def check_tool_usage(cls, data):
+        # if "tool_choice" is not specified but tools are provided,
+        # default to "auto" tool_choice
+        if "tool_choice" not in data and data.get("tools"):
+            data["tool_choice"] = "auto"
+
+        # if "tool_choice" is "none" -- no validation is needed for tools
+        if "tool_choice" in data and data["tool_choice"] == "none":
+            return data
+
+        # if "tool_choice" is specified -- validation
+        if "tool_choice" in data and data["tool_choice"] is not None:
+            # ensure that if "tool_choice" is specified, tools are present
+            if "tools" not in data or data["tools"] is None:
+                raise ValueError("When using `tool_choice`, `tools` must be set.")
+
+            # make sure that tool_choice is either a named tool
+            # OR that it's set to "auto" or "required"
+            if data["tool_choice"] not in ["auto", "required"] and not isinstance(
+                data["tool_choice"], dict
+            ):
+                raise ValueError(
+                    f"Invalid value for `tool_choice`: {data['tool_choice']}! "
+                    'Only named tools, "none", "auto" or "required" '
+                    "are supported."
+                )
+
+            # if tool_choice is "required" but the "tools" list is empty,
+            # override the data to behave like "none" to align with
+            # OpenAI’s behavior.
+            if (
+                data["tool_choice"] == "required"
+                and isinstance(data["tools"], list)
+                and len(data["tools"]) == 0
+            ):
+                data["tool_choice"] = "none"
+                del data["tools"]
+                return data
+
+            # ensure that if "tool_choice" is specified as an object,
+            # it matches a valid tool
+            correct_usage_message = (
+                'Correct usage: `{"type": "function",'
+                ' "function": {"name": "my_function"}}`'
+            )
+            if isinstance(data["tool_choice"], dict):
+                valid_tool = False
+                function = data["tool_choice"].get("function")
+                if not isinstance(function, dict):
+                    raise ValueError(
+                        f"Invalid value for `function`: `{function}` in "
+                        f"`tool_choice`! {correct_usage_message}"
+                    )
+                if "name" not in function:
+                    raise ValueError(
+                        f"Expected field `name` in `function` in "
+                        f"`tool_choice`! {correct_usage_message}"
+                    )
+                function_name = function["name"]
+                if not isinstance(function_name, str) or len(function_name) == 0:
+                    raise ValueError(
+                        f"Invalid `name` in `function`: `{function_name}`"
+                        f" in `tool_choice`! {correct_usage_message}"
+                    )
+                for tool in data["tools"]:
+                    if tool["function"]["name"] == function_name:
+                        valid_tool = True
+                        break
+                if not valid_tool:
+                    raise ValueError(
+                        "The tool specified in `tool_choice` does not match any"
+                        " of the specified `tools`"
+                    )
+        return data
+
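For reference, a request fragment that passes this validator with a named tool; the tool name and payload values are illustrative only:

# Illustrative fragment for a named tool_choice; shapes follow the
# correct_usage_message in the validator above.
data = {
    "tools": [
        {"type": "function", "function": {"name": "get_weather", "parameters": {}}}
    ],
    "tool_choice": {"type": "function", "function": {"name": "get_weather"}},
}
# The validator walks data["tools"] and accepts the request because
# tool_choice["function"]["name"] matches a declared tool.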
1008
+    @model_validator(mode="before")
+    @classmethod
+    def check_generation_prompt(cls, data):
+        if data.get("continue_final_message") and data.get("add_generation_prompt"):
+            raise ValueError(
+                "Cannot set both `continue_final_message` and "
+                "`add_generation_prompt` to True."
+            )
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_cache_salt_support(cls, data):
+        if data.get("cache_salt") is not None and (
+            not isinstance(data["cache_salt"], str) or not data["cache_salt"]
+        ):
+            raise ValueError(
+                "Parameter 'cache_salt' must be a non-empty string if provided."
+            )
+        return data
+
+
1030
+class CompletionRequest(OpenAIBaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/completions/create
+    model: str | None = None
+    prompt: list[int] | list[list[int]] | str | list[str] | None = None
+    echo: bool | None = False
+    frequency_penalty: float | None = 0.0
+    logit_bias: dict[str, float] | None = None
+    logprobs: int | None = None
+    max_tokens: int | None = 16
+    n: int = 1
+    presence_penalty: float | None = 0.0
+    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    stop: str | list[str] | None = []
+    stream: bool | None = False
+    stream_options: StreamOptions | None = None
+    suffix: str | None = None
+    temperature: float | None = None
+    top_p: float | None = None
+    user: str | None = None
+
+    # --8<-- [start:completion-sampling-params]
+    use_beam_search: bool = False
+    top_k: int | None = None
+    min_p: float | None = None
+    repetition_penalty: float | None = None
+    length_penalty: float = 1.0
+    stop_token_ids: list[int] | None = []
+    include_stop_str_in_output: bool = False
+    ignore_eos: bool = False
+    min_tokens: int = 0
+    skip_special_tokens: bool = True
+    spaces_between_special_tokens: bool = True
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_LONG_INFO.max)] | None = (
+        None
+    )
+    allowed_token_ids: list[int] | None = None
+    prompt_logprobs: int | None = None
+    # --8<-- [end:completion-sampling-params]
+
1070
+    # --8<-- [start:completion-extra-params]
+    prompt_embeds: bytes | list[bytes] | None = None
+    add_special_tokens: bool = Field(
+        default=True,
+        description=(
+            "If true (the default), special tokens (e.g. BOS) will be added to "
+            "the prompt."
+        ),
+    )
+    response_format: AnyResponseFormat | None = Field(
+        default=None,
+        description=(
+            "Similar to chat completion, this parameter specifies the format "
+            "of output. Only {'type': 'json_object'}, {'type': 'json_schema'}"
+            ", {'type': 'structural_tag'}, or {'type': 'text'} is supported."
+        ),
+    )
+    structured_outputs: StructuredOutputsParams | None = Field(
+        default=None,
+        description="Additional kwargs for structured outputs",
+    )
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."
+        ),
+    )
+    request_id: str = Field(
+        default_factory=random_uuid,
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "throughout the inference process and returned in the response."
+        ),
+    )
+    logits_processors: LogitsProcessors | None = Field(
+        default=None,
+        description=(
+            "A list of either qualified names of logits processors, or "
+            "constructor objects, to apply when sampling. A constructor is "
+            "a JSON object with a required 'qualname' field specifying the "
+            "qualified name of the processor class/factory, and optional "
+            "'args' and 'kwargs' fields containing positional and keyword "
+            "arguments. For example: {'qualname': "
+            "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
+            "{'param': 'value'}}."
+        ),
+    )
+
+    return_tokens_as_token_ids: bool | None = Field(
+        default=None,
+        description=(
+            "If specified with 'logprobs', tokens are represented "
+            "as strings of the form 'token_id:{token_id}' so that tokens "
+            "that are not JSON-encodable can be identified."
+        ),
+    )
+    return_token_ids: bool | None = Field(
+        default=None,
+        description=(
+            "If specified, the result will include token IDs alongside the "
+            "generated text. In streaming mode, prompt_token_ids is included "
+            "only in the first chunk, and token_ids contains the delta tokens "
+            "for each chunk. This is useful for debugging or when you "
+            "need to map generated text back to input tokens."
+        ),
+    )
+
+    cache_salt: str | None = Field(
+        default=None,
+        description=(
+            "If specified, the prefix cache will be salted with the provided "
+            "string to prevent an attacker from guessing prompts in multi-user "
+            "environments. The salt should be random, protected from "
+            "access by 3rd parties, and long enough to be "
+            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+            "to 256 bits)."
+        ),
+    )
+
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None,
+        description="KVTransfer parameters used for disaggregated serving.",
+    )
+
+    vllm_xargs: dict[str, str | int | float] | None = Field(
+        default=None,
+        description=(
+            "Additional request parameters with string or "
+            "numeric values, used by custom extensions."
+        ),
+    )
+
+    # --8<-- [end:completion-extra-params]
+
+    # Default sampling parameters for completion requests
+    _DEFAULT_SAMPLING_PARAMS: dict = {
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": 0,
+        "min_p": 0.0,
+    }
+
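These class-level defaults are the last tier in a three-tier fallback: an explicit request value wins, then the server's `default_sampling_params`, then `_DEFAULT_SAMPLING_PARAMS`. A minimal sketch of the resolution order (the `resolve` helper is hypothetical):

# Resolution order: request value > server default_sampling_params >
# class-level _DEFAULT_SAMPLING_PARAMS. Values are illustrative.
_DEFAULT_SAMPLING_PARAMS = {"temperature": 1.0, "top_p": 1.0}

def resolve(name: str, request_value, server_defaults: dict):
    if request_value is not None:             # 1. explicit request value
        return request_value
    return server_defaults.get(               # 2. server-configured default
        name, _DEFAULT_SAMPLING_PARAMS[name]  # 3. hard-coded fallback
    )

assert resolve("temperature", 0.2, {"temperature": 0.7}) == 0.2
assert resolve("temperature", None, {"temperature": 0.7}) == 0.7
assert resolve("temperature", None, {}) == 1.0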
1176
+    def to_beam_search_params(
+        self,
+        max_tokens: int,
+        default_sampling_params: dict | None = None,
+    ) -> BeamSearchParams:
+        if default_sampling_params is None:
+            default_sampling_params = {}
+        n = self.n if self.n is not None else 1
+
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get("temperature", 1.0)
+
+        return BeamSearchParams(
+            beam_width=n,
+            max_tokens=max_tokens,
+            ignore_eos=self.ignore_eos,
+            temperature=temperature,
+            length_penalty=self.length_penalty,
+            include_stop_str_in_output=self.include_stop_str_in_output,
+        )
+
1197
+    def to_sampling_params(
+        self,
+        max_tokens: int,
+        logits_processor_pattern: str | None,
+        default_sampling_params: dict | None = None,
+    ) -> SamplingParams:
+        if default_sampling_params is None:
+            default_sampling_params = {}
+
+        # Default parameters
+        if (repetition_penalty := self.repetition_penalty) is None:
+            repetition_penalty = default_sampling_params.get(
+                "repetition_penalty",
+                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+            )
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+        if (top_p := self.top_p) is None:
+            top_p = default_sampling_params.get(
+                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
+            )
+        if (top_k := self.top_k) is None:
+            top_k = default_sampling_params.get(
+                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
+            )
+        if (min_p := self.min_p) is None:
+            min_p = default_sampling_params.get(
+                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
+            )
+
+        prompt_logprobs = self.prompt_logprobs
+        if prompt_logprobs is None and self.echo:
+            prompt_logprobs = self.logprobs
+
+        echo_without_generation = self.echo and self.max_tokens == 0
+
+        response_format = self.response_format
+        if response_format is not None:
+            # If structured outputs wasn't already enabled,
+            # we must enable it for these features to work
+            if self.structured_outputs is None:
+                self.structured_outputs = StructuredOutputsParams()
+
+            # Set structured output params for response format
+            if response_format.type == "json_object":
+                self.structured_outputs.json_object = True
+            elif response_format.type == "json_schema":
+                json_schema = response_format.json_schema
+                assert json_schema is not None
+                self.structured_outputs.json = json_schema.json_schema
+            elif response_format.type == "structural_tag":
+                structural_tag = response_format
+                assert structural_tag is not None and isinstance(
+                    structural_tag,
+                    (
+                        LegacyStructuralTagResponseFormat,
+                        StructuralTagResponseFormat,
+                    ),
+                )
+                s_tag_obj = structural_tag.model_dump(by_alias=True)
+                self.structured_outputs.structural_tag = json.dumps(s_tag_obj)
+
+        extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
+        if self.kv_transfer_params:
+            # Pass in kv_transfer_params via extra_args
+            extra_args["kv_transfer_params"] = self.kv_transfer_params
+        return SamplingParams.from_optional(
+            n=self.n,
+            presence_penalty=self.presence_penalty,
+            frequency_penalty=self.frequency_penalty,
+            repetition_penalty=repetition_penalty,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            seed=self.seed,
+            stop=self.stop,
+            stop_token_ids=self.stop_token_ids,
+            logprobs=self.logprobs,
+            ignore_eos=self.ignore_eos,
+            max_tokens=max_tokens if not echo_without_generation else 1,
+            min_tokens=self.min_tokens,
+            prompt_logprobs=prompt_logprobs,
+            skip_special_tokens=self.skip_special_tokens,
+            spaces_between_special_tokens=self.spaces_between_special_tokens,
+            include_stop_str_in_output=self.include_stop_str_in_output,
+            logits_processors=get_logits_processors(
+                self.logits_processors, logits_processor_pattern
+            ),
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            output_kind=RequestOutputKind.DELTA
+            if self.stream
+            else RequestOutputKind.FINAL_ONLY,
+            structured_outputs=self.structured_outputs,
+            logit_bias=self.logit_bias,
+            allowed_token_ids=self.allowed_token_ids,
+            extra_args=extra_args or None,
+            skip_clone=True,  # Created fresh per request, safe to skip clone
+        )
+
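The `response_format` branch above folds an OpenAI-style `response_format` into vLLM's `structured_outputs` constraints. An illustrative fragment; the exact wrapper field names are assumptions inferred from the code above, not a documented wire format:

# Illustrative request fragment; per the branch above, a json_schema
# response_format ends up as the structured_outputs.json constraint.
request_fragment = {
    "response_format": {
        "type": "json_schema",
        "json_schema": {  # parsed into response_format.json_schema
            "name": "answer",
            "json_schema": {"type": "object"},  # copied to structured_outputs.json
        },
    }
}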
1299
+    @model_validator(mode="before")
+    @classmethod
+    def check_structured_outputs_count(cls, data):
+        if data.get("structured_outputs", None) is None:
+            return data
+
+        structured_outputs_kwargs = data["structured_outputs"]
+        count = sum(
+            structured_outputs_kwargs.get(k) is not None
+            for k in ("json", "regex", "choice")
+        )
+        if count > 1:
+            raise VLLMValidationError(
+                "You can only use one kind of constraint for structured "
+                "outputs ('json', 'regex' or 'choice').",
+                parameter="structured_outputs",
+            )
+        return data
+
1318
+    @model_validator(mode="before")
+    @classmethod
+    def check_logprobs(cls, data):
+        if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
+            if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
+                raise VLLMValidationError(
+                    "`prompt_logprobs` are not available when `stream=True`.",
+                    parameter="prompt_logprobs",
+                )
+
+            if prompt_logprobs < 0 and prompt_logprobs != -1:
+                raise VLLMValidationError(
+                    "`prompt_logprobs` must be a non-negative value or -1.",
+                    parameter="prompt_logprobs",
+                    value=prompt_logprobs,
+                )
+        if (logprobs := data.get("logprobs")) is not None and logprobs < 0:
+            raise VLLMValidationError(
+                "`logprobs` must be a non-negative value.",
+                parameter="logprobs",
+                value=logprobs,
+            )
+
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_stream_options(cls, data):
+        if data.get("stream_options") and not data.get("stream"):
+            raise VLLMValidationError(
+                "Stream options can only be defined when `stream=True`.",
+                parameter="stream_options",
+            )
+
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_prompt_and_prompt_embeds(cls, data):
+        prompt = data.get("prompt")
+        prompt_embeds = data.get("prompt_embeds")
+
+        prompt_is_empty = prompt is None or (isinstance(prompt, str) and prompt == "")
+        embeds_is_empty = prompt_embeds is None or (
+            isinstance(prompt_embeds, list) and len(prompt_embeds) == 0
+        )
+
+        if prompt_is_empty and embeds_is_empty:
+            raise ValueError(
+                "Either prompt or prompt_embeds must be provided and non-empty."
+            )
+
+        return data
+
1372
+    @model_validator(mode="before")
+    @classmethod
+    def check_cache_salt_support(cls, data):
+        if data.get("cache_salt") is not None and (
+            not isinstance(data["cache_salt"], str) or not data["cache_salt"]
+        ):
+            raise ValueError(
+                "Parameter 'cache_salt' must be a non-empty string if provided."
+            )
+        return data
+
+
1384
+class CompletionLogProbs(OpenAIBaseModel):
+    text_offset: list[int] = Field(default_factory=list)
+    token_logprobs: list[float | None] = Field(default_factory=list)
+    tokens: list[str] = Field(default_factory=list)
+    top_logprobs: list[dict[str, float] | None] = Field(default_factory=list)
+
+
+class CompletionResponseChoice(OpenAIBaseModel):
+    index: int
+    text: str
+    logprobs: CompletionLogProbs | None = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = Field(
+        default=None,
+        description=(
+            "The stop string or token id that caused the completion "
+            "to stop, None if the completion finished for some other reason "
+            "including encountering the EOS token"
+        ),
+    )
+    token_ids: list[int] | None = None  # For response
+    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
+    prompt_token_ids: list[int] | None = None  # For prompt
+
+
+class CompletionResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+    object: Literal["text_completion"] = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[CompletionResponseChoice]
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
+    system_fingerprint: str | None = None
+    usage: UsageInfo
+
+    # vLLM-specific fields that are not in OpenAI spec
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None, description="KVTransfer parameters."
+    )
+
+
+class CompletionResponseStreamChoice(OpenAIBaseModel):
+    index: int
+    text: str
+    logprobs: CompletionLogProbs | None = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = Field(
+        default=None,
+        description=(
+            "The stop string or token id that caused the completion "
+            "to stop, None if the completion finished for some other reason "
+            "including encountering the EOS token"
+        ),
+    )
+    # not part of the OpenAI spec but for tracing the tokens
+    # prompt tokens are put into the choice to align with CompletionResponseChoice
+    prompt_token_ids: list[int] | None = None
+    token_ids: list[int] | None = None
+
+
1444
+class CompletionStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[CompletionResponseStreamChoice]
+    usage: UsageInfo | None = Field(default=None)
+
+
+class FunctionCall(OpenAIBaseModel):
+    name: str
+    arguments: str
+
+
+class ToolCall(OpenAIBaseModel):
+    id: str = Field(default_factory=make_tool_call_id)
+    type: Literal["function"] = "function"
+    function: FunctionCall
+
+
+class DeltaFunctionCall(BaseModel):
+    name: str | None = None
+    arguments: str | None = None
+
+
+# a tool call delta where everything is optional
+class DeltaToolCall(OpenAIBaseModel):
+    id: str | None = None
+    type: Literal["function"] | None = None
+    index: int
+    function: DeltaFunctionCall | None = None
+
+
+class ExtractedToolCallInformation(BaseModel):
+    # indicates whether tools were called
+    tools_called: bool
+
+    # extracted tool calls
+    tool_calls: list[ToolCall]
+
+    # content - per the OpenAI spec, content AND tool calls are only rarely
+    # returned together, but some models will do this intentionally
+    content: str | None = None
+
+
+class ChatMessage(OpenAIBaseModel):
+    role: str
+    content: str | None = None
+    refusal: str | None = None
+    annotations: OpenAIAnnotation | None = None
+    audio: OpenAIChatCompletionAudio | None = None
+    function_call: FunctionCall | None = None
+    tool_calls: list[ToolCall] = Field(default_factory=list)
+
+    # vLLM-specific fields that are not in OpenAI spec
+    reasoning: str | None = None
+    reasoning_content: str | None = None
+    """Deprecated: use `reasoning` instead."""
+
+    @model_validator(mode="after")
+    def handle_deprecated_reasoning_content(self):
+        """Copy reasoning to reasoning_content for backward compatibility."""
+        self.reasoning_content = self.reasoning
+        return self
+
+
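In contrast to the `mode="before"` validators earlier in the file, this hook runs on the parsed instance. A standalone sketch of the same deprecation shim, using a hypothetical `Msg` model:

# Standalone sketch of the mode="after" deprecation shim; `Msg` is hypothetical.
from pydantic import BaseModel, model_validator

class Msg(BaseModel):
    reasoning: str | None = None
    reasoning_content: str | None = None  # deprecated mirror of `reasoning`

    @model_validator(mode="after")
    def handle_deprecated_reasoning_content(self):
        # Runs after parsing, so fields are already validated attributes.
        self.reasoning_content = self.reasoning
        return self

assert Msg(reasoning="step 1").reasoning_content == "step 1"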
1510
+class ChatCompletionLogProb(OpenAIBaseModel):
+    token: str
+    logprob: float = -9999.0
+    bytes: list[int] | None = None
+
+
+class ChatCompletionLogProbsContent(ChatCompletionLogProb):
+    # Workaround: redefine the field-names cache so that it's not
+    # shared with the superclass.
+    field_names: ClassVar[set[str] | None] = None
+    top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)
+
+
+class ChatCompletionLogProbs(OpenAIBaseModel):
+    content: list[ChatCompletionLogProbsContent] | None = None
+
+
+class ChatCompletionResponseChoice(OpenAIBaseModel):
+    index: int
+    message: ChatMessage
+    logprobs: ChatCompletionLogProbs | None = None
+    # per OpenAI spec this is the default
+    finish_reason: str | None = "stop"
+    # not part of the OpenAI spec but included in vLLM for legacy reasons
+    stop_reason: int | str | None = None
+    # not part of the OpenAI spec but useful for tracing the tokens
+    # in agent scenarios
+    token_ids: list[int] | None = None
+
+
+class ChatCompletionResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: Literal["chat.completion"] = "chat.completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[ChatCompletionResponseChoice]
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
+    system_fingerprint: str | None = None
+    usage: UsageInfo
+
+    # vLLM-specific fields that are not in OpenAI spec
+    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
+    prompt_token_ids: list[int] | None = None
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None, description="KVTransfer parameters."
+    )
+
+
+class DeltaMessage(OpenAIBaseModel):
+    role: str | None = None
+    content: str | None = None
+    reasoning: str | None = None
+    reasoning_content: str | None = None
+    """Deprecated: use `reasoning` instead."""
+    tool_calls: list[DeltaToolCall] = Field(default_factory=list)
+
+    @model_validator(mode="after")
+    def handle_deprecated_reasoning_content(self):
+        """Copy reasoning to reasoning_content for backward compatibility."""
+        self.reasoning_content = self.reasoning
+        return self
+
+
1573
+class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
+    index: int
+    delta: DeltaMessage
+    logprobs: ChatCompletionLogProbs | None = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
+    # not part of the OpenAI spec but for tracing the tokens
+    token_ids: list[int] | None = None
+
+
+class ChatCompletionStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[ChatCompletionResponseStreamChoice]
+    usage: UsageInfo | None = Field(default=None)
+    # not part of the OpenAI spec but for tracing the tokens
+    prompt_token_ids: list[int] | None = None
+
+
+class TranscriptionResponseStreamChoice(OpenAIBaseModel):
+    delta: DeltaMessage
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
+
+
+class TranscriptionStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"trsc-{random_uuid()}")
+    object: Literal["transcription.chunk"] = "transcription.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[TranscriptionResponseStreamChoice]
+    usage: UsageInfo | None = Field(default=None)
+
+
+class InputTokensDetails(OpenAIBaseModel):
+    cached_tokens: int
+    input_tokens_per_turn: list[int] = Field(default_factory=list)
+    cached_tokens_per_turn: list[int] = Field(default_factory=list)
+
+
+class OutputTokensDetails(OpenAIBaseModel):
+    reasoning_tokens: int = 0
+    tool_output_tokens: int = 0
+    output_tokens_per_turn: list[int] = Field(default_factory=list)
+    tool_output_tokens_per_turn: list[int] = Field(default_factory=list)
+
+
+class ResponseUsage(OpenAIBaseModel):
+    input_tokens: int
+    input_tokens_details: InputTokensDetails
+    output_tokens: int
+    output_tokens_details: OutputTokensDetails
+    total_tokens: int
+
+
+def serialize_message(msg):
+    """
+    Serializes a single message.
+    """
+    if isinstance(msg, dict):
+        return msg
+    elif hasattr(msg, "to_dict"):
+        return msg.to_dict()
+    else:
+        # fall back to the pydantic dump
+        return msg.model_dump_json()
+
+
+def serialize_messages(msgs):
+    """
+    Serializes multiple messages.
+    """
+    return [serialize_message(msg) for msg in msgs] if msgs else None
+
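The fallback chain above handles three message shapes: plain dicts pass through, objects with `to_dict()` are converted, and anything else falls back to pydantic's JSON dump. A usage sketch (the `Point` model is hypothetical):

# Usage sketch for serialize_message's fallback chain; `Point` is hypothetical.
from pydantic import BaseModel

class Point(BaseModel):
    x: int
    y: int

assert serialize_message({"role": "user"}) == {"role": "user"}  # dict passthrough
assert serialize_message(Point(x=1, y=2)) == '{"x":1,"y":2}'    # pydantic JSON dump
assert serialize_messages(None) is None                         # empty input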
1649
+
+class ResponseRawMessageAndToken(OpenAIBaseModel):
+    """Class to show the raw message.
+    If message / tokens diverge, tokens are the source of truth."""
+
+    message: str
+    tokens: list[int]
+    type: Literal["raw_message_tokens"] = "raw_message_tokens"
+
+
+ResponseInputOutputMessage: TypeAlias = (
+    list[ChatCompletionMessageParam] | list[ResponseRawMessageAndToken]
+)
+
+
+class ResponsesResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
+    created_at: int = Field(default_factory=lambda: int(time.time()))
+    # error: Optional[ResponseError] = None
+    incomplete_details: IncompleteDetails | None = None
+    instructions: str | None = None
+    metadata: Metadata | None = None
+    model: str
+    object: Literal["response"] = "response"
+    output: list[ResponseOutputItem]
+    parallel_tool_calls: bool
+    temperature: float
+    tool_choice: ToolChoice
+    tools: list[Tool]
+    top_p: float
+    background: bool
+    max_output_tokens: int
+    max_tool_calls: int | None = None
+    previous_response_id: str | None = None
+    prompt: ResponsePrompt | None = None
+    reasoning: Reasoning | None = None
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"]
+    status: ResponseStatus
+    text: ResponseTextConfig | None = None
+    top_logprobs: int | None = None
+    truncation: Literal["auto", "disabled"]
+    usage: ResponseUsage | None = None
+    user: str | None = None
+
+    # --8<-- [start:responses-response-extra-params]
+    # These are populated when enable_response_messages is set to True
+    # NOTE: custom serialization is needed;
+    # see serialize_input_messages and serialize_output_messages
+    input_messages: ResponseInputOutputMessage | None = Field(
+        default=None,
+        description=(
+            "If enable_response_messages is set, shows the raw token input "
+            "to the model."
+        ),
+    )
+    output_messages: ResponseInputOutputMessage | None = Field(
+        default=None,
+        description=(
+            "If enable_response_messages is set, shows the raw token output "
+            "of the model."
+        ),
+    )
+    # --8<-- [end:responses-response-extra-params]
+
+    # NOTE: OpenAI harmony doesn't serialize TextContent properly.
+    # TODO: this fixes it for TextContent, but needs verification for tools etc.
+    # https://github.com/openai/harmony/issues/78
+    @field_serializer("output_messages", when_used="json")
+    def serialize_output_messages(self, msgs, _info):
+        return serialize_messages(msgs)
+
+    # NOTE: OpenAI harmony doesn't serialize TextContent properly; this fixes it.
+    # https://github.com/openai/harmony/issues/78
+    @field_serializer("input_messages", when_used="json")
+    def serialize_input_messages(self, msgs, _info):
+        return serialize_messages(msgs)
+
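`when_used="json"` restricts these serializers to JSON dumps, leaving Python-mode dumps untouched. A standalone sketch of the mechanism (the `Wrapper` model is hypothetical):

# Standalone sketch of field_serializer(when_used="json"); `Wrapper` is
# hypothetical. The hook fires for model_dump_json(), not model_dump().
from pydantic import BaseModel, field_serializer

class Wrapper(BaseModel):
    payload: list[int] | None = None

    @field_serializer("payload", when_used="json")
    def serialize_payload(self, value, _info):
        return None if value is None else [v * 10 for v in value]

w = Wrapper(payload=[1, 2])
assert w.model_dump() == {"payload": [1, 2]}         # python dump: untouched
assert w.model_dump_json() == '{"payload":[10,20]}'  # json dump: hook applied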
1724
+    @classmethod
+    def from_request(
+        cls,
+        request: ResponsesRequest,
+        sampling_params: SamplingParams,
+        model_name: str,
+        created_time: int,
+        output: list[ResponseOutputItem],
+        status: ResponseStatus,
+        usage: ResponseUsage | None = None,
+        input_messages: ResponseInputOutputMessage | None = None,
+        output_messages: ResponseInputOutputMessage | None = None,
+    ) -> "ResponsesResponse":
+        incomplete_details: IncompleteDetails | None = None
+        if status == "incomplete":
+            incomplete_details = IncompleteDetails(reason="max_output_tokens")
+        # TODO: implement the other reason for incomplete_details,
+        # which is content_filter
+        # incomplete_details = IncompleteDetails(reason='content_filter')
+        return cls(
+            id=request.request_id,
+            created_at=created_time,
+            incomplete_details=incomplete_details,
+            instructions=request.instructions,
+            metadata=request.metadata,
+            model=model_name,
+            output=output,
+            input_messages=input_messages,
+            output_messages=output_messages,
+            parallel_tool_calls=request.parallel_tool_calls,
+            temperature=sampling_params.temperature,
+            tool_choice=request.tool_choice,
+            tools=request.tools,
+            top_p=sampling_params.top_p,
+            background=request.background,
+            max_output_tokens=sampling_params.max_tokens,
+            max_tool_calls=request.max_tool_calls,
+            previous_response_id=request.previous_response_id,
+            prompt=request.prompt,
+            reasoning=request.reasoning,
+            service_tier=request.service_tier,
+            status=status,
+            text=request.text,
+            top_logprobs=sampling_params.logprobs,
+            truncation=request.truncation,
+            user=request.user,
+            usage=usage,
+        )
+
+
1774
+# TODO: this code can be removed once
+# https://github.com/openai/openai-python/issues/2634 has been resolved
+class ResponseReasoningPartDoneEvent(OpenAIBaseModel):
+    content_index: int
+    """The index of the content part that is done."""
+
+    item_id: str
+    """The ID of the output item that the content part was added to."""
+
+    output_index: int
+    """The index of the output item that the content part was added to."""
+
+    part: ResponseReasoningTextContent
+    """The content part that is done."""
+
+    sequence_number: int
+    """The sequence number of this event."""
+
+    type: Literal["response.reasoning_part.done"]
+    """The type of the event. Always `response.reasoning_part.done`."""
+
+
+# TODO: this code can be removed once
+# https://github.com/openai/openai-python/issues/2634 has been resolved
+class ResponseReasoningPartAddedEvent(OpenAIBaseModel):
+    content_index: int
+    """The index of the content part that was added."""
+
+    item_id: str
+    """The ID of the output item that the content part was added to."""
+
+    output_index: int
+    """The index of the output item that the content part was added to."""
+
+    part: ResponseReasoningTextContent
+    """The content part that was added."""
+
+    sequence_number: int
+    """The sequence number of this event."""
+
+    type: Literal["response.reasoning_part.added"]
+    """The type of the event. Always `response.reasoning_part.added`."""
+
+
+# vLLM Streaming Events
+# Note: we override the response type with the vLLM ResponsesResponse type
+class ResponseCompletedEvent(OpenAIResponseCompletedEvent):
+    response: ResponsesResponse  # type: ignore[override]
+
+
+class ResponseCreatedEvent(OpenAIResponseCreatedEvent):
+    response: ResponsesResponse  # type: ignore[override]
+
+
+class ResponseInProgressEvent(OpenAIResponseInProgressEvent):
+    response: ResponsesResponse  # type: ignore[override]
+
+
+StreamingResponsesResponse: TypeAlias = (
+    ResponseCreatedEvent
+    | ResponseInProgressEvent
+    | ResponseCompletedEvent
+    | ResponseOutputItemAddedEvent
+    | ResponseOutputItemDoneEvent
+    | ResponseContentPartAddedEvent
+    | ResponseContentPartDoneEvent
+    | ResponseReasoningTextDeltaEvent
+    | ResponseReasoningTextDoneEvent
+    | ResponseReasoningPartAddedEvent
+    | ResponseReasoningPartDoneEvent
+    | ResponseCodeInterpreterCallInProgressEvent
+    | ResponseCodeInterpreterCallCodeDeltaEvent
+    | ResponseWebSearchCallInProgressEvent
+    | ResponseWebSearchCallSearchingEvent
+    | ResponseWebSearchCallCompletedEvent
+    | ResponseCodeInterpreterCallCodeDoneEvent
+    | ResponseCodeInterpreterCallInterpretingEvent
+    | ResponseCodeInterpreterCallCompletedEvent
+    | ResponseMcpCallArgumentsDeltaEvent
+    | ResponseMcpCallArgumentsDoneEvent
+    | ResponseMcpCallInProgressEvent
+    | ResponseMcpCallCompletedEvent
+)
+
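Every member of this union carries a discriminating `type` string, so consumers can dispatch on it rather than on isinstance chains. A sketch of one way to consume the stream; the handler behavior is purely illustrative:

# Sketch: dispatching streamed events on their `type` discriminator.
# `events` is any iterable of StreamingResponsesResponse members.
def consume(events):
    for event in events:
        match event.type:
            case "response.created":
                print("request accepted:", event.response.id)
            case "response.reasoning_part.added":
                print("new reasoning part at index", event.content_index)
            case "response.completed":
                return event.response
            case _:
                pass  # ignore event kinds we don't care about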
1858
+
+class TokenizeCompletionRequest(OpenAIBaseModel):
+    model: str | None = None
+    prompt: str
+
+    add_special_tokens: bool = Field(
+        default=True,
+        description=(
+            "If true (the default), special tokens (e.g. BOS) will be added to "
+            "the prompt."
+        ),
+    )
+    return_token_strs: bool | None = Field(
+        default=False,
+        description=(
+            "If true, also return the token strings corresponding to the token ids."
+        ),
+    )
+
+
+class TokenizeChatRequest(OpenAIBaseModel):
+    model: str | None = None
+    messages: list[ChatCompletionMessageParam]
+
+    add_generation_prompt: bool = Field(
+        default=True,
+        description=(
+            "If true, the generation prompt will be added to the chat template. "
+            "This is a parameter used by the chat template in the tokenizer "
+            "config of the model."
+        ),
+    )
+    return_token_strs: bool | None = Field(
+        default=False,
+        description=(
+            "If true, also return the token strings corresponding to the token ids."
+        ),
+    )
+    continue_final_message: bool = Field(
+        default=False,
+        description=(
+            "If this is set, the chat will be formatted so that the final "
+            "message in the chat is open-ended, without any EOS tokens. The "
+            "model will continue this message rather than starting a new one. "
+            'This allows you to "prefill" part of the model\'s response for it. '
+            "Cannot be used at the same time as `add_generation_prompt`."
+        ),
+    )
+    add_special_tokens: bool = Field(
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens, so this should be set to false (as is the "
+            "default)."
+        ),
+    )
+    chat_template: str | None = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "As of transformers v4.44, the default chat template is no longer "
+            "allowed, so you must provide a chat template if the tokenizer "
+            "does not define one."
+        ),
+    )
+    chat_template_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=(
+            "Additional keyword args to pass to the template renderer. "
+            "Will be accessible by the chat template."
+        ),
+    )
+    mm_processor_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=("Additional kwargs to pass to the HF processor."),
+    )
+    tools: list[ChatCompletionToolsParam] | None = Field(
+        default=None,
+        description=("A list of tools the model may call."),
+    )
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_generation_prompt(cls, data):
+        if data.get("continue_final_message") and data.get("add_generation_prompt"):
+            raise ValueError(
+                "Cannot set both `continue_final_message` and "
+                "`add_generation_prompt` to True."
+            )
+        return data
+
+
+TokenizeRequest: TypeAlias = TokenizeCompletionRequest | TokenizeChatRequest
+
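The union accepts either request shape. Two illustrative payloads; the model name is a placeholder:

# Two illustrative TokenizeRequest payloads; the model name is a placeholder.
completion_style = {
    "model": "my-model",
    "prompt": "Hello, world!",
    "return_token_strs": True,
}
chat_style = {
    "model": "my-model",
    "messages": [{"role": "user", "content": "Hello!"}],
    "add_generation_prompt": True,
}
# A union-aware parser (e.g. pydantic's TypeAdapter) resolves the member:
# the first payload matches TokenizeCompletionRequest (it has `prompt`),
# the second matches TokenizeChatRequest (it has `messages`).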
1954
+
+class TokenizeResponse(OpenAIBaseModel):
+    count: int
+    max_model_len: int
+    tokens: list[int]
+    token_strs: list[str] | None = None
+
+
+class DetokenizeRequest(OpenAIBaseModel):
+    model: str | None = None
+    tokens: list[int]
+
+
+class DetokenizeResponse(OpenAIBaseModel):
+    prompt: str
+
+
+class TokenizerInfoResponse(OpenAIBaseModel):
+    """
+    Response containing tokenizer configuration
+    equivalent to tokenizer_config.json
+    """
+
+    model_config = ConfigDict(extra="allow")
+    tokenizer_class: str
+
+
+class LoadLoRAAdapterRequest(BaseModel):
+    lora_name: str
+    lora_path: str
+
+
+class UnloadLoRAAdapterRequest(BaseModel):
+    lora_name: str
+    lora_int_id: int | None = Field(default=None)
+
+
+## Protocols for Audio
+AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt"]
+
+
+class TranscriptionRequest(OpenAIBaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/audio/createTranscription
+
+    file: UploadFile
+    """
+    The audio file object (not file name) to transcribe, in one of these
+    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+    """
+
+    model: str | None = None
+    """ID of the model to use."""
+
2009
+    language: str | None = None
+    """The language of the input audio.
+
+    Supplying the input language in
+    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
+    will improve accuracy and latency.
+    """
+
+    prompt: str = Field(default="")
+    """An optional text to guide the model's style or continue a previous audio
+    segment.
+
+    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+    should match the audio language.
+    """
+
+    response_format: AudioResponseFormat = Field(default="json")
+    """
+    The format of the output, in one of these options: `json`, `text`, `srt`,
+    `verbose_json`, or `vtt`.
+    """
+
+    ## TODO (varun): Support if set to 0, certain thresholds are met !!
+
+    timestamp_granularities: list[Literal["word", "segment"]] = Field(
+        alias="timestamp_granularities[]", default=[]
+    )
+    """The timestamp granularities to populate for this transcription.
+
+    `response_format` must be set to `verbose_json` to use timestamp
+    granularities. Either or both of these options are supported: `word` or
+    `segment`. Note: There is no additional latency for segment timestamps,
+    but generating word timestamps incurs additional latency.
+    """
+
+    stream: bool | None = False
+    """When set, it will enable output to be streamed in a similar fashion
+    as the Chat Completion endpoint.
+    """
+    # --8<-- [start:transcription-extra-params]
+    # Flattened stream option to simplify form data.
+    stream_include_usage: bool | None = False
+    stream_continuous_usage_stats: bool | None = False
+
+    vllm_xargs: dict[str, str | int | float] | None = Field(
+        default=None,
+        description=(
+            "Additional request parameters with string or "
+            "numeric values, used by custom extensions."
+        ),
+    )
+    # --8<-- [end:transcription-extra-params]
+
+    to_language: str | None = None
+    """The language of the output audio we transcribe to.
+
+    Note that this is not currently used by the supported models; it is a
+    placeholder for future use, matching the translation API.
+    """
+
2069
+    # --8<-- [start:transcription-sampling-params]
+    temperature: float = Field(default=0.0)
+    """The sampling temperature, between 0 and 1.
+
+    Higher values like 0.8 will make the output more random, while lower values
+    like 0.2 will make it more focused / deterministic. If set to 0, the model
+    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
+    to automatically increase the temperature until certain thresholds are hit.
+    """
+
+    top_p: float | None = None
+    """Enables nucleus (top-p) sampling, where tokens are selected from the
+    smallest possible set whose cumulative probability exceeds `p`.
+    """
+
+    top_k: int | None = None
+    """Limits sampling to the `k` most probable tokens at each step."""
+
+    min_p: float | None = None
+    """Filters out tokens with a probability lower than `min_p`, ensuring a
+    minimum likelihood threshold during sampling.
+    """
+
+    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    """The seed to use for sampling."""
+
+    frequency_penalty: float | None = 0.0
+    """The frequency penalty to use for sampling."""
+
+    repetition_penalty: float | None = None
+    """The repetition penalty to use for sampling."""
+
+    presence_penalty: float | None = 0.0
+    """The presence penalty to use for sampling."""
+
+    max_completion_tokens: int | None = None
+    """The maximum number of tokens to generate."""
+    # --8<-- [end:transcription-sampling-params]
+
+    # Default sampling parameters for transcription requests.
+    _DEFAULT_SAMPLING_PARAMS: dict = {
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": 0,
+        "min_p": 0.0,
+    }
+
+    def to_sampling_params(
+        self, default_max_tokens: int, default_sampling_params: dict | None = None
+    ) -> SamplingParams:
+        max_tokens = default_max_tokens
+
+        if default_sampling_params is None:
+            default_sampling_params = {}
+
+        # Default parameters
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+        if (top_p := self.top_p) is None:
+            top_p = default_sampling_params.get(
+                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
+            )
+        if (top_k := self.top_k) is None:
+            top_k = default_sampling_params.get(
+                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
+            )
+        if (min_p := self.min_p) is None:
+            min_p = default_sampling_params.get(
+                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
+            )
+
+        if (repetition_penalty := self.repetition_penalty) is None:
+            repetition_penalty = default_sampling_params.get(
+                "repetition_penalty",
+                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+            )
+
+        return SamplingParams.from_optional(
+            temperature=temperature,
+            max_tokens=max_tokens,
+            seed=self.seed,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            frequency_penalty=self.frequency_penalty,
+            repetition_penalty=repetition_penalty,
+            presence_penalty=self.presence_penalty,
+            output_kind=RequestOutputKind.DELTA
+            if self.stream
+            else RequestOutputKind.FINAL_ONLY,
+            extra_args=self.vllm_xargs,
+            skip_clone=True,  # Created fresh per request, safe to skip clone
+        )
+
2166
+    @model_validator(mode="before")
+    @classmethod
+    def validate_transcription_request(cls, data):
+        if isinstance(data.get("file"), str):
+            raise HTTPException(
+                status_code=HTTPStatus.UNPROCESSABLE_ENTITY,
+                detail="Expected 'file' to be a file-like object, not 'str'.",
+            )
+
+        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
+        stream = data.get("stream", False)
+        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
+            # Find which specific stream option was set
+            invalid_param = next(
+                (so for so in stream_opts if data.get(so, False)),
+                "stream_include_usage",
+            )
+            raise VLLMValidationError(
+                "Stream options can only be defined when `stream=True`.",
+                parameter=invalid_param,
+            )
+
+        return data
+
+
2191
+# Transcription response objects
+class TranscriptionUsageAudio(OpenAIBaseModel):
+    type: Literal["duration"] = "duration"
+    seconds: int
+
+
+class TranscriptionResponse(OpenAIBaseModel):
+    text: str
+    """The transcribed text."""
+    usage: TranscriptionUsageAudio
+
+
+class TranscriptionWord(OpenAIBaseModel):
+    end: float
+    """End time of the word in seconds."""
+
+    start: float
+    """Start time of the word in seconds."""
+
+    word: str
+    """The text content of the word."""
+
+
+class TranscriptionSegment(OpenAIBaseModel):
+    id: int
+    """Unique identifier of the segment."""
+
+    avg_logprob: float | None = None
+    """Average logprob of the segment.
+
+    If the value is lower than -1, consider the logprobs failed.
+    """
+
+    compression_ratio: float | None = None
+    """Compression ratio of the segment.
+
+    If the value is greater than 2.4, consider the compression failed.
+    """
+
+    end: float
+    """End time of the segment in seconds."""
+
+    no_speech_prob: float | None = None
+    """Probability of no speech in the segment.
+
+    If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
+    this segment silent.
+    """
+
+    seek: int
+    """Seek offset of the segment."""
+
+    start: float
+    """Start time of the segment in seconds."""
+
+    temperature: float
+    """Temperature parameter used for generating the segment."""
+
+    text: str
+    """Text content of the segment."""
+
+    tokens: list[int]
+    """Array of token IDs for the text content."""
+
+
+class TranscriptionResponseVerbose(OpenAIBaseModel):
+    duration: str
+    """The duration of the input audio."""
+
+    language: str
+    """The language of the input audio."""
+
+    text: str
+    """The transcribed text."""
+
+    segments: list[TranscriptionSegment] | None = None
+    """Segments of the transcribed text and their corresponding details."""
+
+    words: list[TranscriptionWord] | None = None
+    """Extracted words and their corresponding timestamps."""
+
+
+TranscriptionResponseVariant: TypeAlias = (
+    TranscriptionResponse | TranscriptionResponseVerbose
+)
+
+
2278
+class TranslationResponseStreamChoice(OpenAIBaseModel):
+    delta: DeltaMessage
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
+
+
+class TranslationStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"trsl-{random_uuid()}")
+    object: Literal["translation.chunk"] = "translation.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[TranslationResponseStreamChoice]
+    usage: UsageInfo | None = Field(default=None)
+
+
+class TranslationRequest(OpenAIBaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/audio/createTranslation
+
+    file: UploadFile
+    """
+    The audio file object (not file name) to translate, in one of these
+    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+    """
+
+    model: str | None = None
+    """ID of the model to use."""
+
+    prompt: str = Field(default="")
+    """An optional text to guide the model's style or continue a previous audio
+    segment.
+
+    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+    should match the audio language.
+    """
+
+    response_format: AudioResponseFormat = Field(default="json")
+    """
+    The format of the output, in one of these options: `json`, `text`, `srt`,
+    `verbose_json`, or `vtt`.
+    """
+
+    # TODO support additional sampling parameters
+    # --8<-- [start:translation-sampling-params]
+    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    """The seed to use for sampling."""
+
+    temperature: float = Field(default=0.0)
+    """The sampling temperature, between 0 and 1.
+
+    Higher values like 0.8 will make the output more random, while lower values
+    like 0.2 will make it more focused / deterministic. If set to 0, the model
+    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
+    to automatically increase the temperature until certain thresholds are hit.
+    """
+    # --8<-- [end:translation-sampling-params]
+
+    # --8<-- [start:translation-extra-params]
+    language: str | None = None
+    """The language of the input audio we translate from.
+
+    Supplying the input language in
+    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
+    will improve accuracy.
+    """
+
+    to_language: str | None = None
+    """The language we translate the input audio to.
+
+    Please note that this is not supported by all models; refer to the specific
+    model documentation for more details.
+    For instance, Whisper only supports `to_language=en`.
+    """
+
+    stream: bool | None = False
+    """Custom field not present in the original OpenAI definition. When set,
+    it will enable output to be streamed in a similar fashion as the Chat
+    Completion endpoint.
+    """
+    # Flattened stream option to simplify form data.
+    stream_include_usage: bool | None = False
+    stream_continuous_usage_stats: bool | None = False
+
+    max_completion_tokens: int | None = None
+    """The maximum number of tokens to generate."""
+    # --8<-- [end:translation-extra-params]
+
2366
+    # Default sampling parameters for translation requests.
+    _DEFAULT_SAMPLING_PARAMS: dict = {
+        "temperature": 0,
+    }
+
+    def to_sampling_params(
+        self, default_max_tokens: int, default_sampling_params: dict | None = None
+    ) -> SamplingParams:
+        max_tokens = default_max_tokens
+
+        if default_sampling_params is None:
+            default_sampling_params = {}
+        # Default parameters
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+
+        return SamplingParams.from_optional(
+            temperature=temperature,
+            max_tokens=max_tokens,
+            seed=self.seed,
+            output_kind=RequestOutputKind.DELTA
+            if self.stream
+            else RequestOutputKind.FINAL_ONLY,
+            skip_clone=True,  # Created fresh per request, safe to skip clone
+        )
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_stream_options(cls, data):
+        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
+        stream = data.get("stream", False)
+        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
+            # Find which specific stream option was set
+            invalid_param = next(
+                (so for so in stream_opts if data.get(so, False)),
+                "stream_include_usage",
+            )
+            raise VLLMValidationError(
+                "Stream options can only be defined when `stream=True`.",
+                parameter=invalid_param,
+            )
+
+        return data
+
+
2413
+# Translation response objects
+class TranslationResponse(OpenAIBaseModel):
+    text: str
+    """The translated text."""
+
+
+class TranslationWord(OpenAIBaseModel):
+    end: float
+    """End time of the word in seconds."""
+
+    start: float
+    """Start time of the word in seconds."""
+
+    word: str
+    """The text content of the word."""
+
+
+class TranslationSegment(OpenAIBaseModel):
+    id: int
+    """Unique identifier of the segment."""
+
+    avg_logprob: float | None = None
+    """Average logprob of the segment.
+
+    If the value is lower than -1, consider the logprobs failed.
+    """
+
+    compression_ratio: float | None = None
+    """Compression ratio of the segment.
+
+    If the value is greater than 2.4, consider the compression failed.
+    """
+
+    end: float
+    """End time of the segment in seconds."""
+
+    no_speech_prob: float | None = None
+    """Probability of no speech in the segment.
+
+    If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
+    this segment silent.
+    """
+
+    seek: int
+    """Seek offset of the segment."""
+
+    start: float
+    """Start time of the segment in seconds."""
+
+    temperature: float
+    """Temperature parameter used for generating the segment."""
+
+    text: str
+    """Text content of the segment."""
+
+    tokens: list[int]
+    """Array of token IDs for the text content."""
+
+
+class TranslationResponseVerbose(OpenAIBaseModel):
+    duration: str
+    """The duration of the input audio."""
+
+    language: str
+    """The language of the input audio."""
+
+    text: str
+    """The translated text."""
+
+    segments: list[TranslationSegment] | None = None
+    """Segments of the translated text and their corresponding details."""
+
+    words: list[TranslationWord] | None = None
+    """Extracted words and their corresponding timestamps."""
+
+
+TranslationResponseVariant: TypeAlias = (
+    TranslationResponse | TranslationResponseVerbose
+)
+
+
2492
+####### Tokens IN <> Tokens OUT #######
+class GenerateRequest(BaseModel):
+    request_id: str = Field(
+        default_factory=random_uuid,
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "throughout the inference process and returned in the response."
+        ),
+    )
+    token_ids: list[int]
+    """The token ids to generate text from."""
+
+    # features: MultiModalFeatureSpec
+    # TODO (NickLucche): implement once Renderer work is completed
+    features: str | None = None
+    """The processed MM inputs for the model."""
+
+    sampling_params: SamplingParams
+    """The sampling parameters for the model."""
+
+    model: str | None = None
+
+    stream: bool | None = False
+    stream_options: StreamOptions | None = None
+    cache_salt: str | None = Field(
+        default=None,
+        description=(
+            "If specified, the prefix cache will be salted with the provided "
+            "string to prevent an attacker from guessing prompts in multi-user "
+            "environments. The salt should be random, protected from "
+            "access by 3rd parties, and long enough to be "
+            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+            "to 256 bits)."
+        ),
+    )
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."
+        ),
+    )
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None,
+        description="KVTransfer parameters used for disaggregated serving.",
+    )
+
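A minimal illustrative payload for this tokens-in endpoint; the token ids are placeholders and `SamplingParams` is assumed to be vLLM's `vllm.sampling_params.SamplingParams`:

# Illustrative GenerateRequest construction; values are placeholders.
from vllm.sampling_params import SamplingParams

req = GenerateRequest(
    token_ids=[1, 15043, 29991],  # pre-tokenized prompt
    sampling_params=SamplingParams(max_tokens=32, temperature=0.0),
    stream=False,
)
# request_id is auto-populated via default_factory=random_uuid when omitted.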
2541
+
+class GenerateResponseChoice(BaseModel):
+    index: int
+    logprobs: ChatCompletionLogProbs | None = None
+    # per OpenAI spec this is the default
+    finish_reason: str | None = "stop"
+    token_ids: list[int] | None = None
+
+
+class GenerateResponse(BaseModel):
+    request_id: str = Field(
+        default_factory=random_uuid,
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "throughout the inference process and returned in the response."
+        ),
+    )
+    choices: list[GenerateResponseChoice]
+
+    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
+
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None,
+        description="KVTransfer parameters used for disaggregated serving.",
+    )