vllm-cpu-avx512bf16 0.9.0.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (1175)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +170 -0
  3. vllm/_custom_ops.py +1742 -0
  4. vllm/_ipex_ops.py +243 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +15 -0
  8. vllm/adapter_commons/models.py +105 -0
  9. vllm/adapter_commons/request.py +25 -0
  10. vllm/adapter_commons/utils.py +92 -0
  11. vllm/adapter_commons/worker_manager.py +38 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +44 -0
  14. vllm/assets/base.py +40 -0
  15. vllm/assets/image.py +33 -0
  16. vllm/assets/video.py +114 -0
  17. vllm/attention/__init__.py +19 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +306 -0
  20. vllm/attention/backends/blocksparse_attn.py +457 -0
  21. vllm/attention/backends/cpu_mla.py +305 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1494 -0
  23. vllm/attention/backends/flash_attn.py +999 -0
  24. vllm/attention/backends/flashinfer.py +1100 -0
  25. vllm/attention/backends/flashmla.py +242 -0
  26. vllm/attention/backends/hpu_attn.py +309 -0
  27. vllm/attention/backends/ipex_attn.py +394 -0
  28. vllm/attention/backends/mla/__init__.py +0 -0
  29. vllm/attention/backends/mla/common.py +1381 -0
  30. vllm/attention/backends/pallas.py +347 -0
  31. vllm/attention/backends/placeholder_attn.py +399 -0
  32. vllm/attention/backends/rocm_aiter_mla.py +435 -0
  33. vllm/attention/backends/rocm_flash_attn.py +970 -0
  34. vllm/attention/backends/torch_sdpa.py +691 -0
  35. vllm/attention/backends/triton_mla.py +113 -0
  36. vllm/attention/backends/utils.py +609 -0
  37. vllm/attention/backends/xformers.py +798 -0
  38. vllm/attention/layer.py +452 -0
  39. vllm/attention/ops/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  41. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +432 -0
  42. vllm/attention/ops/blocksparse_attention/interface.py +238 -0
  43. vllm/attention/ops/blocksparse_attention/utils.py +245 -0
  44. vllm/attention/ops/chunked_prefill_paged_decode.py +367 -0
  45. vllm/attention/ops/flashmla.py +115 -0
  46. vllm/attention/ops/hpu_paged_attn.py +87 -0
  47. vllm/attention/ops/ipex_attn.py +194 -0
  48. vllm/attention/ops/merge_attn_states.py +42 -0
  49. vllm/attention/ops/nki_flash_attn.py +905 -0
  50. vllm/attention/ops/paged_attn.py +255 -0
  51. vllm/attention/ops/prefix_prefill.py +901 -0
  52. vllm/attention/ops/rocm_aiter_mla.py +99 -0
  53. vllm/attention/ops/rocm_aiter_paged_attn.py +101 -0
  54. vllm/attention/ops/triton_decode_attention.py +673 -0
  55. vllm/attention/ops/triton_flash_attention.py +1374 -0
  56. vllm/attention/ops/triton_merge_attn_states.py +96 -0
  57. vllm/attention/ops/triton_unified_attention.py +337 -0
  58. vllm/attention/selector.py +186 -0
  59. vllm/attention/utils/fa_utils.py +54 -0
  60. vllm/beam_search.py +82 -0
  61. vllm/benchmarks/__init__.py +0 -0
  62. vllm/benchmarks/datasets.py +921 -0
  63. vllm/benchmarks/endpoint_request_func.py +160 -0
  64. vllm/benchmarks/latency.py +184 -0
  65. vllm/benchmarks/serve.py +925 -0
  66. vllm/benchmarks/throughput.py +609 -0
  67. vllm/benchmarks/utils.py +69 -0
  68. vllm/collect_env.py +818 -0
  69. vllm/compilation/__init__.py +0 -0
  70. vllm/compilation/activation_quant_fusion.py +88 -0
  71. vllm/compilation/backends.py +560 -0
  72. vllm/compilation/base_piecewise_backend.py +71 -0
  73. vllm/compilation/collective_fusion.py +126 -0
  74. vllm/compilation/compiler_interface.py +533 -0
  75. vllm/compilation/counter.py +33 -0
  76. vllm/compilation/cuda_piecewise_backend.py +213 -0
  77. vllm/compilation/decorators.py +249 -0
  78. vllm/compilation/fix_functionalization.py +190 -0
  79. vllm/compilation/fusion.py +617 -0
  80. vllm/compilation/fx_utils.py +61 -0
  81. vllm/compilation/inductor_pass.py +114 -0
  82. vllm/compilation/monitor.py +38 -0
  83. vllm/compilation/multi_output_match.py +108 -0
  84. vllm/compilation/noop_elimination.py +136 -0
  85. vllm/compilation/pass_manager.py +77 -0
  86. vllm/compilation/sequence_parallelism.py +267 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +41 -0
  88. vllm/compilation/vllm_inductor_pass.py +66 -0
  89. vllm/compilation/wrapper.py +129 -0
  90. vllm/config.py +4600 -0
  91. vllm/connections.py +173 -0
  92. vllm/core/__init__.py +0 -0
  93. vllm/core/block/__init__.py +0 -0
  94. vllm/core/block/block_table.py +398 -0
  95. vllm/core/block/common.py +370 -0
  96. vllm/core/block/cpu_gpu_block_allocator.py +440 -0
  97. vllm/core/block/interfaces.py +318 -0
  98. vllm/core/block/naive_block.py +465 -0
  99. vllm/core/block/prefix_caching_block.py +1134 -0
  100. vllm/core/block/utils.py +27 -0
  101. vllm/core/block_manager.py +520 -0
  102. vllm/core/evictor.py +156 -0
  103. vllm/core/interfaces.py +134 -0
  104. vllm/core/placeholder_block_space_manager.py +99 -0
  105. vllm/core/scheduler.py +2092 -0
  106. vllm/device_allocator/__init__.py +0 -0
  107. vllm/device_allocator/cumem.py +280 -0
  108. vllm/distributed/__init__.py +5 -0
  109. vllm/distributed/communication_op.py +40 -0
  110. vllm/distributed/device_communicators/__init__.py +0 -0
  111. vllm/distributed/device_communicators/all2all.py +126 -0
  112. vllm/distributed/device_communicators/base_device_communicator.py +260 -0
  113. vllm/distributed/device_communicators/cpu_communicator.py +144 -0
  114. vllm/distributed/device_communicators/cuda_communicator.py +167 -0
  115. vllm/distributed/device_communicators/cuda_wrapper.py +179 -0
  116. vllm/distributed/device_communicators/custom_all_reduce.py +303 -0
  117. vllm/distributed/device_communicators/custom_all_reduce_utils.py +258 -0
  118. vllm/distributed/device_communicators/hpu_communicator.py +45 -0
  119. vllm/distributed/device_communicators/neuron_communicator.py +19 -0
  120. vllm/distributed/device_communicators/pynccl.py +217 -0
  121. vllm/distributed/device_communicators/pynccl_wrapper.py +340 -0
  122. vllm/distributed/device_communicators/shm_broadcast.py +541 -0
  123. vllm/distributed/device_communicators/tpu_communicator.py +102 -0
  124. vllm/distributed/device_communicators/xpu_communicator.py +54 -0
  125. vllm/distributed/kv_events.py +296 -0
  126. vllm/distributed/kv_transfer/README.md +29 -0
  127. vllm/distributed/kv_transfer/__init__.py +11 -0
  128. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  129. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  130. vllm/distributed/kv_transfer/kv_connector/base.py +127 -0
  131. vllm/distributed/kv_transfer/kv_connector/factory.py +126 -0
  132. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +98 -0
  133. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +202 -0
  134. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +328 -0
  135. vllm/distributed/kv_transfer/kv_connector/utils.py +91 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +5 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/base.py +259 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +133 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +189 -0
  140. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +851 -0
  141. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +383 -0
  142. vllm/distributed/kv_transfer/kv_connector_agent.py +76 -0
  143. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  144. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +174 -0
  145. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +160 -0
  146. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +236 -0
  147. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  148. vllm/distributed/kv_transfer/kv_pipe/base.py +66 -0
  149. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +279 -0
  150. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +279 -0
  151. vllm/distributed/kv_transfer/kv_transfer_state.py +70 -0
  152. vllm/distributed/parallel_state.py +1294 -0
  153. vllm/distributed/utils.py +520 -0
  154. vllm/engine/__init__.py +0 -0
  155. vllm/engine/arg_utils.py +1649 -0
  156. vllm/engine/async_llm_engine.py +1274 -0
  157. vllm/engine/async_timeout.py +191 -0
  158. vllm/engine/llm_engine.py +2153 -0
  159. vllm/engine/metrics.py +717 -0
  160. vllm/engine/metrics_types.py +96 -0
  161. vllm/engine/multiprocessing/__init__.py +188 -0
  162. vllm/engine/multiprocessing/client.py +755 -0
  163. vllm/engine/multiprocessing/engine.py +459 -0
  164. vllm/engine/output_processor/__init__.py +0 -0
  165. vllm/engine/output_processor/interfaces.py +74 -0
  166. vllm/engine/output_processor/multi_step.py +215 -0
  167. vllm/engine/output_processor/single_step.py +144 -0
  168. vllm/engine/output_processor/stop_checker.py +130 -0
  169. vllm/engine/output_processor/util.py +27 -0
  170. vllm/engine/protocol.py +310 -0
  171. vllm/entrypoints/__init__.py +0 -0
  172. vllm/entrypoints/api_server.py +177 -0
  173. vllm/entrypoints/chat_utils.py +1298 -0
  174. vllm/entrypoints/cli/__init__.py +0 -0
  175. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  176. vllm/entrypoints/cli/benchmark/base.py +38 -0
  177. vllm/entrypoints/cli/benchmark/latency.py +29 -0
  178. vllm/entrypoints/cli/benchmark/main.py +53 -0
  179. vllm/entrypoints/cli/benchmark/serve.py +29 -0
  180. vllm/entrypoints/cli/benchmark/throughput.py +29 -0
  181. vllm/entrypoints/cli/collect_env.py +34 -0
  182. vllm/entrypoints/cli/main.py +62 -0
  183. vllm/entrypoints/cli/openai.py +204 -0
  184. vllm/entrypoints/cli/serve.py +141 -0
  185. vllm/entrypoints/cli/types.py +24 -0
  186. vllm/entrypoints/launcher.py +146 -0
  187. vllm/entrypoints/llm.py +1503 -0
  188. vllm/entrypoints/logger.py +49 -0
  189. vllm/entrypoints/openai/__init__.py +0 -0
  190. vllm/entrypoints/openai/api_server.py +1376 -0
  191. vllm/entrypoints/openai/cli_args.py +306 -0
  192. vllm/entrypoints/openai/logits_processors.py +89 -0
  193. vllm/entrypoints/openai/protocol.py +1890 -0
  194. vllm/entrypoints/openai/run_batch.py +439 -0
  195. vllm/entrypoints/openai/serving_chat.py +1192 -0
  196. vllm/entrypoints/openai/serving_classification.py +159 -0
  197. vllm/entrypoints/openai/serving_completion.py +590 -0
  198. vllm/entrypoints/openai/serving_embedding.py +200 -0
  199. vllm/entrypoints/openai/serving_engine.py +985 -0
  200. vllm/entrypoints/openai/serving_models.py +314 -0
  201. vllm/entrypoints/openai/serving_pooling.py +231 -0
  202. vllm/entrypoints/openai/serving_score.py +432 -0
  203. vllm/entrypoints/openai/serving_tokenization.py +151 -0
  204. vllm/entrypoints/openai/serving_transcription.py +421 -0
  205. vllm/entrypoints/openai/tool_parsers/__init__.py +22 -0
  206. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +163 -0
  207. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +369 -0
  208. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +258 -0
  209. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +236 -0
  210. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +370 -0
  211. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +215 -0
  212. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +307 -0
  213. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +302 -0
  214. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +266 -0
  215. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +342 -0
  216. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +111 -0
  217. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +296 -0
  218. vllm/entrypoints/openai/tool_parsers/utils.py +123 -0
  219. vllm/entrypoints/score_utils.py +49 -0
  220. vllm/entrypoints/ssl.py +74 -0
  221. vllm/entrypoints/utils.py +219 -0
  222. vllm/env_override.py +34 -0
  223. vllm/envs.py +896 -0
  224. vllm/executor/__init__.py +0 -0
  225. vllm/executor/executor_base.py +400 -0
  226. vllm/executor/mp_distributed_executor.py +243 -0
  227. vllm/executor/msgspec_utils.py +29 -0
  228. vllm/executor/multiproc_worker_utils.py +312 -0
  229. vllm/executor/ray_distributed_executor.py +700 -0
  230. vllm/executor/ray_utils.py +398 -0
  231. vllm/executor/uniproc_executor.py +138 -0
  232. vllm/forward_context.py +147 -0
  233. vllm/inputs/__init__.py +40 -0
  234. vllm/inputs/data.py +330 -0
  235. vllm/inputs/parse.py +150 -0
  236. vllm/inputs/preprocess.py +908 -0
  237. vllm/inputs/registry.py +214 -0
  238. vllm/jsontree.py +79 -0
  239. vllm/logger.py +211 -0
  240. vllm/logging_utils/__init__.py +7 -0
  241. vllm/logging_utils/dump_input.py +84 -0
  242. vllm/logging_utils/formatter.py +17 -0
  243. vllm/logits_process.py +118 -0
  244. vllm/lora/__init__.py +0 -0
  245. vllm/lora/fully_sharded_layers.py +354 -0
  246. vllm/lora/layers.py +1284 -0
  247. vllm/lora/lora.py +198 -0
  248. vllm/lora/models.py +817 -0
  249. vllm/lora/ops/__init__.py +0 -0
  250. vllm/lora/ops/torch_ops/__init__.py +15 -0
  251. vllm/lora/ops/torch_ops/lora_ops.py +115 -0
  252. vllm/lora/ops/triton_ops/__init__.py +11 -0
  253. vllm/lora/ops/triton_ops/kernel_utils.py +242 -0
  254. vllm/lora/ops/triton_ops/lora_expand_op.py +289 -0
  255. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +147 -0
  256. vllm/lora/ops/triton_ops/lora_shrink_op.py +243 -0
  257. vllm/lora/ops/triton_ops/utils.py +119 -0
  258. vllm/lora/ops/xla_ops/__init__.py +6 -0
  259. vllm/lora/ops/xla_ops/lora_ops.py +106 -0
  260. vllm/lora/ops/xla_ops/pallas.py +133 -0
  261. vllm/lora/peft_helper.py +135 -0
  262. vllm/lora/punica_wrapper/__init__.py +9 -0
  263. vllm/lora/punica_wrapper/punica_base.py +484 -0
  264. vllm/lora/punica_wrapper/punica_cpu.py +348 -0
  265. vllm/lora/punica_wrapper/punica_gpu.py +289 -0
  266. vllm/lora/punica_wrapper/punica_hpu.py +144 -0
  267. vllm/lora/punica_wrapper/punica_selector.py +19 -0
  268. vllm/lora/punica_wrapper/punica_tpu.py +325 -0
  269. vllm/lora/punica_wrapper/utils.py +163 -0
  270. vllm/lora/request.py +98 -0
  271. vllm/lora/resolver.py +84 -0
  272. vllm/lora/utils.py +239 -0
  273. vllm/lora/worker_manager.py +253 -0
  274. vllm/model_executor/__init__.py +15 -0
  275. vllm/model_executor/custom_op.py +151 -0
  276. vllm/model_executor/guided_decoding/__init__.py +180 -0
  277. vllm/model_executor/guided_decoding/guidance_decoding.py +62 -0
  278. vllm/model_executor/guided_decoding/guidance_logits_processors.py +103 -0
  279. vllm/model_executor/guided_decoding/guided_fields.py +42 -0
  280. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +66 -0
  281. vllm/model_executor/guided_decoding/outlines_decoding.py +154 -0
  282. vllm/model_executor/guided_decoding/outlines_logits_processors.py +283 -0
  283. vllm/model_executor/guided_decoding/utils.py +241 -0
  284. vllm/model_executor/guided_decoding/xgrammar_decoding.py +425 -0
  285. vllm/model_executor/layers/__init__.py +0 -0
  286. vllm/model_executor/layers/activation.py +368 -0
  287. vllm/model_executor/layers/fused_moe/__init__.py +53 -0
  288. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  289. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  290. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  291. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  292. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  293. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  294. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  295. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  296. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  297. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  298. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  299. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  300. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  301. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  302. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  303. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  304. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  305. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  449. vllm/model_executor/layers/fused_moe/cutlass_moe.py +382 -0
  450. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +227 -0
  451. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +755 -0
  452. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +231 -0
  453. vllm/model_executor/layers/fused_moe/fused_moe.py +1722 -0
  454. vllm/model_executor/layers/fused_moe/layer.py +1366 -0
  455. vllm/model_executor/layers/fused_moe/modular_kernel.py +364 -0
  456. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +242 -0
  457. vllm/model_executor/layers/fused_moe/moe_pallas.py +83 -0
  458. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +188 -0
  459. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +59 -0
  460. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +146 -0
  461. vllm/model_executor/layers/fused_moe/prepare_finalize.py +60 -0
  462. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +372 -0
  463. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +112 -0
  464. vllm/model_executor/layers/fused_moe/utils.py +97 -0
  465. vllm/model_executor/layers/layernorm.py +287 -0
  466. vllm/model_executor/layers/lightning_attn.py +651 -0
  467. vllm/model_executor/layers/linear.py +1523 -0
  468. vllm/model_executor/layers/logits_processor.py +196 -0
  469. vllm/model_executor/layers/mamba/__init__.py +0 -0
  470. vllm/model_executor/layers/mamba/mamba2_metadata.py +124 -0
  471. vllm/model_executor/layers/mamba/mamba_mixer.py +244 -0
  472. vllm/model_executor/layers/mamba/mamba_mixer2.py +615 -0
  473. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  474. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +104 -0
  475. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +413 -0
  476. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +261 -0
  477. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +588 -0
  478. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +750 -0
  479. vllm/model_executor/layers/mamba/ops/ssd_combined.py +231 -0
  480. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +205 -0
  481. vllm/model_executor/layers/pooler.py +343 -0
  482. vllm/model_executor/layers/quantization/__init__.py +156 -0
  483. vllm/model_executor/layers/quantization/aqlm.py +375 -0
  484. vllm/model_executor/layers/quantization/auto_round.py +308 -0
  485. vllm/model_executor/layers/quantization/awq.py +185 -0
  486. vllm/model_executor/layers/quantization/awq_marlin.py +518 -0
  487. vllm/model_executor/layers/quantization/awq_triton.py +319 -0
  488. vllm/model_executor/layers/quantization/base_config.py +150 -0
  489. vllm/model_executor/layers/quantization/bitblas.py +460 -0
  490. vllm/model_executor/layers/quantization/bitsandbytes.py +397 -0
  491. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  492. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +644 -0
  493. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1252 -0
  494. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +21 -0
  495. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +357 -0
  496. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +54 -0
  497. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +159 -0
  498. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +92 -0
  499. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +120 -0
  500. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +149 -0
  501. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +110 -0
  502. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +200 -0
  503. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +205 -0
  504. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +214 -0
  505. vllm/model_executor/layers/quantization/deepspeedfp.py +194 -0
  506. vllm/model_executor/layers/quantization/experts_int8.py +195 -0
  507. vllm/model_executor/layers/quantization/fbgemm_fp8.py +171 -0
  508. vllm/model_executor/layers/quantization/fp8.py +876 -0
  509. vllm/model_executor/layers/quantization/gguf.py +564 -0
  510. vllm/model_executor/layers/quantization/gptq.py +277 -0
  511. vllm/model_executor/layers/quantization/gptq_bitblas.py +444 -0
  512. vllm/model_executor/layers/quantization/gptq_marlin.py +647 -0
  513. vllm/model_executor/layers/quantization/gptq_marlin_24.py +296 -0
  514. vllm/model_executor/layers/quantization/hqq_marlin.py +331 -0
  515. vllm/model_executor/layers/quantization/ipex_quant.py +249 -0
  516. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  517. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +89 -0
  518. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +82 -0
  519. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  520. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +299 -0
  521. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +142 -0
  522. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +119 -0
  523. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +130 -0
  524. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +66 -0
  525. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +86 -0
  526. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +119 -0
  527. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +136 -0
  528. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +40 -0
  529. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
  530. vllm/model_executor/layers/quantization/kv_cache.py +138 -0
  531. vllm/model_executor/layers/quantization/marlin.py +260 -0
  532. vllm/model_executor/layers/quantization/modelopt.py +734 -0
  533. vllm/model_executor/layers/quantization/moe_wna16.py +448 -0
  534. vllm/model_executor/layers/quantization/neuron_quant.py +68 -0
  535. vllm/model_executor/layers/quantization/ptpc_fp8.py +126 -0
  536. vllm/model_executor/layers/quantization/qqq.py +274 -0
  537. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  538. vllm/model_executor/layers/quantization/quark/quark.py +440 -0
  539. vllm/model_executor/layers/quantization/quark/quark_moe.py +236 -0
  540. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +8 -0
  541. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +54 -0
  542. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +125 -0
  543. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +145 -0
  544. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +121 -0
  545. vllm/model_executor/layers/quantization/quark/utils.py +104 -0
  546. vllm/model_executor/layers/quantization/schema.py +85 -0
  547. vllm/model_executor/layers/quantization/torchao.py +143 -0
  548. vllm/model_executor/layers/quantization/tpu_int8.py +120 -0
  549. vllm/model_executor/layers/quantization/utils/__init__.py +5 -0
  550. vllm/model_executor/layers/quantization/utils/allspark_utils.py +51 -0
  551. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +207 -0
  552. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  553. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  554. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  555. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  556. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  557. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  558. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  559. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  560. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  561. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  562. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  563. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  564. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  565. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  566. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  567. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  568. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  569. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  570. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  571. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  572. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  573. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  574. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  575. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  576. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  577. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  578. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  579. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  580. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  581. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  582. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  583. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  754. vllm/model_executor/layers/quantization/utils/fp8_utils.py +611 -0
  755. vllm/model_executor/layers/quantization/utils/gptq_utils.py +94 -0
  756. vllm/model_executor/layers/quantization/utils/int8_utils.py +484 -0
  757. vllm/model_executor/layers/quantization/utils/layer_utils.py +39 -0
  758. vllm/model_executor/layers/quantization/utils/machete_utils.py +32 -0
  759. vllm/model_executor/layers/quantization/utils/marlin_utils.py +475 -0
  760. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +277 -0
  761. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +324 -0
  762. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +164 -0
  763. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +463 -0
  764. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +125 -0
  765. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +44 -0
  766. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +61 -0
  767. vllm/model_executor/layers/quantization/utils/quant_utils.py +572 -0
  768. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +404 -0
  769. vllm/model_executor/layers/rejection_sampler.py +405 -0
  770. vllm/model_executor/layers/resampler.py +269 -0
  771. vllm/model_executor/layers/rotary_embedding.py +1861 -0
  772. vllm/model_executor/layers/sampler.py +1203 -0
  773. vllm/model_executor/layers/spec_decode_base_sampler.py +258 -0
  774. vllm/model_executor/layers/typical_acceptance_sampler.py +165 -0
  775. vllm/model_executor/layers/utils.py +99 -0
  776. vllm/model_executor/layers/vocab_parallel_embedding.py +486 -0
  777. vllm/model_executor/model_loader/__init__.py +75 -0
  778. vllm/model_executor/model_loader/base_loader.py +24 -0
  779. vllm/model_executor/model_loader/bitsandbytes_loader.py +582 -0
  780. vllm/model_executor/model_loader/default_loader.py +295 -0
  781. vllm/model_executor/model_loader/dummy_loader.py +37 -0
  782. vllm/model_executor/model_loader/gguf_loader.py +113 -0
  783. vllm/model_executor/model_loader/neuron.py +475 -0
  784. vllm/model_executor/model_loader/neuronx_distributed.py +622 -0
  785. vllm/model_executor/model_loader/runai_streamer_loader.py +120 -0
  786. vllm/model_executor/model_loader/sharded_state_loader.py +211 -0
  787. vllm/model_executor/model_loader/tensorizer.py +632 -0
  788. vllm/model_executor/model_loader/tensorizer_loader.py +122 -0
  789. vllm/model_executor/model_loader/utils.py +301 -0
  790. vllm/model_executor/model_loader/weight_utils.py +781 -0
  791. vllm/model_executor/models/__init__.py +27 -0
  792. vllm/model_executor/models/adapters.py +247 -0
  793. vllm/model_executor/models/aimv2.py +199 -0
  794. vllm/model_executor/models/arctic.py +558 -0
  795. vllm/model_executor/models/aria.py +656 -0
  796. vllm/model_executor/models/aya_vision.py +461 -0
  797. vllm/model_executor/models/baichuan.py +473 -0
  798. vllm/model_executor/models/bamba.py +542 -0
  799. vllm/model_executor/models/bart.py +937 -0
  800. vllm/model_executor/models/bert.py +517 -0
  801. vllm/model_executor/models/bert_with_rope.py +714 -0
  802. vllm/model_executor/models/blip.py +338 -0
  803. vllm/model_executor/models/blip2.py +717 -0
  804. vllm/model_executor/models/bloom.py +372 -0
  805. vllm/model_executor/models/chameleon.py +1135 -0
  806. vllm/model_executor/models/chatglm.py +477 -0
  807. vllm/model_executor/models/clip.py +411 -0
  808. vllm/model_executor/models/commandr.py +471 -0
  809. vllm/model_executor/models/constant_size_cache.py +136 -0
  810. vllm/model_executor/models/dbrx.py +471 -0
  811. vllm/model_executor/models/deepseek.py +485 -0
  812. vllm/model_executor/models/deepseek_mtp.py +268 -0
  813. vllm/model_executor/models/deepseek_v2.py +842 -0
  814. vllm/model_executor/models/deepseek_vl2.py +647 -0
  815. vllm/model_executor/models/eagle.py +259 -0
  816. vllm/model_executor/models/exaone.py +550 -0
  817. vllm/model_executor/models/fairseq2_llama.py +153 -0
  818. vllm/model_executor/models/falcon.py +509 -0
  819. vllm/model_executor/models/falcon_h1.py +684 -0
  820. vllm/model_executor/models/florence2.py +1102 -0
  821. vllm/model_executor/models/fuyu.py +388 -0
  822. vllm/model_executor/models/gemma.py +424 -0
  823. vllm/model_executor/models/gemma2.py +424 -0
  824. vllm/model_executor/models/gemma3.py +532 -0
  825. vllm/model_executor/models/gemma3_mm.py +708 -0
  826. vllm/model_executor/models/glm.py +22 -0
  827. vllm/model_executor/models/glm4.py +304 -0
  828. vllm/model_executor/models/glm4v.py +647 -0
  829. vllm/model_executor/models/gpt2.py +327 -0
  830. vllm/model_executor/models/gpt_bigcode.py +334 -0
  831. vllm/model_executor/models/gpt_j.py +338 -0
  832. vllm/model_executor/models/gpt_neox.py +331 -0
  833. vllm/model_executor/models/granite.py +492 -0
  834. vllm/model_executor/models/granite_speech.py +778 -0
  835. vllm/model_executor/models/granitemoe.py +436 -0
  836. vllm/model_executor/models/granitemoehybrid.py +585 -0
  837. vllm/model_executor/models/granitemoeshared.py +340 -0
  838. vllm/model_executor/models/gritlm.py +223 -0
  839. vllm/model_executor/models/grok1.py +545 -0
  840. vllm/model_executor/models/h2ovl.py +545 -0
  841. vllm/model_executor/models/idefics2_vision_model.py +388 -0
  842. vllm/model_executor/models/idefics3.py +767 -0
  843. vllm/model_executor/models/interfaces.py +571 -0
  844. vllm/model_executor/models/interfaces_base.py +163 -0
  845. vllm/model_executor/models/intern_vit.py +475 -0
  846. vllm/model_executor/models/internlm2.py +454 -0
  847. vllm/model_executor/models/internlm2_ve.py +146 -0
  848. vllm/model_executor/models/internvl.py +1405 -0
  849. vllm/model_executor/models/jais.py +372 -0
  850. vllm/model_executor/models/jamba.py +591 -0
  851. vllm/model_executor/models/kimi_vl.py +576 -0
  852. vllm/model_executor/models/llama.py +643 -0
  853. vllm/model_executor/models/llama4.py +531 -0
  854. vllm/model_executor/models/llama_eagle.py +166 -0
  855. vllm/model_executor/models/llama_eagle3.py +257 -0
  856. vllm/model_executor/models/llava.py +865 -0
  857. vllm/model_executor/models/llava_next.py +585 -0
  858. vllm/model_executor/models/llava_next_video.py +470 -0
  859. vllm/model_executor/models/llava_onevision.py +955 -0
  860. vllm/model_executor/models/mamba.py +272 -0
  861. vllm/model_executor/models/mamba2.py +302 -0
  862. vllm/model_executor/models/mamba_cache.py +75 -0
  863. vllm/model_executor/models/medusa.py +218 -0
  864. vllm/model_executor/models/mimo.py +191 -0
  865. vllm/model_executor/models/mimo_mtp.py +284 -0
  866. vllm/model_executor/models/minicpm.py +590 -0
  867. vllm/model_executor/models/minicpm3.py +229 -0
  868. vllm/model_executor/models/minicpmo.py +758 -0
  869. vllm/model_executor/models/minicpmv.py +1286 -0
  870. vllm/model_executor/models/minimax_cache.py +35 -0
  871. vllm/model_executor/models/minimax_text_01.py +1303 -0
  872. vllm/model_executor/models/minimax_vl_01.py +363 -0
  873. vllm/model_executor/models/mistral3.py +603 -0
  874. vllm/model_executor/models/mixtral.py +487 -0
  875. vllm/model_executor/models/mixtral_quant.py +452 -0
  876. vllm/model_executor/models/mllama.py +1623 -0
  877. vllm/model_executor/models/mllama4.py +838 -0
  878. vllm/model_executor/models/mlp_speculator.py +205 -0
  879. vllm/model_executor/models/modernbert.py +329 -0
  880. vllm/model_executor/models/module_mapping.py +71 -0
  881. vllm/model_executor/models/molmo.py +1567 -0
  882. vllm/model_executor/models/moonvit.py +629 -0
  883. vllm/model_executor/models/mpt.py +330 -0
  884. vllm/model_executor/models/nemotron.py +507 -0
  885. vllm/model_executor/models/nemotron_nas.py +483 -0
  886. vllm/model_executor/models/nvlm_d.py +215 -0
  887. vllm/model_executor/models/olmo.py +388 -0
  888. vllm/model_executor/models/olmo2.py +413 -0
  889. vllm/model_executor/models/olmoe.py +446 -0
  890. vllm/model_executor/models/opt.py +411 -0
  891. vllm/model_executor/models/orion.py +348 -0
  892. vllm/model_executor/models/ovis.py +554 -0
  893. vllm/model_executor/models/paligemma.py +397 -0
  894. vllm/model_executor/models/persimmon.py +343 -0
  895. vllm/model_executor/models/phi.py +355 -0
  896. vllm/model_executor/models/phi3.py +18 -0
  897. vllm/model_executor/models/phi3_small.py +464 -0
  898. vllm/model_executor/models/phi3v.py +722 -0
  899. vllm/model_executor/models/phi4mm.py +1245 -0
  900. vllm/model_executor/models/phi4mm_audio.py +1232 -0
  901. vllm/model_executor/models/phi4mm_utils.py +1883 -0
  902. vllm/model_executor/models/phimoe.py +664 -0
  903. vllm/model_executor/models/pixtral.py +1315 -0
  904. vllm/model_executor/models/plamo2.py +737 -0
  905. vllm/model_executor/models/prithvi_geospatial_mae.py +231 -0
  906. vllm/model_executor/models/qwen.py +361 -0
  907. vllm/model_executor/models/qwen2.py +567 -0
  908. vllm/model_executor/models/qwen2_5_omni_thinker.py +903 -0
  909. vllm/model_executor/models/qwen2_5_vl.py +1171 -0
  910. vllm/model_executor/models/qwen2_audio.py +409 -0
  911. vllm/model_executor/models/qwen2_moe.py +539 -0
  912. vllm/model_executor/models/qwen2_rm.py +131 -0
  913. vllm/model_executor/models/qwen2_vl.py +1410 -0
  914. vllm/model_executor/models/qwen3.py +320 -0
  915. vllm/model_executor/models/qwen3_moe.py +534 -0
  916. vllm/model_executor/models/qwen_vl.py +784 -0
  917. vllm/model_executor/models/registry.py +618 -0
  918. vllm/model_executor/models/roberta.py +273 -0
  919. vllm/model_executor/models/siglip.py +523 -0
  920. vllm/model_executor/models/skyworkr1v.py +950 -0
  921. vllm/model_executor/models/smolvlm.py +51 -0
  922. vllm/model_executor/models/solar.py +505 -0
  923. vllm/model_executor/models/stablelm.py +342 -0
  924. vllm/model_executor/models/starcoder2.py +355 -0
  925. vllm/model_executor/models/telechat2.py +139 -0
  926. vllm/model_executor/models/teleflm.py +78 -0
  927. vllm/model_executor/models/transformers.py +507 -0
  928. vllm/model_executor/models/ultravox.py +655 -0
  929. vllm/model_executor/models/utils.py +730 -0
  930. vllm/model_executor/models/vision.py +146 -0
  931. vllm/model_executor/models/whisper.py +746 -0
  932. vllm/model_executor/models/zamba2.py +1008 -0
  933. vllm/model_executor/parameter.py +458 -0
  934. vllm/model_executor/pooling_metadata.py +71 -0
  935. vllm/model_executor/sampling_metadata.py +596 -0
  936. vllm/model_executor/utils.py +53 -0
  937. vllm/multimodal/__init__.py +32 -0
  938. vllm/multimodal/audio.py +105 -0
  939. vllm/multimodal/base.py +218 -0
  940. vllm/multimodal/hasher.py +117 -0
  941. vllm/multimodal/image.py +96 -0
  942. vllm/multimodal/inputs.py +872 -0
  943. vllm/multimodal/parse.py +460 -0
  944. vllm/multimodal/processing.py +1894 -0
  945. vllm/multimodal/profiling.py +273 -0
  946. vllm/multimodal/registry.py +330 -0
  947. vllm/multimodal/utils.py +392 -0
  948. vllm/multimodal/video.py +197 -0
  949. vllm/outputs.py +525 -0
  950. vllm/platforms/__init__.py +290 -0
  951. vllm/platforms/cpu.py +205 -0
  952. vllm/platforms/cuda.py +461 -0
  953. vllm/platforms/hpu.py +105 -0
  954. vllm/platforms/interface.py +492 -0
  955. vllm/platforms/neuron.py +152 -0
  956. vllm/platforms/rocm.py +388 -0
  957. vllm/platforms/tpu.py +215 -0
  958. vllm/platforms/xpu.py +155 -0
  959. vllm/plugins/__init__.py +86 -0
  960. vllm/plugins/lora_resolvers/README.md +15 -0
  961. vllm/plugins/lora_resolvers/__init__.py +0 -0
  962. vllm/plugins/lora_resolvers/filesystem_resolver.py +49 -0
  963. vllm/pooling_params.py +53 -0
  964. vllm/profiler/__init__.py +0 -0
  965. vllm/profiler/layerwise_profile.py +374 -0
  966. vllm/profiler/utils.py +147 -0
  967. vllm/prompt_adapter/__init__.py +0 -0
  968. vllm/prompt_adapter/layers.py +82 -0
  969. vllm/prompt_adapter/models.py +357 -0
  970. vllm/prompt_adapter/request.py +36 -0
  971. vllm/prompt_adapter/utils.py +97 -0
  972. vllm/prompt_adapter/worker_manager.py +178 -0
  973. vllm/py.typed +2 -0
  974. vllm/reasoning/__init__.py +14 -0
  975. vllm/reasoning/abs_reasoning_parsers.py +191 -0
  976. vllm/reasoning/deepseek_r1_reasoning_parser.py +172 -0
  977. vllm/reasoning/granite_reasoning_parser.py +362 -0
  978. vllm/reasoning/qwen3_reasoning_parser.py +150 -0
  979. vllm/sampling_params.py +590 -0
  980. vllm/scalar_type.py +346 -0
  981. vllm/scripts.py +14 -0
  982. vllm/sequence.py +1567 -0
  983. vllm/spec_decode/__init__.py +0 -0
  984. vllm/spec_decode/batch_expansion.py +505 -0
  985. vllm/spec_decode/draft_model_runner.py +349 -0
  986. vllm/spec_decode/interfaces.py +98 -0
  987. vllm/spec_decode/medusa_worker.py +137 -0
  988. vllm/spec_decode/metrics.py +212 -0
  989. vllm/spec_decode/mlp_speculator_worker.py +93 -0
  990. vllm/spec_decode/mqa_scorer.py +159 -0
  991. vllm/spec_decode/multi_step_worker.py +422 -0
  992. vllm/spec_decode/ngram_worker.py +195 -0
  993. vllm/spec_decode/proposer_worker_base.py +58 -0
  994. vllm/spec_decode/smaller_tp_proposer_worker.py +195 -0
  995. vllm/spec_decode/spec_decode_worker.py +1325 -0
  996. vllm/spec_decode/target_model_runner.py +44 -0
  997. vllm/spec_decode/top1_proposer.py +274 -0
  998. vllm/spec_decode/util.py +276 -0
  999. vllm/test_utils.py +129 -0
  1000. vllm/third_party/__init__.py +0 -0
  1001. vllm/third_party/pynvml.py +6139 -0
  1002. vllm/tracing.py +130 -0
  1003. vllm/transformers_utils/__init__.py +23 -0
  1004. vllm/transformers_utils/chat_templates/__init__.py +4 -0
  1005. vllm/transformers_utils/chat_templates/registry.py +59 -0
  1006. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1007. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1008. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1009. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1010. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1011. vllm/transformers_utils/config.py +835 -0
  1012. vllm/transformers_utils/configs/__init__.py +58 -0
  1013. vllm/transformers_utils/configs/arctic.py +206 -0
  1014. vllm/transformers_utils/configs/chatglm.py +71 -0
  1015. vllm/transformers_utils/configs/cohere2.py +194 -0
  1016. vllm/transformers_utils/configs/dbrx.py +279 -0
  1017. vllm/transformers_utils/configs/deepseek_vl2.py +215 -0
  1018. vllm/transformers_utils/configs/eagle.py +84 -0
  1019. vllm/transformers_utils/configs/exaone.py +189 -0
  1020. vllm/transformers_utils/configs/falcon.py +89 -0
  1021. vllm/transformers_utils/configs/h2ovl.py +15 -0
  1022. vllm/transformers_utils/configs/internvl.py +53 -0
  1023. vllm/transformers_utils/configs/jais.py +237 -0
  1024. vllm/transformers_utils/configs/kimi_vl.py +36 -0
  1025. vllm/transformers_utils/configs/medusa.py +62 -0
  1026. vllm/transformers_utils/configs/minimax_text_01.py +69 -0
  1027. vllm/transformers_utils/configs/minimax_vl_01.py +70 -0
  1028. vllm/transformers_utils/configs/mllama.py +30 -0
  1029. vllm/transformers_utils/configs/mlp_speculator.py +67 -0
  1030. vllm/transformers_utils/configs/moonvit.py +32 -0
  1031. vllm/transformers_utils/configs/mpt.py +179 -0
  1032. vllm/transformers_utils/configs/nemotron.py +204 -0
  1033. vllm/transformers_utils/configs/nvlm_d.py +14 -0
  1034. vllm/transformers_utils/configs/ovis.py +183 -0
  1035. vllm/transformers_utils/configs/skyworkr1v.py +53 -0
  1036. vllm/transformers_utils/configs/solar.py +246 -0
  1037. vllm/transformers_utils/configs/telechat2.py +63 -0
  1038. vllm/transformers_utils/configs/ultravox.py +107 -0
  1039. vllm/transformers_utils/detokenizer.py +167 -0
  1040. vllm/transformers_utils/detokenizer_utils.py +188 -0
  1041. vllm/transformers_utils/processor.py +220 -0
  1042. vllm/transformers_utils/processors/__init__.py +7 -0
  1043. vllm/transformers_utils/processors/deepseek_vl2.py +362 -0
  1044. vllm/transformers_utils/processors/ovis.py +419 -0
  1045. vllm/transformers_utils/s3_utils.py +161 -0
  1046. vllm/transformers_utils/tokenizer.py +301 -0
  1047. vllm/transformers_utils/tokenizer_base.py +148 -0
  1048. vllm/transformers_utils/tokenizer_group.py +119 -0
  1049. vllm/transformers_utils/tokenizers/__init__.py +9 -0
  1050. vllm/transformers_utils/tokenizers/mistral.py +490 -0
  1051. vllm/transformers_utils/utils.py +98 -0
  1052. vllm/triton_utils/__init__.py +13 -0
  1053. vllm/triton_utils/importing.py +49 -0
  1054. vllm/usage/__init__.py +0 -0
  1055. vllm/usage/usage_lib.py +255 -0
  1056. vllm/utils.py +2844 -0
  1057. vllm/v1/__init__.py +0 -0
  1058. vllm/v1/attention/__init__.py +0 -0
  1059. vllm/v1/attention/backends/__init__.py +0 -0
  1060. vllm/v1/attention/backends/flash_attn.py +833 -0
  1061. vllm/v1/attention/backends/flashinfer.py +639 -0
  1062. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1063. vllm/v1/attention/backends/mla/common.py +926 -0
  1064. vllm/v1/attention/backends/mla/flashmla.py +150 -0
  1065. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +221 -0
  1066. vllm/v1/attention/backends/mla/triton_mla.py +118 -0
  1067. vllm/v1/attention/backends/pallas.py +235 -0
  1068. vllm/v1/attention/backends/triton_attn.py +279 -0
  1069. vllm/v1/attention/backends/utils.py +18 -0
  1070. vllm/v1/core/__init__.py +0 -0
  1071. vllm/v1/core/block_pool.py +328 -0
  1072. vllm/v1/core/encoder_cache_manager.py +149 -0
  1073. vllm/v1/core/kv_cache_manager.py +372 -0
  1074. vllm/v1/core/kv_cache_utils.py +748 -0
  1075. vllm/v1/core/sched/__init__.py +0 -0
  1076. vllm/v1/core/sched/interface.py +143 -0
  1077. vllm/v1/core/sched/output.py +153 -0
  1078. vllm/v1/core/sched/scheduler.py +1015 -0
  1079. vllm/v1/core/sched/utils.py +22 -0
  1080. vllm/v1/core/single_type_kv_cache_manager.py +358 -0
  1081. vllm/v1/engine/__init__.py +171 -0
  1082. vllm/v1/engine/async_llm.py +546 -0
  1083. vllm/v1/engine/core.py +801 -0
  1084. vllm/v1/engine/core_client.py +1020 -0
  1085. vllm/v1/engine/detokenizer.py +260 -0
  1086. vllm/v1/engine/exceptions.py +16 -0
  1087. vllm/v1/engine/llm_engine.py +316 -0
  1088. vllm/v1/engine/logprobs.py +198 -0
  1089. vllm/v1/engine/mm_input_cache.py +90 -0
  1090. vllm/v1/engine/output_processor.py +427 -0
  1091. vllm/v1/engine/parallel_sampling.py +132 -0
  1092. vllm/v1/engine/processor.py +398 -0
  1093. vllm/v1/executor/__init__.py +0 -0
  1094. vllm/v1/executor/abstract.py +112 -0
  1095. vllm/v1/executor/multiproc_executor.py +532 -0
  1096. vllm/v1/executor/ray_distributed_executor.py +61 -0
  1097. vllm/v1/kv_cache_interface.py +208 -0
  1098. vllm/v1/metrics/__init__.py +0 -0
  1099. vllm/v1/metrics/loggers.py +511 -0
  1100. vllm/v1/metrics/ray_wrappers.py +120 -0
  1101. vllm/v1/metrics/reader.py +245 -0
  1102. vllm/v1/metrics/stats.py +238 -0
  1103. vllm/v1/outputs.py +115 -0
  1104. vllm/v1/request.py +191 -0
  1105. vllm/v1/sample/__init__.py +0 -0
  1106. vllm/v1/sample/metadata.py +43 -0
  1107. vllm/v1/sample/ops/__init__.py +0 -0
  1108. vllm/v1/sample/ops/bad_words.py +38 -0
  1109. vllm/v1/sample/ops/penalties.py +58 -0
  1110. vllm/v1/sample/ops/topk_topp_sampler.py +292 -0
  1111. vllm/v1/sample/rejection_sampler.py +630 -0
  1112. vllm/v1/sample/sampler.py +270 -0
  1113. vllm/v1/sample/tpu/__init__.py +0 -0
  1114. vllm/v1/sample/tpu/metadata.py +123 -0
  1115. vllm/v1/sample/tpu/sampler.py +144 -0
  1116. vllm/v1/serial_utils.py +313 -0
  1117. vllm/v1/spec_decode/__init__.py +0 -0
  1118. vllm/v1/spec_decode/eagle.py +424 -0
  1119. vllm/v1/spec_decode/medusa.py +61 -0
  1120. vllm/v1/spec_decode/metadata.py +61 -0
  1121. vllm/v1/spec_decode/metrics.py +177 -0
  1122. vllm/v1/spec_decode/ngram_proposer.py +131 -0
  1123. vllm/v1/spec_decode/utils.py +45 -0
  1124. vllm/v1/structured_output/__init__.py +215 -0
  1125. vllm/v1/structured_output/backend_guidance.py +244 -0
  1126. vllm/v1/structured_output/backend_types.py +133 -0
  1127. vllm/v1/structured_output/backend_xgrammar.py +317 -0
  1128. vllm/v1/structured_output/request.py +85 -0
  1129. vllm/v1/structured_output/utils.py +174 -0
  1130. vllm/v1/utils.py +294 -0
  1131. vllm/v1/worker/__init__.py +0 -0
  1132. vllm/v1/worker/block_table.py +139 -0
  1133. vllm/v1/worker/gpu_input_batch.py +680 -0
  1134. vllm/v1/worker/gpu_model_runner.py +2084 -0
  1135. vllm/v1/worker/gpu_worker.py +373 -0
  1136. vllm/v1/worker/lora_model_runner_mixin.py +145 -0
  1137. vllm/v1/worker/tpu_model_runner.py +1510 -0
  1138. vllm/v1/worker/tpu_worker.py +276 -0
  1139. vllm/v1/worker/utils.py +74 -0
  1140. vllm/v1/worker/worker_base.py +64 -0
  1141. vllm/version.py +40 -0
  1142. vllm/vllm_flash_attn/.gitkeep +0 -0
  1143. vllm/worker/__init__.py +0 -0
  1144. vllm/worker/cache_engine.py +144 -0
  1145. vllm/worker/cpu_enc_dec_model_runner.py +326 -0
  1146. vllm/worker/cpu_model_runner.py +671 -0
  1147. vllm/worker/cpu_pooling_model_runner.py +125 -0
  1148. vllm/worker/cpu_worker.py +400 -0
  1149. vllm/worker/enc_dec_model_runner.py +555 -0
  1150. vllm/worker/hpu_model_runner.py +2319 -0
  1151. vllm/worker/hpu_worker.py +483 -0
  1152. vllm/worker/model_runner.py +2178 -0
  1153. vllm/worker/model_runner_base.py +281 -0
  1154. vllm/worker/multi_step_hpu_worker.py +122 -0
  1155. vllm/worker/multi_step_model_runner.py +910 -0
  1156. vllm/worker/multi_step_neuron_model_runner.py +84 -0
  1157. vllm/worker/multi_step_neuronx_distributed_model_runner.py +63 -0
  1158. vllm/worker/multi_step_tpu_worker.py +107 -0
  1159. vllm/worker/multi_step_worker.py +196 -0
  1160. vllm/worker/neuron_model_runner.py +418 -0
  1161. vllm/worker/neuron_worker.py +158 -0
  1162. vllm/worker/neuronx_distributed_model_runner.py +136 -0
  1163. vllm/worker/pooling_model_runner.py +211 -0
  1164. vllm/worker/tpu_model_runner.py +908 -0
  1165. vllm/worker/tpu_worker.py +336 -0
  1166. vllm/worker/utils.py +52 -0
  1167. vllm/worker/worker.py +574 -0
  1168. vllm/worker/worker_base.py +644 -0
  1169. vllm/worker/xpu_model_runner.py +606 -0
  1170. vllm/worker/xpu_worker.py +185 -0
  1171. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/METADATA +335 -0
  1172. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/RECORD +1175 -0
  1173. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/WHEEL +5 -0
  1174. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/entry_points.txt +5 -0
  1175. vllm_cpu_avx512bf16-0.9.0.post2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1890 @@
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Adapted from
+ # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
+ import json
+ import time
+ from http import HTTPStatus
+ from typing import Annotated, Any, ClassVar, Literal, Optional, Union
+
+ import regex as re
+ import torch
+ from fastapi import HTTPException, UploadFile
+ from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
+                       ValidationInfo, field_validator, model_validator)
+ from typing_extensions import TypeAlias
+
+ from vllm import envs
+ from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
+                                          random_tool_call_id)
+ from vllm.logger import init_logger
+ from vllm.pooling_params import PoolingParams
+ from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
+                                   RequestOutputKind, SamplingParams)
+ from vllm.sequence import Logprob
+ from vllm.utils import random_uuid, resolve_obj_by_qualname
+
+ logger = init_logger(__name__)
+
+ _LONG_INFO = torch.iinfo(torch.long)
+
+
+ class OpenAIBaseModel(BaseModel):
+     # OpenAI API does allow extra fields
+     model_config = ConfigDict(extra="allow")
+
+     # Cache class field names
+     field_names: ClassVar[Optional[set[str]]] = None
+
+     @model_validator(mode="wrap")
+     @classmethod
+     def __log_extra_fields__(cls, data, handler):
+         result = handler(data)
+         if not isinstance(data, dict):
+             return result
+         field_names = cls.field_names
+         if field_names is None:
+             # Get all class field names and their potential aliases
+             field_names = set()
+             for field_name, field in cls.model_fields.items():
+                 field_names.add(field_name)
+                 if alias := getattr(field, "alias", None):
+                     field_names.add(alias)
+             cls.field_names = field_names
+
+         # Compare against both field names and aliases
+         if any(k not in field_names for k in data):
+             logger.warning(
+                 "The following fields were present in the request "
+                 "but ignored: %s",
+                 data.keys() - field_names,
+             )
+         return result
+
+
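The wrap-mode validator above means every `OpenAIBaseModel` subclass accepts unknown request fields (because of `extra="allow"`) and logs them instead of rejecting the request. A minimal sketch of that behavior, using a hypothetical `Probe` subclass that is not part of this diff:

    # Hypothetical subclass, for illustration only.
    class Probe(OpenAIBaseModel):
        name: str

    probe = Probe(name="x", unexpected=1)  # validates; no error raised
    # logs approximately: "The following fields were present in the
    # request but ignored: {'unexpected'}"
    assert probe.name == "x"
    assert probe.unexpected == 1  # extra="allow" keeps the value on the model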
+ class ErrorResponse(OpenAIBaseModel):
+     object: str = "error"
+     message: str
+     type: str
+     param: Optional[str] = None
+     code: int
+
+
+ class ModelPermission(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
+     object: str = "model_permission"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     allow_create_engine: bool = False
+     allow_sampling: bool = True
+     allow_logprobs: bool = True
+     allow_search_indices: bool = False
+     allow_view: bool = True
+     allow_fine_tuning: bool = False
+     organization: str = "*"
+     group: Optional[str] = None
+     is_blocking: bool = False
+
+
+ class ModelCard(OpenAIBaseModel):
+     id: str
+     object: str = "model"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     owned_by: str = "vllm"
+     root: Optional[str] = None
+     parent: Optional[str] = None
+     max_model_len: Optional[int] = None
+     permission: list[ModelPermission] = Field(default_factory=list)
+
+
+ class ModelList(OpenAIBaseModel):
+     object: str = "list"
+     data: list[ModelCard] = Field(default_factory=list)
+
+
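`ModelCard` and `ModelList` are the shapes the server serializes when listing available models. A sketch of the resulting payload (identifier and timestamp hypothetical):

    models = ModelList(data=[ModelCard(id="my-model", max_model_len=4096)])
    print(models.model_dump_json())
    # {"object": "list", "data": [{"id": "my-model", "object": "model",
    #   "created": 1700000000, "owned_by": "vllm", "root": null,
    #   "parent": null, "max_model_len": 4096, "permission": []}]}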
+ class PromptTokenUsageInfo(OpenAIBaseModel):
+     cached_tokens: Optional[int] = None
+
+
+ class UsageInfo(OpenAIBaseModel):
+     prompt_tokens: int = 0
+     total_tokens: int = 0
+     completion_tokens: Optional[int] = 0
+     prompt_tokens_details: Optional[PromptTokenUsageInfo] = None
+
+
+ class RequestResponseMetadata(BaseModel):
+     request_id: str
+     final_usage_info: Optional[UsageInfo] = None
+
+
+ class JsonSchemaResponseFormat(OpenAIBaseModel):
+     name: str
+     description: Optional[str] = None
+     # schema is the field in openai but that causes conflicts with pydantic so
+     # instead use json_schema with an alias
+     json_schema: Optional[dict[str, Any]] = Field(default=None, alias='schema')
+     strict: Optional[bool] = None
+
+
+ class StructuralTag(OpenAIBaseModel):
+     begin: str
+     # schema is the field, but that causes conflicts with pydantic so
+     # instead use structural_tag_schema with an alias
+     structural_tag_schema: Optional[dict[str, Any]] = Field(default=None,
+                                                             alias="schema")
+     end: str
+
+
+ class StructuralTagResponseFormat(OpenAIBaseModel):
+     type: Literal["structural_tag"]
+     structures: list[StructuralTag]
+     triggers: list[str]
+
+
+ class ResponseFormat(OpenAIBaseModel):
+     # type must be "json_schema", "json_object", or "text"
+     type: Literal["text", "json_object", "json_schema"]
+     json_schema: Optional[JsonSchemaResponseFormat] = None
+
+
+ AnyResponseFormat = Union[ResponseFormat, StructuralTagResponseFormat]
+
+
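Both `JsonSchemaResponseFormat` and `StructuralTag` use a pydantic alias because the wire key `schema` clashes with pydantic's own namespace, as the comments above note. A small sketch of the round trip through the alias:

    rf = JsonSchemaResponseFormat(name="answer", schema={"type": "object"})
    assert rf.json_schema == {"type": "object"}      # Python-side attribute
    assert "schema" in rf.model_dump(by_alias=True)  # wire-side key restored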
+ class StreamOptions(OpenAIBaseModel):
+     include_usage: Optional[bool] = True
+     continuous_usage_stats: Optional[bool] = False
+
+
+ class FunctionDefinition(OpenAIBaseModel):
+     name: str
+     description: Optional[str] = None
+     parameters: Optional[dict[str, Any]] = None
+
+
+ class ChatCompletionToolsParam(OpenAIBaseModel):
+     type: Literal["function"] = "function"
+     function: FunctionDefinition
+
+
+ class ChatCompletionNamedFunction(OpenAIBaseModel):
+     name: str
+
+
+ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
+     function: ChatCompletionNamedFunction
+     type: Literal["function"] = "function"
+
+
+ class LogitsProcessorConstructor(BaseModel):
+     qualname: str
+     args: Optional[list[Any]] = None
+     kwargs: Optional[dict[str, Any]] = None
+
+
+ LogitsProcessors = list[Union[str, LogitsProcessorConstructor]]
+
+
+ def get_logits_processors(processors: Optional[LogitsProcessors],
+                           pattern: Optional[str]) -> Optional[list[Any]]:
+     if processors and pattern:
+         logits_processors = []
+         for processor in processors:
+             qualname = processor if isinstance(processor,
+                                                str) else processor.qualname
+             if not re.match(pattern, qualname):
+                 raise ValueError(
+                     f"Logits processor '{qualname}' is not allowed by this "
+                     "server. See --logits-processor-pattern engine argument "
+                     "for more information.")
+             try:
+                 logits_processor = resolve_obj_by_qualname(qualname)
+             except Exception as e:
+                 raise ValueError(
+                     f"Logits processor '{qualname}' could not be resolved: {e}"
+                 ) from e
+             if isinstance(processor, LogitsProcessorConstructor):
+                 logits_processor = logits_processor(*processor.args or [],
+                                                     **processor.kwargs or {})
+             logits_processors.append(logits_processor)
+         return logits_processors
+     elif processors:
+         raise ValueError(
+             "The `logits_processors` argument is not supported by this "
+             "server. See --logits-processor-pattern engine argument "
+             "for more information.")
+     return None
+
+
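`get_logits_processors` resolves each entry by qualified name, but only when the server operator has allow-listed a pattern via `--logits-processor-pattern`. A hedged usage sketch, assuming a hypothetical `my_pkg` module is importable on the server:

    procs = get_logits_processors(
        processors=[
            "my_pkg.clamp_logits",  # bare qualname: resolved, used as-is
            LogitsProcessorConstructor(qualname="my_pkg.scale_logits",
                                       args=[0.5]),  # factory: called with args
        ],
        pattern=r"my_pkg\.",
    )
    # Any qualname not matching the pattern raises ValueError instead.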
+ class ChatCompletionRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/chat/create
+     messages: list[ChatCompletionMessageParam]
+     model: Optional[str] = None
+     frequency_penalty: Optional[float] = 0.0
+     logit_bias: Optional[dict[str, float]] = None
+     logprobs: Optional[bool] = False
+     top_logprobs: Optional[int] = 0
+     # TODO(#9845): remove max_tokens when field is removed from OpenAI API
+     max_tokens: Optional[int] = Field(
+         default=None,
+         deprecated=
+         'max_tokens is deprecated in favor of the max_completion_tokens field')
+     max_completion_tokens: Optional[int] = None
+     n: Optional[int] = 1
+     presence_penalty: Optional[float] = 0.0
+     response_format: Optional[AnyResponseFormat] = None
+     seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+     stop: Optional[Union[str, list[str]]] = Field(default_factory=list)
+     stream: Optional[bool] = False
+     stream_options: Optional[StreamOptions] = None
+     temperature: Optional[float] = None
+     top_p: Optional[float] = None
+     tools: Optional[list[ChatCompletionToolsParam]] = None
+     tool_choice: Optional[Union[
+         Literal["none"],
+         Literal["auto"],
+         Literal["required"],
+         ChatCompletionNamedToolChoiceParam,
+     ]] = "none"
+
+     # NOTE this will be ignored by vLLM -- the model determines the behavior
+     parallel_tool_calls: Optional[bool] = False
+     user: Optional[str] = None
+
+     # --8<-- [start:chat-completion-sampling-params]
+     best_of: Optional[int] = None
+     use_beam_search: bool = False
+     top_k: Optional[int] = None
+     min_p: Optional[float] = None
+     repetition_penalty: Optional[float] = None
+     length_penalty: float = 1.0
+     stop_token_ids: Optional[list[int]] = Field(default_factory=list)
+     include_stop_str_in_output: bool = False
+     ignore_eos: bool = False
+     min_tokens: int = 0
+     skip_special_tokens: bool = True
+     spaces_between_special_tokens: bool = True
+     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
+     prompt_logprobs: Optional[int] = None
+     # --8<-- [end:chat-completion-sampling-params]
+
+     # --8<-- [start:chat-completion-extra-params]
+     echo: bool = Field(
+         default=False,
+         description=(
+             "If true, the new message will be prepended with the last message "
+             "if they belong to the same role."),
+     )
+     add_generation_prompt: bool = Field(
+         default=True,
+         description=
+         ("If true, the generation prompt will be added to the chat template. "
+          "This is a parameter used by chat template in tokenizer config of the "
+          "model."),
+     )
+     continue_final_message: bool = Field(
+         default=False,
+         description=
+         ("If this is set, the chat will be formatted so that the final "
+          "message in the chat is open-ended, without any EOS tokens. The "
+          "model will continue this message rather than starting a new one. "
+          "This allows you to \"prefill\" part of the model's response for it. "
+          "Cannot be used at the same time as `add_generation_prompt`."),
+     )
+     add_special_tokens: bool = Field(
+         default=False,
+         description=(
+             "If true, special tokens (e.g. BOS) will be added to the prompt "
+             "on top of what is added by the chat template. "
+             "For most models, the chat template takes care of adding the "
+             "special tokens so this should be set to false (as is the "
+             "default)."),
+     )
+     documents: Optional[list[dict[str, str]]] = Field(
+         default=None,
+         description=
+         ("A list of dicts representing documents that will be accessible to "
+          "the model if it is performing RAG (retrieval-augmented generation)."
+          " If the template does not support RAG, this argument will have no "
+          "effect. We recommend that each document should be a dict containing "
+          "\"title\" and \"text\" keys."),
+     )
+     chat_template: Optional[str] = Field(
+         default=None,
+         description=(
+             "A Jinja template to use for this conversion. "
+             "As of transformers v4.44, default chat template is no longer "
+             "allowed, so you must provide a chat template if the tokenizer "
+             "does not define one."),
+     )
+     chat_template_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=("Additional kwargs to pass to the template renderer. "
+                      "Will be accessible by the chat template."),
+     )
+     mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     guided_json: Optional[Union[str, dict, BaseModel]] = Field(
+         default=None,
+         description=("If specified, the output will follow the JSON schema."),
+     )
+     guided_regex: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the output will follow the regex pattern."),
+     )
+     guided_choice: Optional[list[str]] = Field(
+         default=None,
+         description=(
+             "If specified, the output will be exactly one of the choices."),
+     )
+     guided_grammar: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the output will follow the context free grammar."),
+     )
+     structural_tag: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the output will follow the structural tag schema."),
+     )
+     guided_decoding_backend: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, will override the default guided decoding backend "
+             "of the server for this specific request. If set, must be either "
+             "'outlines' / 'lm-format-enforcer'"),
+     )
+     guided_whitespace_pattern: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, will override the default whitespace pattern "
+             "for guided json decoding."),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+     request_id: str = Field(
+         default_factory=lambda: f"{random_uuid()}",
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the response."),
+     )
+     logits_processors: Optional[LogitsProcessors] = Field(
+         default=None,
+         description=(
+             "A list of either qualified names of logits processors, or "
+             "constructor objects, to apply when sampling. A constructor is "
+             "a JSON object with a required 'qualname' field specifying the "
+             "qualified name of the processor class/factory, and optional "
+             "'args' and 'kwargs' fields containing positional and keyword "
+             "arguments. For example: {'qualname': "
+             "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
+             "{'param': 'value'}}."))
+     return_tokens_as_token_ids: Optional[bool] = Field(
+         default=None,
+         description=(
+             "If specified with 'logprobs', tokens are represented "
+             "as strings of the form 'token_id:{token_id}' so that tokens "
+             "that are not JSON-encodable can be identified."))
+     cache_salt: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the prefix cache will be salted with the provided "
+             "string to prevent an attacker from guessing prompts in multi-user "
+             "environments. The salt should be random, protected from "
+             "access by 3rd parties, and long enough to be "
+             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+             "to 256 bit). Not supported by vLLM engine V0."))
+     kv_transfer_params: Optional[dict[str, Any]] = Field(
+         default=None,
+         description="KVTransfer parameters used for disaggregated serving.")
+
+     # --8<-- [end:chat-completion-extra-params]
+
+     # Default sampling parameters for chat completion requests
+     _DEFAULT_SAMPLING_PARAMS: dict = {
+         "repetition_penalty": 1.0,
+         "temperature": 1.0,
+         "top_p": 1.0,
+         "top_k": 0,
+         "min_p": 0.0,
+     }
+
+     def to_beam_search_params(
+             self,
+             default_max_tokens: int,
+             default_sampling_params: Optional[dict] = None
+     ) -> BeamSearchParams:
+         # TODO(#9845): remove max_tokens when field is removed from OpenAI API
+         max_tokens = self.max_completion_tokens or self.max_tokens
+
+         if default_sampling_params is None:
+             default_sampling_params = {}
+         n = self.n if self.n is not None else 1
+
+         # Use minimum of context window, user request & server limit.
+         max_tokens = min(
+             val for val in (default_max_tokens, max_tokens,
+                             default_sampling_params.get("max_tokens", None))
+             if val is not None)
+
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+
+         return BeamSearchParams(
+             beam_width=n,
+             max_tokens=max_tokens,
+             ignore_eos=self.ignore_eos,
+             temperature=temperature,
+             length_penalty=self.length_penalty,
+             include_stop_str_in_output=self.include_stop_str_in_output,
+         )
+
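The defaulting chain in `to_beam_search_params` above (and in `to_sampling_params` below) runs: explicit request value, then server-level `default_sampling_params`, then the class-level `_DEFAULT_SAMPLING_PARAMS`, while `max_tokens` takes the minimum of every limit actually set. A rough worked example (values hypothetical):

    req = ChatCompletionRequest(
        messages=[{"role": "user", "content": "hi"}],
        max_completion_tokens=512)  # temperature deliberately left unset
    bsp = req.to_beam_search_params(
        default_max_tokens=256,  # e.g. context window minus prompt length
        default_sampling_params={"temperature": 0.7})
    # bsp.max_tokens == min(256, 512) == 256
    # bsp.temperature == 0.7 (server default); without a server default it
    # would fall back to _DEFAULT_SAMPLING_PARAMS["temperature"] == 1.0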
+     def to_sampling_params(
+         self,
+         default_max_tokens: int,
+         logits_processor_pattern: Optional[str],
+         default_sampling_params: Optional[dict] = None,
+     ) -> SamplingParams:
+         # TODO(#9845): remove max_tokens when field is removed from OpenAI API
+         max_tokens = self.max_completion_tokens or self.max_tokens
+
+         if default_sampling_params is None:
+             default_sampling_params = {}
+
+         # Use minimum of context window, user request & server limit.
+         max_tokens = min(
+             val for val in (default_max_tokens, max_tokens,
+                             default_sampling_params.get("max_tokens", None))
+             if val is not None)
+
+         # Default parameters
+         if (repetition_penalty := self.repetition_penalty) is None:
+             repetition_penalty = default_sampling_params.get(
+                 "repetition_penalty",
+                 self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+             )
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+         if (top_p := self.top_p) is None:
+             top_p = default_sampling_params.get(
+                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+         if (top_k := self.top_k) is None:
+             top_k = default_sampling_params.get(
+                 "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+         if (min_p := self.min_p) is None:
+             min_p = default_sampling_params.get(
+                 "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+
+         prompt_logprobs = self.prompt_logprobs
+         if prompt_logprobs is None and self.echo:
+             prompt_logprobs = self.top_logprobs
+
+         guided_json_object = None
+         if self.response_format is not None:
+             if self.response_format.type == "json_object":
+                 guided_json_object = True
+             elif self.response_format.type == "json_schema":
+                 json_schema = self.response_format.json_schema
+                 assert json_schema is not None
+                 self.guided_json = json_schema.json_schema
+             elif self.response_format.type == "structural_tag":
+                 structural_tag = self.response_format
+                 assert structural_tag is not None and isinstance(
+                     structural_tag, StructuralTagResponseFormat)
+                 s_tag_obj = structural_tag.model_dump(by_alias=True)
+                 self.structural_tag = json.dumps(s_tag_obj)
+
+         guided_decoding = GuidedDecodingParams.from_optional(
+             json=self._get_guided_json_from_tool() or self.guided_json,
+             regex=self.guided_regex,
+             choice=self.guided_choice,
+             grammar=self.guided_grammar,
+             json_object=guided_json_object,
+             backend=self.guided_decoding_backend,
+             whitespace_pattern=self.guided_whitespace_pattern,
+             structural_tag=self.structural_tag,
+         )
+
+         return SamplingParams.from_optional(
+             n=self.n,
+             best_of=self.best_of,
+             presence_penalty=self.presence_penalty,
+             frequency_penalty=self.frequency_penalty,
+             repetition_penalty=repetition_penalty,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             min_p=min_p,
+             seed=self.seed,
+             stop=self.stop,
+             stop_token_ids=self.stop_token_ids,
+             logprobs=self.top_logprobs if self.logprobs else None,
+             prompt_logprobs=prompt_logprobs,
+             ignore_eos=self.ignore_eos,
+             max_tokens=max_tokens,
+             min_tokens=self.min_tokens,
+             skip_special_tokens=self.skip_special_tokens,
+             spaces_between_special_tokens=self.spaces_between_special_tokens,
+             logits_processors=get_logits_processors(self.logits_processors,
+                                                     logits_processor_pattern),
+             include_stop_str_in_output=self.include_stop_str_in_output,
+             truncate_prompt_tokens=self.truncate_prompt_tokens,
+             output_kind=RequestOutputKind.DELTA if self.stream \
+             else RequestOutputKind.FINAL_ONLY,
+             guided_decoding=guided_decoding,
+             logit_bias=self.logit_bias,
+             extra_args=({"kv_transfer_params": self.kv_transfer_params}
+                         if self.kv_transfer_params else None))
+
550
+ def _get_guided_json_from_tool(
551
+ self) -> Optional[Union[str, dict, BaseModel]]:
552
+ # user has chosen to not use any tool
553
+ if self.tool_choice == "none" or self.tools is None:
554
+ return None
555
+
556
+ # user has chosen to use a named tool
557
+ if type(self.tool_choice) is ChatCompletionNamedToolChoiceParam:
558
+ tool_name = self.tool_choice.function.name
559
+ tools = {tool.function.name: tool.function for tool in self.tools}
560
+ if tool_name not in tools:
561
+ raise ValueError(
562
+ f"Tool '{tool_name}' has not been passed in `tools`.")
563
+ tool = tools[tool_name]
564
+ return tool.parameters
565
+
566
+ if self.tool_choice == "required":
567
+ # Pydantic schema generation cannot be used since the JSON schema
568
+ # has to be constructed for a specific instantiation of a tool list
569
+ # so that parameters of a function are correctly generated
570
+ # based on the chosen function name
571
+ def get_tool_schema(tool: ChatCompletionToolsParam) -> dict:
572
+ return {
573
+ "properties": {
574
+ "name": {
575
+ "type": "string",
576
+ "enum": [tool.function.name]
577
+ },
578
+ # parameters are always generated as '{}' in the final
579
+ # output if they are missing from the request
580
+ # (i.e. are None or '{}') so the schema is
581
+ # updated to produce an empty object in that case
582
+ "parameters": tool.function.parameters
583
+ if tool.function.parameters else {
584
+ "type": "object",
585
+ "properties": {}
586
+ }
587
+ },
588
+ "required": ["name", "parameters"]
589
+ }
590
+
591
+ json_schema = {
592
+ "type": "array",
593
+ "minItems": 1,
594
+ "items": {
595
+ "type": "object",
596
+ "anyOf": [get_tool_schema(tool) for tool in self.tools]
597
+ }
598
+ }
599
+ return json_schema
600
+
601
+ return None
602
+
603
+ @model_validator(mode="before")
604
+ @classmethod
605
+ def validate_stream_options(cls, data):
606
+ if data.get("stream_options") and not data.get("stream"):
607
+ raise ValueError(
608
+ "Stream options can only be defined when `stream=True`.")
609
+
610
+ return data
611
+
612
+ @model_validator(mode="before")
613
+ @classmethod
614
+ def check_logprobs(cls, data):
615
+ if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
616
+ if data.get("stream") and prompt_logprobs > 0:
617
+ raise ValueError(
618
+ "`prompt_logprobs` are not available when `stream=True`.")
619
+
620
+            if prompt_logprobs < 0:
+                raise ValueError(
+                    "`prompt_logprobs` must be a non-negative value.")
+
+        if (top_logprobs := data.get("top_logprobs")) is not None:
+            if top_logprobs < 0:
+                raise ValueError(
+                    "`top_logprobs` must be a non-negative value.")
+
+            if top_logprobs > 0 and not data.get("logprobs"):
+                raise ValueError(
+                    "when using `top_logprobs`, `logprobs` must be set to true."
+                )
+
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_guided_decoding_count(cls, data):
+        if isinstance(data, ValueError):
+            raise data
+
+        guide_count = sum([
+            "guided_json" in data and data["guided_json"] is not None,
+            "guided_regex" in data and data["guided_regex"] is not None,
+            "guided_choice" in data and data["guided_choice"] is not None
+        ])
+        # you can only use one kind of guided decoding
+        if guide_count > 1:
+            raise ValueError(
+                "You can only use one kind of guided decoding "
+                "('guided_json', 'guided_regex' or 'guided_choice').")
+        # you can only either use guided decoding or tools, not both
+        # (note: with `> 1` this check was unreachable, since the previous
+        # check already raises for more than one guided-decoding mode)
+        if guide_count > 0 and data.get("tool_choice", "none") not in (
+                "none",
+                "auto",
+                "required",
+        ):
+            raise ValueError(
+                "You can only either use guided decoding or tools, not both.")
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_tool_usage(cls, data):
+
+        # if "tool_choice" is not specified but tools are provided,
+        # default to "auto" tool_choice
+        if "tool_choice" not in data and data.get("tools"):
+            data["tool_choice"] = "auto"
+
+        # if "tool_choice" is "none" -- ignore tools if present
+        if "tool_choice" in data and data["tool_choice"] == "none":
+            # ensure that no tools are present
+            data.pop("tools", None)
+            return data
+
+        # if "tool_choice" is specified -- validation
+        if "tool_choice" in data:
+
+            # ensure that if "tool_choice" is specified, tools are present
+            if "tools" not in data or data["tools"] is None:
+                raise ValueError(
+                    "When using `tool_choice`, `tools` must be set.")
+
+            # make sure that tool choice is either a named tool
+            # OR that it's set to "auto" or "required"
+            if data["tool_choice"] not in [
+                    "auto", "required"
+            ] and not isinstance(data["tool_choice"], dict):
+                raise NotImplementedError(
+                    f'Invalid value for `tool_choice`: {data["tool_choice"]}! '\
+                    'Only named tools, "none", "auto" or "required" '\
+                    'are supported.'
+                )
+
+            # ensure that if "tool_choice" is specified as an object,
+            # it matches a valid tool
+            if isinstance(data["tool_choice"], dict):
+                valid_tool = False
+                specified_function = data["tool_choice"].get("function")
+                if not specified_function:
+                    raise ValueError(
+                        "Expected field `function` in `tool_choice`."
+                        " Correct usage: `{\"type\": \"function\","
+                        " \"function\": {\"name\": \"my_function\"}}`")
+                specified_function_name = specified_function.get("name")
+                if not specified_function_name:
+                    raise ValueError(
+                        "Expected field `name` in `function` in `tool_choice`. "
+                        "Correct usage: `{\"type\": \"function\", "
+                        "\"function\": {\"name\": \"my_function\"}}`")
+ for tool in data["tools"]:
711
+ if tool["function"]["name"] == specified_function_name:
712
+ valid_tool = True
713
+ break
714
+ if not valid_tool:
715
+ raise ValueError(
716
+ "The tool specified in `tool_choice` does not match any"
717
+ " of the specified `tools`")
718
+ return data
719
+
720
+ @model_validator(mode="before")
721
+ @classmethod
722
+ def check_generation_prompt(cls, data):
723
+ if data.get("continue_final_message") and data.get(
724
+ "add_generation_prompt"):
725
+ raise ValueError("Cannot set both `continue_final_message` and "
726
+ "`add_generation_prompt` to True.")
727
+ return data
728
+
729
+ @model_validator(mode="before")
730
+ @classmethod
731
+ def check_cache_salt_support(cls, data):
732
+ if data.get("cache_salt") is not None:
733
+ if not envs.VLLM_USE_V1:
734
+ raise ValueError(
735
+ "Parameter 'cache_salt' is not supported with "
736
+ "this instance of vLLM, which uses engine V0.")
737
+ if not isinstance(data["cache_salt"],
738
+ str) or not data["cache_salt"]:
739
+ raise ValueError("Parameter 'cache_salt' must be a "
740
+ "non-empty string if provided.")
741
+ return data
742
+
743
+
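The request model above resolves each sampling field in three tiers: an explicit per-request value wins, otherwise the server-supplied `default_sampling_params` (typically taken from the model's generation config) is consulted, and `_DEFAULT_SAMPLING_PARAMS` is the final fallback. A minimal sketch of that resolution, assuming the module imports as `vllm.entrypoints.openai.protocol`; the model name and default values are invented for illustration:

```python
from vllm.entrypoints.openai.protocol import ChatCompletionRequest

req = ChatCompletionRequest.model_validate({
    "model": "my-model",  # hypothetical model name
    "messages": [{"role": "user", "content": "Hello"}],
    "top_p": 0.9,  # explicit per-request value
})

params = req.to_sampling_params(
    default_max_tokens=512,  # typically derived from the context window
    logits_processor_pattern=None,
    default_sampling_params={"temperature": 0.6},  # invented server default
)
assert params.top_p == 0.9  # the request value wins
assert params.temperature == 0.6  # the server default fills the gap
assert params.max_tokens == 512  # min of the remaining limits
```
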
+class CompletionRequest(OpenAIBaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/completions/create
+    model: Optional[str] = None
+    prompt: Optional[Union[list[int], list[list[int]], str, list[str]]] = None
+    prompt_embeds: Optional[Union[bytes, list[bytes]]] = None
+    best_of: Optional[int] = None
+    echo: Optional[bool] = False
+    frequency_penalty: Optional[float] = 0.0
+    logit_bias: Optional[dict[str, float]] = None
+    logprobs: Optional[int] = None
+    max_tokens: Optional[int] = 16
+    n: int = 1
+    presence_penalty: Optional[float] = 0.0
+    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    stop: Optional[Union[str, list[str]]] = Field(default_factory=list)
+    stream: Optional[bool] = False
+    stream_options: Optional[StreamOptions] = None
+    suffix: Optional[str] = None
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
+    user: Optional[str] = None
+
+    # --8<-- [start:completion-sampling-params]
+    use_beam_search: bool = False
+    top_k: Optional[int] = None
+    min_p: Optional[float] = None
+    repetition_penalty: Optional[float] = None
+    length_penalty: float = 1.0
+    stop_token_ids: Optional[list[int]] = Field(default_factory=list)
+    include_stop_str_in_output: bool = False
+    ignore_eos: bool = False
+    min_tokens: int = 0
+    skip_special_tokens: bool = True
+    spaces_between_special_tokens: bool = True
+    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
+    allowed_token_ids: Optional[list[int]] = None
+    prompt_logprobs: Optional[int] = None
+    # --8<-- [end:completion-sampling-params]
+
+    # --8<-- [start:completion-extra-params]
+    add_special_tokens: bool = Field(
+        default=True,
+        description=(
+            "If true (the default), special tokens (e.g. BOS) will be added to "
+            "the prompt."),
+    )
+    response_format: Optional[AnyResponseFormat] = Field(
+        default=None,
+        description=(
+            "Similar to chat completion, this parameter specifies the format "
+            "of output. Only {'type': 'json_object'}, {'type': 'json_schema'}"
+            ", {'type': 'structural_tag'}, or {'type': 'text'} is supported."
+        ),
+    )
+    guided_json: Optional[Union[str, dict, BaseModel]] = Field(
+        default=None,
+        description="If specified, the output will follow the JSON schema.",
+    )
+    guided_regex: Optional[str] = Field(
+        default=None,
+        description=(
+            "If specified, the output will follow the regex pattern."),
+    )
+    guided_choice: Optional[list[str]] = Field(
+        default=None,
+        description=(
+            "If specified, the output will be exactly one of the choices."),
+    )
+    guided_grammar: Optional[str] = Field(
+        default=None,
+        description=(
+            "If specified, the output will follow the context free grammar."),
+    )
+    guided_decoding_backend: Optional[str] = Field(
+        default=None,
+        description=(
+            "If specified, will override the default guided decoding backend "
+            "of the server for this specific request. If set, must be one of "
+            "'outlines' / 'lm-format-enforcer'"),
+    )
+    guided_whitespace_pattern: Optional[str] = Field(
+        default=None,
+        description=(
+            "If specified, will override the default whitespace pattern "
+            "for guided json decoding."),
+    )
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."),
+    )
+    logits_processors: Optional[LogitsProcessors] = Field(
+        default=None,
+        description=(
+            "A list of either qualified names of logits processors, or "
+            "constructor objects, to apply when sampling. A constructor is "
+            "a JSON object with a required 'qualname' field specifying the "
+            "qualified name of the processor class/factory, and optional "
+            "'args' and 'kwargs' fields containing positional and keyword "
+            "arguments. For example: {'qualname': "
+            "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
+            "{'param': 'value'}}."))
+
+    return_tokens_as_token_ids: Optional[bool] = Field(
+        default=None,
+        description=(
+            "If specified with 'logprobs', tokens are represented "
+ " as strings of the form 'token_id:{token_id}' so that tokens "
855
+ "that are not JSON-encodable can be identified."))
856
+
+    kv_transfer_params: Optional[dict[str, Any]] = Field(
+        default=None,
+        description="KVTransfer parameters used for disaggregated serving.")
+
+    # --8<-- [end:completion-extra-params]
+
+    # Default sampling parameters for completion requests
+    _DEFAULT_SAMPLING_PARAMS: dict = {
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": 0,
+        "min_p": 0.0,
+    }
+
+    def to_beam_search_params(
+            self,
+            default_max_tokens: int,
+            default_sampling_params: Optional[dict] = None
+    ) -> BeamSearchParams:
+        max_tokens = self.max_tokens
+
+        if default_sampling_params is None:
+            default_sampling_params = {}
+        n = self.n if self.n is not None else 1
+
+        # Use minimum of context window, user request & server limit.
+        max_tokens = min(
+            val for val in (default_max_tokens, max_tokens,
+                            default_sampling_params.get("max_tokens", None))
+            if val is not None)
+
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get("temperature", 1.0)
+
+        return BeamSearchParams(
+            beam_width=n,
+            max_tokens=max_tokens,
+            ignore_eos=self.ignore_eos,
+            temperature=temperature,
+            length_penalty=self.length_penalty,
+            include_stop_str_in_output=self.include_stop_str_in_output,
+        )
+
+    def to_sampling_params(
+            self,
+            default_max_tokens: int,
+            logits_processor_pattern: Optional[str],
+            default_sampling_params: Optional[dict] = None,
+    ) -> SamplingParams:
+        max_tokens = self.max_tokens
+
+        if default_sampling_params is None:
+            default_sampling_params = {}
+
+        # Use minimum of context window, user request & server limit.
+        max_tokens = min(
+            val for val in (default_max_tokens, max_tokens,
+                            default_sampling_params.get("max_tokens", None))
+            if val is not None)
+
+        # Default parameters
+        if (repetition_penalty := self.repetition_penalty) is None:
+            repetition_penalty = default_sampling_params.get(
+                "repetition_penalty",
+                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+            )
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+        if (top_p := self.top_p) is None:
+            top_p = default_sampling_params.get(
+                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+        if (top_k := self.top_k) is None:
+            top_k = default_sampling_params.get(
+                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+        if (min_p := self.min_p) is None:
+            min_p = default_sampling_params.get(
+                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+
+        prompt_logprobs = self.prompt_logprobs
+        if prompt_logprobs is None and self.echo:
+            prompt_logprobs = self.logprobs
+
+        echo_without_generation = self.echo and self.max_tokens == 0
+
+        guided_json_object = None
+        if (self.response_format is not None
+                and self.response_format.type == "json_object"):
+            guided_json_object = True
+
+        guided_decoding = GuidedDecodingParams.from_optional(
+            json=self.guided_json,
+            regex=self.guided_regex,
+            choice=self.guided_choice,
+            grammar=self.guided_grammar,
+            json_object=guided_json_object,
+            backend=self.guided_decoding_backend,
+            whitespace_pattern=self.guided_whitespace_pattern,
+        )
+
+        return SamplingParams.from_optional(
+            n=self.n,
+            best_of=self.best_of,
+            presence_penalty=self.presence_penalty,
+            frequency_penalty=self.frequency_penalty,
+            repetition_penalty=repetition_penalty,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            seed=self.seed,
+            stop=self.stop,
+            stop_token_ids=self.stop_token_ids,
+            logprobs=self.logprobs,
+            ignore_eos=self.ignore_eos,
+            max_tokens=max_tokens if not echo_without_generation else 1,
+            min_tokens=self.min_tokens,
+            prompt_logprobs=prompt_logprobs,
+            skip_special_tokens=self.skip_special_tokens,
+            spaces_between_special_tokens=self.spaces_between_special_tokens,
+            include_stop_str_in_output=self.include_stop_str_in_output,
+            logits_processors=get_logits_processors(self.logits_processors,
+                                                    logits_processor_pattern),
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            output_kind=RequestOutputKind.DELTA if self.stream \
+                else RequestOutputKind.FINAL_ONLY,
+            guided_decoding=guided_decoding,
+            logit_bias=self.logit_bias,
+            allowed_token_ids=self.allowed_token_ids,
+            extra_args=({"kv_transfer_params": self.kv_transfer_params}
+                        if self.kv_transfer_params else None))
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_guided_decoding_count(cls, data):
+        guide_count = sum([
+            "guided_json" in data and data["guided_json"] is not None,
+            "guided_regex" in data and data["guided_regex"] is not None,
+            "guided_choice" in data and data["guided_choice"] is not None
+        ])
+        if guide_count > 1:
+            raise ValueError(
+                "You can only use one kind of guided decoding "
+                "('guided_json', 'guided_regex' or 'guided_choice').")
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_logprobs(cls, data):
+        if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
+            if data.get("stream") and prompt_logprobs > 0:
+                raise ValueError(
+                    "`prompt_logprobs` are not available when `stream=True`.")
+
+            if prompt_logprobs < 0:
+                raise ValueError(
+                    "`prompt_logprobs` must be a non-negative value.")
+
+        if (logprobs := data.get("logprobs")) is not None and logprobs < 0:
+            raise ValueError("`logprobs` must be a non-negative value.")
+
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_stream_options(cls, data):
+        if data.get("stream_options") and not data.get("stream"):
+            raise ValueError(
+                "Stream options can only be defined when `stream=True`.")
+
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_prompt_and_prompt_embeds(cls, data):
+        if data.get("prompt") is None and data.get("prompt_embeds") is None:
+            raise ValueError(
+                "At least one of `prompt` or `prompt_embeds` must be set.")
+        return data
+
+
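The validators above are easiest to see end to end on requests that violate them. A small sketch, assuming the same import path as before; the prompt and patterns are invented:

```python
import pydantic

from vllm.entrypoints.openai.protocol import CompletionRequest

# Combining two guided-decoding modes trips check_guided_decoding_count.
try:
    CompletionRequest.model_validate({
        "prompt": "Hello",
        "guided_json": {"type": "object"},
        "guided_regex": r"\d+",
    })
except pydantic.ValidationError as err:
    print(err)  # You can only use one kind of guided decoding ...

# Omitting both `prompt` and `prompt_embeds` trips
# validate_prompt_and_prompt_embeds.
try:
    CompletionRequest.model_validate({"model": "my-model"})
except pydantic.ValidationError as err:
    print(err)  # At least one of `prompt` or `prompt_embeds` must be set.
```
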
+class EmbeddingCompletionRequest(OpenAIBaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/embeddings
+    model: Optional[str] = None
+    input: Union[list[int], list[list[int]], str, list[str]]
+    encoding_format: Literal["float", "base64"] = "float"
+    dimensions: Optional[int] = None
+    user: Optional[str] = None
+    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+
+    # --8<-- [start:embedding-pooling-params]
+    additional_data: Optional[Any] = None
+    # --8<-- [end:embedding-pooling-params]
+
+    # --8<-- [start:embedding-extra-params]
+    add_special_tokens: bool = Field(
+        default=True,
+        description=(
+            "If true (the default), special tokens (e.g. BOS) will be added to "
+            "the prompt."),
+    )
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."),
+    )
+
+    # --8<-- [end:embedding-extra-params]
+
+    def to_pooling_params(self):
+        return PoolingParams(dimensions=self.dimensions,
+                             additional_data=self.additional_data)
+
+
+class EmbeddingChatRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    messages: list[ChatCompletionMessageParam]
+
+    encoding_format: Literal["float", "base64"] = "float"
+    dimensions: Optional[int] = None
+    user: Optional[str] = None
+    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+
+    # --8<-- [start:chat-embedding-pooling-params]
+    additional_data: Optional[Any] = None
+    # --8<-- [end:chat-embedding-pooling-params]
+
+    # --8<-- [start:chat-embedding-extra-params]
+    add_special_tokens: bool = Field(
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens so this should be set to false (as is the "
+            "default)."),
+    )
+    chat_template: Optional[str] = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "As of transformers v4.44, default chat template is no longer "
+            "allowed, so you must provide a chat template if the tokenizer "
+            "does not define one."),
+    )
+    chat_template_kwargs: Optional[dict[str, Any]] = Field(
+        default=None,
+        description=("Additional kwargs to pass to the template renderer. "
+                     "Will be accessible by the chat template."),
+    )
+    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+        default=None,
+        description=("Additional kwargs to pass to the HF processor."),
+    )
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."),
+    )
+    # --8<-- [end:chat-embedding-extra-params]
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_generation_prompt(cls, data):
+        if data.get("continue_final_message") and data.get(
+                "add_generation_prompt"):
+            raise ValueError("Cannot set both `continue_final_message` and "
+                             "`add_generation_prompt` to True.")
+        return data
+
+    def to_pooling_params(self):
+        return PoolingParams(dimensions=self.dimensions,
+                             additional_data=self.additional_data)
+
+
+EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest]
+
+PoolingCompletionRequest = EmbeddingCompletionRequest
+PoolingChatRequest = EmbeddingChatRequest
+PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest]
+
+
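Both embedding request flavors funnel into the same `PoolingParams` object via `to_pooling_params`. A sketch of the completion-style path, with invented input text and dimensions:

```python
from vllm.entrypoints.openai.protocol import EmbeddingCompletionRequest

req = EmbeddingCompletionRequest.model_validate({
    "input": "some text to embed",
    "dimensions": 256,  # optional reduced output dimensionality
    "encoding_format": "base64",  # affects response serialization only
})
pooling = req.to_pooling_params()
print(pooling.dimensions)  # 256
```
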
+class ScoreRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    text_1: Union[list[str], str]
+    text_2: Union[list[str], str]
+    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+
+    # --8<-- [start:score-pooling-params]
+    additional_data: Optional[Any] = None
+    # --8<-- [end:score-pooling-params]
+
+    # --8<-- [start:score-extra-params]
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."),
+    )
+
+    # --8<-- [end:score-extra-params]
+
+    def to_pooling_params(self):
+        return PoolingParams(additional_data=self.additional_data)
+
+
+class RerankRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    query: str
+    documents: list[str]
+    top_n: int = Field(default_factory=lambda: 0)
+    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+
+    # --8<-- [start:rerank-pooling-params]
+    additional_data: Optional[Any] = None
+    # --8<-- [end:rerank-pooling-params]
+
+    # --8<-- [start:rerank-extra-params]
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."),
+    )
+
+    # --8<-- [end:rerank-extra-params]
+
+    def to_pooling_params(self):
+        return PoolingParams(additional_data=self.additional_data)
+
+
+class RerankDocument(BaseModel):
+    text: str
+
+
+class RerankResult(BaseModel):
+    index: int
+    document: RerankDocument
+    relevance_score: float
+
+
+class RerankUsage(BaseModel):
+    total_tokens: int
+
+
+class RerankResponse(OpenAIBaseModel):
+    id: str
+    model: str
+    usage: RerankUsage
+    results: list[RerankResult]
+
+
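For reference, the rerank models above compose into a response payload like the following sketch; all values are invented for illustration:

```python
from vllm.entrypoints.openai.protocol import (RerankDocument, RerankResponse,
                                              RerankResult, RerankUsage)

resp = RerankResponse(
    id="rerank-abc123",  # invented id
    model="my-reranker",  # invented model name
    usage=RerankUsage(total_tokens=42),
    results=[
        RerankResult(index=0,
                     document=RerankDocument(text="a relevant passage"),
                     relevance_score=0.97),
    ],
)
print(resp.model_dump_json(indent=2))
```
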
+class CompletionLogProbs(OpenAIBaseModel):
+    text_offset: list[int] = Field(default_factory=list)
+    token_logprobs: list[Optional[float]] = Field(default_factory=list)
+    tokens: list[str] = Field(default_factory=list)
+    top_logprobs: list[Optional[dict[str, float]]] = Field(
+        default_factory=list)
+
+
+class CompletionResponseChoice(OpenAIBaseModel):
+    index: int
+    text: str
+    logprobs: Optional[CompletionLogProbs] = None
+    finish_reason: Optional[str] = None
+    stop_reason: Optional[Union[int, str]] = Field(
+        default=None,
+        description=(
+            "The stop string or token id that caused the completion "
+            "to stop, None if the completion finished for some other reason "
+            "including encountering the EOS token"),
+    )
+    prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
+
+
+class CompletionResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[CompletionResponseChoice]
+    usage: UsageInfo
+    kv_transfer_params: Optional[dict[str, Any]] = Field(
+        default=None, description="KVTransfer parameters.")
+
+
+class CompletionResponseStreamChoice(OpenAIBaseModel):
+    index: int
+    text: str
+    logprobs: Optional[CompletionLogProbs] = None
+    finish_reason: Optional[str] = None
+    stop_reason: Optional[Union[int, str]] = Field(
+        default=None,
+        description=(
+            "The stop string or token id that caused the completion "
+            "to stop, None if the completion finished for some other reason "
+            "including encountering the EOS token"),
+    )
+
+
+class CompletionStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[CompletionResponseStreamChoice]
+    usage: Optional[UsageInfo] = Field(default=None)
+
+
+class EmbeddingResponseData(OpenAIBaseModel):
+    index: int
+    object: str = "embedding"
+    embedding: Union[list[float], str]
+
+
+class EmbeddingResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
+    object: str = "list"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    data: list[EmbeddingResponseData]
+    usage: UsageInfo
+
+
+class PoolingResponseData(OpenAIBaseModel):
+    index: int
+    object: str = "pooling"
+    data: Union[list[list[float]], list[float], str]
+
+
+class PoolingResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"pool-{random_uuid()}")
+    object: str = "list"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    data: list[PoolingResponseData]
+    usage: UsageInfo
+
+
+class ScoreResponseData(OpenAIBaseModel):
+    index: int
+    object: str = "score"
+    score: float
+
+
+class ScoreResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
+    object: str = "list"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    data: list[ScoreResponseData]
+    usage: UsageInfo
+
+
+class ClassificationRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    input: Union[list[str], str]
+    truncate_prompt_tokens: Optional[int] = None
+    user: Optional[str] = None
+
+    # --8<-- [start:classification-pooling-params]
+    additional_data: Optional[Any] = None
+    # --8<-- [end:classification-pooling-params]
+
+    # --8<-- [start:classification-extra-params]
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."),
+    )
+
+    # --8<-- [end:classification-extra-params]
+
+    def to_pooling_params(self):
+        return PoolingParams(additional_data=self.additional_data)
+
+
+class ClassificationData(OpenAIBaseModel):
+    index: int
+    label: Optional[str]
+    probs: list[float]
+    num_classes: int
+
+
+class ClassificationResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"classify-{random_uuid()}")
+    object: str = "list"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    data: list[ClassificationData]
+    usage: UsageInfo
+
+
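A minimal classification response built from the models above, assuming `UsageInfo` (defined earlier in this file) accepts `prompt_tokens`/`total_tokens` keyword arguments; all values are invented:

```python
from vllm.entrypoints.openai.protocol import (ClassificationData,
                                              ClassificationResponse,
                                              UsageInfo)

resp = ClassificationResponse(
    model="my-classifier",  # invented model name
    data=[
        ClassificationData(index=0,
                           label="positive",
                           probs=[0.91, 0.09],
                           num_classes=2),
    ],
    usage=UsageInfo(prompt_tokens=7, total_tokens=7),
)
```
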
+class FunctionCall(OpenAIBaseModel):
+    name: str
+    arguments: str
+
+
+class ToolCall(OpenAIBaseModel):
+    id: str = Field(default_factory=random_tool_call_id)
+    type: Literal["function"] = "function"
+    function: FunctionCall
+
+
+class DeltaFunctionCall(BaseModel):
+    name: Optional[str] = None
+    arguments: Optional[str] = None
+
+
+# a tool call delta where everything is optional
+class DeltaToolCall(OpenAIBaseModel):
+    id: Optional[str] = None
+    type: Optional[Literal["function"]] = None
+    index: int
+    function: Optional[DeltaFunctionCall] = None
+
+
+class ExtractedToolCallInformation(BaseModel):
+    # indicate if tools were called
+    tools_called: bool
+
+    # extracted tool calls
+    tool_calls: list[ToolCall]
+
+    # content - per OpenAI spec, content AND tool calls can be returned rarely
+    # But some models will do this intentionally
+    content: Optional[str] = None
+
+
+class ChatMessage(OpenAIBaseModel):
+    role: str
+    reasoning_content: Optional[str] = None
+    content: Optional[str] = None
+    tool_calls: list[ToolCall] = Field(default_factory=list)
+
+
+class ChatCompletionLogProb(OpenAIBaseModel):
+    token: str
+    logprob: float = -9999.0
+    bytes: Optional[list[int]] = None
+
+
+class ChatCompletionLogProbsContent(ChatCompletionLogProb):
+    # Workaround: redefine the field-name cache so that it is not
+    # shared with the superclass.
+    field_names: ClassVar[Optional[set[str]]] = None
+    top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)
+
+
+class ChatCompletionLogProbs(OpenAIBaseModel):
+    content: Optional[list[ChatCompletionLogProbsContent]] = None
+
+
+class ChatCompletionResponseChoice(OpenAIBaseModel):
+    index: int
+    message: ChatMessage
+    logprobs: Optional[ChatCompletionLogProbs] = None
+    # per OpenAI spec this is the default
+    finish_reason: Optional[str] = "stop"
+    # not part of the OpenAI spec but included in vLLM for legacy reasons
+    stop_reason: Optional[Union[int, str]] = None
+
+
+class ChatCompletionResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: Literal["chat.completion"] = "chat.completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[ChatCompletionResponseChoice]
+    usage: UsageInfo
+    prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
+    kv_transfer_params: Optional[dict[str, Any]] = Field(
+        default=None, description="KVTransfer parameters.")
+
+
+class DeltaMessage(OpenAIBaseModel):
+    role: Optional[str] = None
+    content: Optional[str] = None
+    reasoning_content: Optional[str] = None
+    tool_calls: list[DeltaToolCall] = Field(default_factory=list)
+
+
+class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
+    index: int
+    delta: DeltaMessage
+    logprobs: Optional[ChatCompletionLogProbs] = None
+    finish_reason: Optional[str] = None
+    stop_reason: Optional[Union[int, str]] = None
+
+
+class ChatCompletionStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[ChatCompletionResponseStreamChoice]
+    usage: Optional[UsageInfo] = Field(default=None)
+
+
+class TranscriptionResponseStreamChoice(OpenAIBaseModel):
+    delta: DeltaMessage
+    finish_reason: Optional[str] = None
+    stop_reason: Optional[Union[int, str]] = None
+
+
+class TranscriptionStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"trsc-{random_uuid()}")
+    object: Literal["transcription.chunk"] = "transcription.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[TranscriptionResponseStreamChoice]
+    usage: Optional[UsageInfo] = Field(default=None)
+
+
+class BatchRequestInput(OpenAIBaseModel):
+    """
+    The per-line object of the batch input file.
+
+    NOTE: Currently the `/v1/chat/completions`, `/v1/embeddings`, and
+    `/v1/score` endpoints are supported.
+ """
1486
+
1487
+ # A developer-provided per-request id that will be used to match outputs to
1488
+ # inputs. Must be unique for each request in a batch.
1489
+ custom_id: str
1490
+
1491
+ # The HTTP method to be used for the request. Currently only POST is
1492
+ # supported.
1493
+ method: str
1494
+
1495
+    # The OpenAI API relative URL to be used for the request. Currently
+    # /v1/chat/completions, /v1/embeddings, and /v1/score are supported.
+
+    # The parameters of the request.
+    body: Union[ChatCompletionRequest, EmbeddingRequest, ScoreRequest]
+
+    @field_validator('body', mode='plain')
+    @classmethod
+    def check_type_for_url(cls, value: Any, info: ValidationInfo):
+        # Use url to disambiguate models
+        url = info.data['url']
+        if url == "/v1/chat/completions":
+            return ChatCompletionRequest.model_validate(value)
+        if url == "/v1/embeddings":
+            return TypeAdapter(EmbeddingRequest).validate_python(value)
+        if url == "/v1/score":
+            return ScoreRequest.model_validate(value)
+        return TypeAdapter(Union[ChatCompletionRequest, EmbeddingRequest,
+                                 ScoreRequest]).validate_python(value)
+
+
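Each line of a batch input file is one JSON object; `check_type_for_url` uses the `url` field to pick the right request model for `body`. A sketch of one such line, with an invented request:

```python
import json

from vllm.entrypoints.openai.protocol import (BatchRequestInput,
                                              ChatCompletionRequest)

line = json.dumps({
    "custom_id": "request-1",
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "my-model",  # invented model name
        "messages": [{"role": "user", "content": "Hello!"}],
    },
})
item = BatchRequestInput.model_validate_json(line)
# The url routed the body to the chat-completion model.
assert isinstance(item.body, ChatCompletionRequest)
```
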
+class BatchResponseData(OpenAIBaseModel):
+    # HTTP status code of the response.
+    status_code: int = 200
+
+    # A unique identifier for the API request.
+    request_id: str
+
+    # The body of the response.
+    body: Optional[Union[ChatCompletionResponse, EmbeddingResponse,
+                         ScoreResponse]] = None
+
+
+class BatchRequestOutput(OpenAIBaseModel):
+    """
+    The per-line object of the batch output and error files
+    """
+
+    id: str
+
+    # A developer-provided per-request id that will be used to match outputs to
+    # inputs.
+    custom_id: str
+
+    response: Optional[BatchResponseData]
+
+    # For requests that failed with a non-HTTP error, this will contain more
+    # information on the cause of the failure.
+    error: Optional[Any]
+
+
+class TokenizeCompletionRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    prompt: str
+
+    add_special_tokens: bool = Field(
+        default=True,
+        description=(
+            "If true (the default), special tokens (e.g. BOS) will be added to "
+            "the prompt."),
+    )
+
+
+class TokenizeChatRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    messages: list[ChatCompletionMessageParam]
+
+    add_generation_prompt: bool = Field(
+        default=True,
+        description=
+        ("If true, the generation prompt will be added to the chat template. "
+         "This is a parameter used by chat template in tokenizer config of the "
+         "model."),
+    )
+    continue_final_message: bool = Field(
+        default=False,
+        description=
+        ("If this is set, the chat will be formatted so that the final "
+         "message in the chat is open-ended, without any EOS tokens. The "
+         "model will continue this message rather than starting a new one. "
+         "This allows you to \"prefill\" part of the model's response for it. "
+         "Cannot be used at the same time as `add_generation_prompt`."),
+    )
+    add_special_tokens: bool = Field(
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens so this should be set to false (as is the "
+            "default)."),
+    )
+    chat_template: Optional[str] = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "As of transformers v4.44, default chat template is no longer "
+            "allowed, so you must provide a chat template if the tokenizer "
+            "does not define one."),
+    )
+    chat_template_kwargs: Optional[dict[str, Any]] = Field(
+        default=None,
+        description=("Additional kwargs to pass to the template renderer. "
+                     "Will be accessible by the chat template."),
+    )
+    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+        default=None,
+        description=("Additional kwargs to pass to the HF processor."),
+    )
+    tools: Optional[list[ChatCompletionToolsParam]] = Field(
+        default=None,
+        description=("A list of tools the model may call."),
+    )
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_generation_prompt(cls, data):
+        if data.get("continue_final_message") and data.get(
+                "add_generation_prompt"):
+            raise ValueError("Cannot set both `continue_final_message` and "
+                             "`add_generation_prompt` to True.")
+        return data
+
+
+TokenizeRequest = Union[TokenizeCompletionRequest, TokenizeChatRequest]
+
+
+class TokenizeResponse(OpenAIBaseModel):
+    count: int
+    max_model_len: int
+    tokens: list[int]
+
+
+class DetokenizeRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    tokens: list[int]
+
+
+class DetokenizeResponse(OpenAIBaseModel):
+    prompt: str
+
+
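The `check_generation_prompt` validator above makes `continue_final_message` and `add_generation_prompt` mutually exclusive. A sketch of the failure mode, assuming the usual import path:

```python
import pydantic

from vllm.entrypoints.openai.protocol import TokenizeChatRequest

try:
    TokenizeChatRequest.model_validate({
        "messages": [{"role": "assistant", "content": "The answer is"}],
        "add_generation_prompt": True,
        "continue_final_message": True,  # conflicts with the flag above
    })
except pydantic.ValidationError as err:
    print(err)  # Cannot set both `continue_final_message` and ...
```
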
+class LoadLoRAAdapterRequest(BaseModel):
+    lora_name: str
+    lora_path: str
+
+
+class UnloadLoRAAdapterRequest(BaseModel):
+    lora_name: str
+    lora_int_id: Optional[int] = Field(default=None)
+
+
+## Protocols for Audio
+AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json",
+                                         "vtt"]
+
+
+class TranscriptionRequest(OpenAIBaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/audio/createTranscription
+
+    file: UploadFile
+    """
+    The audio file object (not file name) to transcribe, in one of these
+    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+    """
+
+    model: Optional[str] = None
+    """ID of the model to use.
+    """
+
+    language: Optional[str] = None
+    """The language of the input audio.
+
+    Supplying the input language in
+    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
+    will improve accuracy and latency.
+    """
+
+    prompt: str = Field(default="")
+    """An optional text to guide the model's style or continue a previous audio
+    segment.
+
+    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+    should match the audio language.
+    """
+
+    response_format: AudioResponseFormat = Field(default="json")
+    """
+    The format of the output, in one of these options: `json`, `text`, `srt`,
+    `verbose_json`, or `vtt`.
+    """
+
+    ## TODO (varun) : Support if set to 0, certain thresholds are met !!
+
+    timestamp_granularities: list[Literal["word", "segment"]] = Field(
+        alias="timestamp_granularities[]", default=[])
+    """The timestamp granularities to populate for this transcription.
+
+    `response_format` must be set to `verbose_json` to use timestamp
+    granularities. Either or both of these options are supported: `word` or
+    `segment`. Note:
+    There is no additional latency for segment timestamps, but generating word
+    timestamps incurs additional latency.
+    """
+
+    # --8<-- [start:transcription-extra-params]
+    stream: Optional[bool] = False
+    """Custom field not present in the original OpenAI definition. When set,
+    it will enable output to be streamed in a similar fashion as the Chat
+    Completion endpoint.
+    """
+    # Flattened stream option to simplify form data.
+    stream_include_usage: Optional[bool] = False
+    stream_continuous_usage_stats: Optional[bool] = False
+    # --8<-- [end:transcription-extra-params]
+
+    # --8<-- [start:transcription-sampling-params]
+    temperature: float = Field(default=0.0)
+    """The sampling temperature, between 0 and 1.
+
+    Higher values like 0.8 will make the output more random, while lower values
+    like 0.2 will make it more focused / deterministic. If set to 0, the model
+    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
+    to automatically increase the temperature until certain thresholds are hit.
+    """
+
+    top_p: Optional[float] = None
+    """Enables nucleus (top-p) sampling, where tokens are selected from the
+    smallest possible set whose cumulative probability exceeds `p`.
+    """
+
+    top_k: Optional[int] = None
+    """Limits sampling to the `k` most probable tokens at each step."""
+
+    min_p: Optional[float] = None
+    """Filters out tokens with a probability lower than `min_p`, ensuring a
+    minimum likelihood threshold during sampling.
+    """
+
+    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    """The seed to use for sampling."""
+
+    frequency_penalty: Optional[float] = 0.0
+    """The frequency penalty to use for sampling."""
+
+    repetition_penalty: Optional[float] = None
+    """The repetition penalty to use for sampling."""
+
+    presence_penalty: Optional[float] = 0.0
+    """The presence penalty to use for sampling."""
+    # --8<-- [end:transcription-sampling-params]
+
+    # Default sampling parameters for transcription requests.
+    _DEFAULT_SAMPLING_PARAMS: dict = {
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": 0,
+        "min_p": 0.0,
+    }
+
+    def to_sampling_params(
+            self,
+            default_max_tokens: int,
+            default_sampling_params: Optional[dict] = None) -> SamplingParams:
+        # TODO(#9845): remove max_tokens when field is removed from OpenAI API
+        max_tokens = default_max_tokens
+
+        if default_sampling_params is None:
+            default_sampling_params = {}
+
+        # Default parameters
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+        if (top_p := self.top_p) is None:
+            top_p = default_sampling_params.get(
+                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+        if (top_k := self.top_k) is None:
+            top_k = default_sampling_params.get(
+                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+        if (min_p := self.min_p) is None:
+            min_p = default_sampling_params.get(
+                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+
+        if (repetition_penalty := self.repetition_penalty) is None:
+            repetition_penalty = default_sampling_params.get(
+                "repetition_penalty",
+                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"])
+
+        return SamplingParams.from_optional(
+            temperature=temperature,
+            max_tokens=max_tokens,
+            seed=self.seed,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            frequency_penalty=self.frequency_penalty,
+            repetition_penalty=repetition_penalty,
+            presence_penalty=self.presence_penalty,
+            output_kind=RequestOutputKind.DELTA
+            if self.stream else RequestOutputKind.FINAL_ONLY)
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_transcription_request(cls, data):
+        if isinstance(data.get("file"), str):
+            raise HTTPException(
+                status_code=HTTPStatus.UNPROCESSABLE_ENTITY,
+                detail="Expected 'file' to be a file-like object, not 'str'.",
+            )
+
+        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
+        stream = data.get("stream", False)
+        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
+            raise ValueError(
+                "Stream options can only be defined when `stream=True`.")
+
+        return data
+
+
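One subtlety in the resolution above: `temperature` is a plain `float` defaulting to 0.0, so its `is None` fallback never fires; only the `Optional` fields (`top_p`, `top_k`, `min_p`, `repetition_penalty`) can pick up server-side defaults. A sketch, using pydantic's `model_construct` to skip file validation; the default values passed in are invented:

```python
from vllm.entrypoints.openai.protocol import TranscriptionRequest

# model_construct skips validation, so no real audio upload is needed just
# to inspect how the decoding parameters resolve.
req = TranscriptionRequest.model_construct(file=None)

params = req.to_sampling_params(
    default_max_tokens=448,  # invented server-side cap
    default_sampling_params={"temperature": 0.7, "top_p": 0.95},
)
assert params.temperature == 0.0  # field default wins; it is never None
assert params.top_p == 0.95  # Optional field falls back to the server default
```
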
+# Transcription response objects
+class TranscriptionResponse(OpenAIBaseModel):
+    text: str
+    """The transcribed text."""
+
+
+class TranscriptionWord(OpenAIBaseModel):
+    end: float
+    """End time of the word in seconds."""
+
+    start: float
+    """Start time of the word in seconds."""
+
+    word: str
+    """The text content of the word."""
+
+
+class TranscriptionSegment(OpenAIBaseModel):
+    id: int
+    """Unique identifier of the segment."""
+
+    avg_logprob: float
+    """Average logprob of the segment.
+
+    If the value is lower than -1, consider the logprobs failed.
+    """
+
+    compression_ratio: float
+    """Compression ratio of the segment.
+
+    If the value is greater than 2.4, consider the compression failed.
+    """
+
+    end: float
+    """End time of the segment in seconds."""
+
+    no_speech_prob: float
+    """Probability of no speech in the segment.
+
+    If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
+    this segment silent.
+    """
+
+    seek: int
+    """Seek offset of the segment."""
+
+    start: float
+    """Start time of the segment in seconds."""
+
+    temperature: float
+    """Temperature parameter used for generating the segment."""
+
+    text: str
+    """Text content of the segment."""
+
+    tokens: list[int]
+    """Array of token IDs for the text content."""
+
+
+class TranscriptionResponseVerbose(OpenAIBaseModel):
+    duration: str
+    """The duration of the input audio."""
+
+    language: str
+    """The language of the input audio."""
+
+    text: str
+    """The transcribed text."""
+
+    segments: Optional[list[TranscriptionSegment]] = None
+    """Segments of the transcribed text and their corresponding details."""
+
+    words: Optional[list[TranscriptionWord]] = None
+    """Extracted words and their corresponding timestamps."""