vllm-cpu-amxbf16 0.9.1 (cp312-cp312-manylinux_2_17_x86_64.whl)

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
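Since this release is the first upload of the package, every file appears as added (+N -0). A listing like the one below can be reproduced locally because a wheel is a plain ZIP archive. The sketch that follows is illustrative only, not the tooling that generated this diff, and the wheel filename is an assumption based on the standard wheel naming convention (PEP 427) applied to the package and version in the title.

    import zipfile

    # Assumed filename, reconstructed from the package name and version above;
    # adjust to match the actual artifact downloaded from the registry.
    WHEEL = "vllm_cpu_amxbf16-0.9.1-cp312-cp312-manylinux_2_17_x86_64.whl"

    # A wheel is a ZIP archive: enumerate its members and their sizes.
    with zipfile.ZipFile(WHEEL) as wheel:
        for info in wheel.infolist():
            print(f"{info.filename} ({info.file_size} bytes)")

Counting lines in each extracted file (rather than bytes, as here) would recover the +N figures shown in the manifest.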
Files changed (1197)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +53 -0
  3. vllm/_custom_ops.py +1828 -0
  4. vllm/_ipex_ops.py +244 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +16 -0
  8. vllm/adapter_commons/models.py +106 -0
  9. vllm/adapter_commons/request.py +26 -0
  10. vllm/adapter_commons/utils.py +93 -0
  11. vllm/adapter_commons/worker_manager.py +39 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +45 -0
  14. vllm/assets/base.py +41 -0
  15. vllm/assets/image.py +34 -0
  16. vllm/assets/video.py +115 -0
  17. vllm/attention/__init__.py +20 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +308 -0
  20. vllm/attention/backends/blocksparse_attn.py +461 -0
  21. vllm/attention/backends/cpu_mla.py +307 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1498 -0
  23. vllm/attention/backends/flash_attn.py +1003 -0
  24. vllm/attention/backends/flashinfer.py +1104 -0
  25. vllm/attention/backends/flashmla.py +244 -0
  26. vllm/attention/backends/hpu_attn.py +313 -0
  27. vllm/attention/backends/ipex_attn.py +398 -0
  28. vllm/attention/backends/mla/__init__.py +0 -0
  29. vllm/attention/backends/mla/common.py +1385 -0
  30. vllm/attention/backends/pallas.py +351 -0
  31. vllm/attention/backends/placeholder_attn.py +400 -0
  32. vllm/attention/backends/rocm_aiter_mla.py +435 -0
  33. vllm/attention/backends/rocm_flash_attn.py +975 -0
  34. vllm/attention/backends/torch_sdpa.py +703 -0
  35. vllm/attention/backends/triton_mla.py +115 -0
  36. vllm/attention/backends/utils.py +610 -0
  37. vllm/attention/backends/xformers.py +802 -0
  38. vllm/attention/layer.py +468 -0
  39. vllm/attention/ops/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  41. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +433 -0
  42. vllm/attention/ops/blocksparse_attention/interface.py +239 -0
  43. vllm/attention/ops/blocksparse_attention/utils.py +246 -0
  44. vllm/attention/ops/chunked_prefill_paged_decode.py +368 -0
  45. vllm/attention/ops/flashmla.py +116 -0
  46. vllm/attention/ops/hpu_paged_attn.py +88 -0
  47. vllm/attention/ops/ipex_attn.py +195 -0
  48. vllm/attention/ops/merge_attn_states.py +43 -0
  49. vllm/attention/ops/nki_flash_attn.py +906 -0
  50. vllm/attention/ops/paged_attn.py +256 -0
  51. vllm/attention/ops/prefix_prefill.py +902 -0
  52. vllm/attention/ops/rocm_aiter_mla.py +100 -0
  53. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  54. vllm/attention/ops/triton_decode_attention.py +674 -0
  55. vllm/attention/ops/triton_flash_attention.py +979 -0
  56. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  57. vllm/attention/ops/triton_unified_attention.py +334 -0
  58. vllm/attention/selector.py +187 -0
  59. vllm/attention/utils/fa_utils.py +55 -0
  60. vllm/beam_search.py +87 -0
  61. vllm/benchmarks/__init__.py +0 -0
  62. vllm/benchmarks/datasets.py +1185 -0
  63. vllm/benchmarks/endpoint_request_func.py +381 -0
  64. vllm/benchmarks/latency.py +168 -0
  65. vllm/benchmarks/serve.py +1135 -0
  66. vllm/benchmarks/throughput.py +609 -0
  67. vllm/benchmarks/utils.py +70 -0
  68. vllm/collect_env.py +820 -0
  69. vllm/compilation/__init__.py +0 -0
  70. vllm/compilation/activation_quant_fusion.py +89 -0
  71. vllm/compilation/backends.py +563 -0
  72. vllm/compilation/base_piecewise_backend.py +72 -0
  73. vllm/compilation/collective_fusion.py +127 -0
  74. vllm/compilation/compiler_interface.py +544 -0
  75. vllm/compilation/counter.py +38 -0
  76. vllm/compilation/cuda_piecewise_backend.py +214 -0
  77. vllm/compilation/decorators.py +250 -0
  78. vllm/compilation/fix_functionalization.py +191 -0
  79. vllm/compilation/fusion.py +618 -0
  80. vllm/compilation/fx_utils.py +62 -0
  81. vllm/compilation/inductor_pass.py +115 -0
  82. vllm/compilation/monitor.py +39 -0
  83. vllm/compilation/multi_output_match.py +109 -0
  84. vllm/compilation/noop_elimination.py +137 -0
  85. vllm/compilation/pass_manager.py +78 -0
  86. vllm/compilation/sequence_parallelism.py +268 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  88. vllm/compilation/vllm_inductor_pass.py +67 -0
  89. vllm/compilation/wrapper.py +135 -0
  90. vllm/config.py +4746 -0
  91. vllm/connections.py +174 -0
  92. vllm/core/__init__.py +0 -0
  93. vllm/core/block/__init__.py +0 -0
  94. vllm/core/block/block_table.py +399 -0
  95. vllm/core/block/common.py +371 -0
  96. vllm/core/block/cpu_gpu_block_allocator.py +441 -0
  97. vllm/core/block/interfaces.py +319 -0
  98. vllm/core/block/naive_block.py +466 -0
  99. vllm/core/block/prefix_caching_block.py +1135 -0
  100. vllm/core/block/utils.py +28 -0
  101. vllm/core/block_manager.py +521 -0
  102. vllm/core/evictor.py +157 -0
  103. vllm/core/interfaces.py +135 -0
  104. vllm/core/placeholder_block_space_manager.py +100 -0
  105. vllm/core/scheduler.py +2093 -0
  106. vllm/device_allocator/__init__.py +0 -0
  107. vllm/device_allocator/cumem.py +281 -0
  108. vllm/distributed/__init__.py +6 -0
  109. vllm/distributed/communication_op.py +41 -0
  110. vllm/distributed/device_communicators/__init__.py +0 -0
  111. vllm/distributed/device_communicators/all2all.py +264 -0
  112. vllm/distributed/device_communicators/base_device_communicator.py +260 -0
  113. vllm/distributed/device_communicators/cpu_communicator.py +145 -0
  114. vllm/distributed/device_communicators/cuda_communicator.py +176 -0
  115. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  116. vllm/distributed/device_communicators/custom_all_reduce.py +304 -0
  117. vllm/distributed/device_communicators/custom_all_reduce_utils.py +259 -0
  118. vllm/distributed/device_communicators/hpu_communicator.py +46 -0
  119. vllm/distributed/device_communicators/neuron_communicator.py +20 -0
  120. vllm/distributed/device_communicators/pynccl.py +218 -0
  121. vllm/distributed/device_communicators/pynccl_wrapper.py +341 -0
  122. vllm/distributed/device_communicators/shm_broadcast.py +585 -0
  123. vllm/distributed/device_communicators/tpu_communicator.py +103 -0
  124. vllm/distributed/device_communicators/xpu_communicator.py +55 -0
  125. vllm/distributed/kv_events.py +356 -0
  126. vllm/distributed/kv_transfer/README.md +29 -0
  127. vllm/distributed/kv_transfer/__init__.py +12 -0
  128. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  129. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  130. vllm/distributed/kv_transfer/kv_connector/base.py +128 -0
  131. vllm/distributed/kv_transfer/kv_connector/factory.py +128 -0
  132. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +99 -0
  133. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +203 -0
  134. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +329 -0
  135. vllm/distributed/kv_transfer/kv_connector/utils.py +108 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/base.py +283 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +134 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +201 -0
  140. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1030 -0
  141. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +384 -0
  142. vllm/distributed/kv_transfer/kv_connector_agent.py +77 -0
  143. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  144. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  145. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  146. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  147. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  148. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  149. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +280 -0
  150. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  151. vllm/distributed/kv_transfer/kv_transfer_state.py +71 -0
  152. vllm/distributed/parallel_state.py +1296 -0
  153. vllm/distributed/tpu_distributed_utils.py +177 -0
  154. vllm/distributed/utils.py +536 -0
  155. vllm/engine/__init__.py +0 -0
  156. vllm/engine/arg_utils.py +1708 -0
  157. vllm/engine/async_llm_engine.py +1200 -0
  158. vllm/engine/async_timeout.py +173 -0
  159. vllm/engine/llm_engine.py +2097 -0
  160. vllm/engine/metrics.py +629 -0
  161. vllm/engine/metrics_types.py +94 -0
  162. vllm/engine/multiprocessing/__init__.py +148 -0
  163. vllm/engine/multiprocessing/client.py +681 -0
  164. vllm/engine/multiprocessing/engine.py +460 -0
  165. vllm/engine/output_processor/__init__.py +0 -0
  166. vllm/engine/output_processor/interfaces.py +75 -0
  167. vllm/engine/output_processor/multi_step.py +216 -0
  168. vllm/engine/output_processor/single_step.py +145 -0
  169. vllm/engine/output_processor/stop_checker.py +131 -0
  170. vllm/engine/output_processor/util.py +28 -0
  171. vllm/engine/protocol.py +317 -0
  172. vllm/entrypoints/__init__.py +0 -0
  173. vllm/entrypoints/api_server.py +178 -0
  174. vllm/entrypoints/chat_utils.py +1299 -0
  175. vllm/entrypoints/cli/__init__.py +0 -0
  176. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  177. vllm/entrypoints/cli/benchmark/base.py +39 -0
  178. vllm/entrypoints/cli/benchmark/latency.py +30 -0
  179. vllm/entrypoints/cli/benchmark/main.py +54 -0
  180. vllm/entrypoints/cli/benchmark/serve.py +30 -0
  181. vllm/entrypoints/cli/benchmark/throughput.py +30 -0
  182. vllm/entrypoints/cli/collect_env.py +35 -0
  183. vllm/entrypoints/cli/main.py +65 -0
  184. vllm/entrypoints/cli/openai.py +205 -0
  185. vllm/entrypoints/cli/run_batch.py +62 -0
  186. vllm/entrypoints/cli/serve.py +328 -0
  187. vllm/entrypoints/cli/types.py +25 -0
  188. vllm/entrypoints/launcher.py +147 -0
  189. vllm/entrypoints/llm.py +1544 -0
  190. vllm/entrypoints/logger.py +50 -0
  191. vllm/entrypoints/openai/__init__.py +0 -0
  192. vllm/entrypoints/openai/api_server.py +1387 -0
  193. vllm/entrypoints/openai/cli_args.py +315 -0
  194. vllm/entrypoints/openai/logits_processors.py +90 -0
  195. vllm/entrypoints/openai/protocol.py +1913 -0
  196. vllm/entrypoints/openai/run_batch.py +463 -0
  197. vllm/entrypoints/openai/serving_chat.py +1221 -0
  198. vllm/entrypoints/openai/serving_classification.py +160 -0
  199. vllm/entrypoints/openai/serving_completion.py +592 -0
  200. vllm/entrypoints/openai/serving_embedding.py +201 -0
  201. vllm/entrypoints/openai/serving_engine.py +986 -0
  202. vllm/entrypoints/openai/serving_models.py +315 -0
  203. vllm/entrypoints/openai/serving_pooling.py +232 -0
  204. vllm/entrypoints/openai/serving_score.py +433 -0
  205. vllm/entrypoints/openai/serving_tokenization.py +157 -0
  206. vllm/entrypoints/openai/serving_transcription.py +424 -0
  207. vllm/entrypoints/openai/tool_parsers/__init__.py +23 -0
  208. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  209. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  210. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  211. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  212. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +371 -0
  213. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  214. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  215. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  216. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +267 -0
  217. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  218. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  219. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  220. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  221. vllm/entrypoints/score_utils.py +50 -0
  222. vllm/entrypoints/ssl.py +75 -0
  223. vllm/entrypoints/utils.py +233 -0
  224. vllm/env_override.py +41 -0
  225. vllm/envs.py +944 -0
  226. vllm/executor/__init__.py +0 -0
  227. vllm/executor/executor_base.py +401 -0
  228. vllm/executor/mp_distributed_executor.py +244 -0
  229. vllm/executor/msgspec_utils.py +30 -0
  230. vllm/executor/multiproc_worker_utils.py +313 -0
  231. vllm/executor/ray_distributed_executor.py +701 -0
  232. vllm/executor/ray_utils.py +399 -0
  233. vllm/executor/uniproc_executor.py +139 -0
  234. vllm/forward_context.py +179 -0
  235. vllm/inputs/__init__.py +41 -0
  236. vllm/inputs/data.py +331 -0
  237. vllm/inputs/parse.py +151 -0
  238. vllm/inputs/preprocess.py +909 -0
  239. vllm/inputs/registry.py +237 -0
  240. vllm/jsontree.py +80 -0
  241. vllm/logger.py +212 -0
  242. vllm/logging_utils/__init__.py +8 -0
  243. vllm/logging_utils/dump_input.py +85 -0
  244. vllm/logging_utils/formatter.py +18 -0
  245. vllm/logits_process.py +119 -0
  246. vllm/lora/__init__.py +0 -0
  247. vllm/lora/fully_sharded_layers.py +355 -0
  248. vllm/lora/layers.py +1285 -0
  249. vllm/lora/lora.py +199 -0
  250. vllm/lora/models.py +818 -0
  251. vllm/lora/ops/__init__.py +0 -0
  252. vllm/lora/ops/torch_ops/__init__.py +16 -0
  253. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  254. vllm/lora/ops/triton_ops/__init__.py +12 -0
  255. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  256. vllm/lora/ops/triton_ops/lora_expand_op.py +290 -0
  257. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  258. vllm/lora/ops/triton_ops/lora_shrink_op.py +244 -0
  259. vllm/lora/ops/triton_ops/utils.py +120 -0
  260. vllm/lora/ops/xla_ops/__init__.py +7 -0
  261. vllm/lora/ops/xla_ops/lora_ops.py +145 -0
  262. vllm/lora/peft_helper.py +136 -0
  263. vllm/lora/punica_wrapper/__init__.py +10 -0
  264. vllm/lora/punica_wrapper/punica_base.py +485 -0
  265. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  266. vllm/lora/punica_wrapper/punica_gpu.py +290 -0
  267. vllm/lora/punica_wrapper/punica_hpu.py +145 -0
  268. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  269. vllm/lora/punica_wrapper/punica_tpu.py +405 -0
  270. vllm/lora/punica_wrapper/utils.py +164 -0
  271. vllm/lora/request.py +99 -0
  272. vllm/lora/resolver.py +85 -0
  273. vllm/lora/utils.py +240 -0
  274. vllm/lora/worker_manager.py +259 -0
  275. vllm/model_executor/__init__.py +16 -0
  276. vllm/model_executor/custom_op.py +152 -0
  277. vllm/model_executor/guided_decoding/__init__.py +181 -0
  278. vllm/model_executor/guided_decoding/guidance_decoding.py +63 -0
  279. vllm/model_executor/guided_decoding/guidance_logits_processors.py +104 -0
  280. vllm/model_executor/guided_decoding/guided_fields.py +41 -0
  281. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +67 -0
  282. vllm/model_executor/guided_decoding/outlines_decoding.py +155 -0
  283. vllm/model_executor/guided_decoding/outlines_logits_processors.py +284 -0
  284. vllm/model_executor/guided_decoding/utils.py +242 -0
  285. vllm/model_executor/guided_decoding/xgrammar_decoding.py +426 -0
  286. vllm/model_executor/layers/__init__.py +0 -0
  287. vllm/model_executor/layers/activation.py +369 -0
  288. vllm/model_executor/layers/fused_moe/__init__.py +54 -0
  289. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +125 -0
  290. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +117 -0
  291. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  292. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  293. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  294. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  295. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  296. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  297. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  298. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  299. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  300. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  301. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  302. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  303. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  304. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  305. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  455. vllm/model_executor/layers/fused_moe/cutlass_moe.py +461 -0
  456. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +240 -0
  457. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +240 -0
  458. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +186 -0
  459. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +775 -0
  460. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +232 -0
  461. vllm/model_executor/layers/fused_moe/fused_moe.py +1724 -0
  462. vllm/model_executor/layers/fused_moe/layer.py +1535 -0
  463. vllm/model_executor/layers/fused_moe/modular_kernel.py +446 -0
  464. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +243 -0
  465. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  466. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +190 -0
  467. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  468. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +159 -0
  469. vllm/model_executor/layers/fused_moe/prepare_finalize.py +69 -0
  470. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +421 -0
  471. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +117 -0
  472. vllm/model_executor/layers/fused_moe/utils.py +98 -0
  473. vllm/model_executor/layers/layernorm.py +288 -0
  474. vllm/model_executor/layers/lightning_attn.py +652 -0
  475. vllm/model_executor/layers/linear.py +1524 -0
  476. vllm/model_executor/layers/logits_processor.py +197 -0
  477. vllm/model_executor/layers/mamba/__init__.py +0 -0
  478. vllm/model_executor/layers/mamba/mamba2_metadata.py +125 -0
  479. vllm/model_executor/layers/mamba/mamba_mixer.py +245 -0
  480. vllm/model_executor/layers/mamba/mamba_mixer2.py +616 -0
  481. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  482. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +105 -0
  483. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  484. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +262 -0
  485. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +589 -0
  486. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +751 -0
  487. vllm/model_executor/layers/mamba/ops/ssd_combined.py +232 -0
  488. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +206 -0
  489. vllm/model_executor/layers/pooler.py +350 -0
  490. vllm/model_executor/layers/quantization/__init__.py +157 -0
  491. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  492. vllm/model_executor/layers/quantization/auto_round.py +310 -0
  493. vllm/model_executor/layers/quantization/awq.py +194 -0
  494. vllm/model_executor/layers/quantization/awq_marlin.py +519 -0
  495. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  496. vllm/model_executor/layers/quantization/base_config.py +151 -0
  497. vllm/model_executor/layers/quantization/bitblas.py +461 -0
  498. vllm/model_executor/layers/quantization/bitsandbytes.py +396 -0
  499. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  500. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +668 -0
  501. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1260 -0
  502. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +24 -0
  503. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +358 -0
  504. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  505. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  506. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +93 -0
  507. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +178 -0
  508. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  509. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +150 -0
  510. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  511. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  512. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  513. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  514. vllm/model_executor/layers/quantization/deepspeedfp.py +195 -0
  515. vllm/model_executor/layers/quantization/experts_int8.py +196 -0
  516. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  517. vllm/model_executor/layers/quantization/fp8.py +906 -0
  518. vllm/model_executor/layers/quantization/gguf.py +565 -0
  519. vllm/model_executor/layers/quantization/gptq.py +278 -0
  520. vllm/model_executor/layers/quantization/gptq_bitblas.py +445 -0
  521. vllm/model_executor/layers/quantization/gptq_marlin.py +648 -0
  522. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  523. vllm/model_executor/layers/quantization/hqq_marlin.py +332 -0
  524. vllm/model_executor/layers/quantization/ipex_quant.py +250 -0
  525. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  526. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +90 -0
  527. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +83 -0
  528. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  529. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +300 -0
  530. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  531. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +120 -0
  532. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +131 -0
  533. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  534. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +87 -0
  535. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +120 -0
  536. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  537. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  538. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +105 -0
  539. vllm/model_executor/layers/quantization/kv_cache.py +139 -0
  540. vllm/model_executor/layers/quantization/marlin.py +261 -0
  541. vllm/model_executor/layers/quantization/modelopt.py +737 -0
  542. vllm/model_executor/layers/quantization/moe_wna16.py +449 -0
  543. vllm/model_executor/layers/quantization/neuron_quant.py +76 -0
  544. vllm/model_executor/layers/quantization/ptpc_fp8.py +127 -0
  545. vllm/model_executor/layers/quantization/qqq.py +275 -0
  546. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  547. vllm/model_executor/layers/quantization/quark/quark.py +441 -0
  548. vllm/model_executor/layers/quantization/quark/quark_moe.py +237 -0
  549. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  550. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  551. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +126 -0
  552. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +146 -0
  553. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  554. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  555. vllm/model_executor/layers/quantization/schema.py +86 -0
  556. vllm/model_executor/layers/quantization/torchao.py +161 -0
  557. vllm/model_executor/layers/quantization/tpu_int8.py +121 -0
  558. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  559. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  560. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +208 -0
  561. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  562. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  563. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  564. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  565. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  566. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  567. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  568. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  569. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  570. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  571. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  572. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  573. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  574. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  575. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  576. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  577. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  578. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  579. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  580. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  581. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  582. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  583. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  763. vllm/model_executor/layers/quantization/utils/fp8_utils.py +618 -0
  764. vllm/model_executor/layers/quantization/utils/gptq_utils.py +95 -0
  765. vllm/model_executor/layers/quantization/utils/int8_utils.py +485 -0
  766. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  767. vllm/model_executor/layers/quantization/utils/machete_utils.py +33 -0
  768. vllm/model_executor/layers/quantization/utils/marlin_utils.py +476 -0
  769. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +283 -0
  770. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +325 -0
  771. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  772. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  773. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +126 -0
  774. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +45 -0
  775. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +104 -0
  776. vllm/model_executor/layers/quantization/utils/quant_utils.py +573 -0
  777. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +405 -0
  778. vllm/model_executor/layers/rejection_sampler.py +406 -0
  779. vllm/model_executor/layers/resampler.py +270 -0
  780. vllm/model_executor/layers/rotary_embedding.py +1862 -0
  781. vllm/model_executor/layers/sampler.py +1204 -0
  782. vllm/model_executor/layers/spec_decode_base_sampler.py +259 -0
  783. vllm/model_executor/layers/typical_acceptance_sampler.py +166 -0
  784. vllm/model_executor/layers/utils.py +95 -0
  785. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  786. vllm/model_executor/model_loader/__init__.py +76 -0
  787. vllm/model_executor/model_loader/base_loader.py +43 -0
  788. vllm/model_executor/model_loader/bitsandbytes_loader.py +570 -0
  789. vllm/model_executor/model_loader/default_loader.py +282 -0
  790. vllm/model_executor/model_loader/dummy_loader.py +27 -0
  791. vllm/model_executor/model_loader/gguf_loader.py +120 -0
  792. vllm/model_executor/model_loader/neuron.py +476 -0
  793. vllm/model_executor/model_loader/neuronx_distributed.py +685 -0
  794. vllm/model_executor/model_loader/runai_streamer_loader.py +109 -0
  795. vllm/model_executor/model_loader/sharded_state_loader.py +201 -0
  796. vllm/model_executor/model_loader/tensorizer.py +600 -0
  797. vllm/model_executor/model_loader/tensorizer_loader.py +123 -0
  798. vllm/model_executor/model_loader/tpu.py +112 -0
  799. vllm/model_executor/model_loader/utils.py +302 -0
  800. vllm/model_executor/model_loader/weight_utils.py +782 -0
  801. vllm/model_executor/models/__init__.py +28 -0
  802. vllm/model_executor/models/adapters.py +248 -0
  803. vllm/model_executor/models/aimv2.py +246 -0
  804. vllm/model_executor/models/arctic.py +559 -0
  805. vllm/model_executor/models/aria.py +657 -0
  806. vllm/model_executor/models/aya_vision.py +466 -0
  807. vllm/model_executor/models/baichuan.py +474 -0
  808. vllm/model_executor/models/bamba.py +543 -0
  809. vllm/model_executor/models/bart.py +938 -0
  810. vllm/model_executor/models/bert.py +523 -0
  811. vllm/model_executor/models/bert_with_rope.py +769 -0
  812. vllm/model_executor/models/blip.py +339 -0
  813. vllm/model_executor/models/blip2.py +718 -0
  814. vllm/model_executor/models/bloom.py +373 -0
  815. vllm/model_executor/models/chameleon.py +1136 -0
  816. vllm/model_executor/models/chatglm.py +478 -0
  817. vllm/model_executor/models/clip.py +407 -0
  818. vllm/model_executor/models/commandr.py +472 -0
  819. vllm/model_executor/models/constant_size_cache.py +137 -0
  820. vllm/model_executor/models/dbrx.py +472 -0
  821. vllm/model_executor/models/deepseek.py +486 -0
  822. vllm/model_executor/models/deepseek_mtp.py +269 -0
  823. vllm/model_executor/models/deepseek_v2.py +843 -0
  824. vllm/model_executor/models/deepseek_vl2.py +648 -0
  825. vllm/model_executor/models/eagle.py +260 -0
  826. vllm/model_executor/models/exaone.py +551 -0
  827. vllm/model_executor/models/fairseq2_llama.py +154 -0
  828. vllm/model_executor/models/falcon.py +510 -0
  829. vllm/model_executor/models/falcon_h1.py +685 -0
  830. vllm/model_executor/models/florence2.py +1103 -0
  831. vllm/model_executor/models/fuyu.py +389 -0
  832. vllm/model_executor/models/gemma.py +425 -0
  833. vllm/model_executor/models/gemma2.py +425 -0
  834. vllm/model_executor/models/gemma3.py +533 -0
  835. vllm/model_executor/models/gemma3_mm.py +709 -0
  836. vllm/model_executor/models/glm.py +23 -0
  837. vllm/model_executor/models/glm4.py +305 -0
  838. vllm/model_executor/models/glm4v.py +648 -0
  839. vllm/model_executor/models/gpt2.py +328 -0
  840. vllm/model_executor/models/gpt_bigcode.py +335 -0
  841. vllm/model_executor/models/gpt_j.py +339 -0
  842. vllm/model_executor/models/gpt_neox.py +332 -0
  843. vllm/model_executor/models/granite.py +493 -0
  844. vllm/model_executor/models/granite_speech.py +779 -0
  845. vllm/model_executor/models/granitemoe.py +437 -0
  846. vllm/model_executor/models/granitemoehybrid.py +586 -0
  847. vllm/model_executor/models/granitemoeshared.py +341 -0
  848. vllm/model_executor/models/gritlm.py +224 -0
  849. vllm/model_executor/models/grok1.py +546 -0
  850. vllm/model_executor/models/h2ovl.py +546 -0
  851. vllm/model_executor/models/idefics2_vision_model.py +389 -0
  852. vllm/model_executor/models/idefics3.py +776 -0
  853. vllm/model_executor/models/interfaces.py +572 -0
  854. vllm/model_executor/models/interfaces_base.py +164 -0
  855. vllm/model_executor/models/intern_vit.py +480 -0
  856. vllm/model_executor/models/internlm2.py +455 -0
  857. vllm/model_executor/models/internlm2_ve.py +147 -0
  858. vllm/model_executor/models/internvl.py +1418 -0
  859. vllm/model_executor/models/jais.py +373 -0
  860. vllm/model_executor/models/jamba.py +592 -0
  861. vllm/model_executor/models/kimi_vl.py +577 -0
  862. vllm/model_executor/models/llama.py +644 -0
  863. vllm/model_executor/models/llama4.py +532 -0
  864. vllm/model_executor/models/llama_eagle.py +165 -0
  865. vllm/model_executor/models/llama_eagle3.py +263 -0
  866. vllm/model_executor/models/llava.py +866 -0
  867. vllm/model_executor/models/llava_next.py +586 -0
  868. vllm/model_executor/models/llava_next_video.py +471 -0
  869. vllm/model_executor/models/llava_onevision.py +956 -0
  870. vllm/model_executor/models/mamba.py +273 -0
  871. vllm/model_executor/models/mamba2.py +308 -0
  872. vllm/model_executor/models/mamba_cache.py +76 -0
  873. vllm/model_executor/models/medusa.py +219 -0
  874. vllm/model_executor/models/mimo.py +192 -0
  875. vllm/model_executor/models/mimo_mtp.py +285 -0
  876. vllm/model_executor/models/minicpm.py +592 -0
  877. vllm/model_executor/models/minicpm3.py +230 -0
  878. vllm/model_executor/models/minicpm_eagle.py +391 -0
  879. vllm/model_executor/models/minicpmo.py +759 -0
  880. vllm/model_executor/models/minicpmv.py +1287 -0
  881. vllm/model_executor/models/minimax_cache.py +36 -0
  882. vllm/model_executor/models/minimax_text_01.py +1301 -0
  883. vllm/model_executor/models/minimax_vl_01.py +364 -0
  884. vllm/model_executor/models/mistral3.py +604 -0
  885. vllm/model_executor/models/mixtral.py +488 -0
  886. vllm/model_executor/models/mixtral_quant.py +453 -0
  887. vllm/model_executor/models/mllama.py +1624 -0
  888. vllm/model_executor/models/mllama4.py +938 -0
  889. vllm/model_executor/models/mlp_speculator.py +206 -0
  890. vllm/model_executor/models/modernbert.py +331 -0
  891. vllm/model_executor/models/module_mapping.py +72 -0
  892. vllm/model_executor/models/molmo.py +1568 -0
  893. vllm/model_executor/models/moonvit.py +630 -0
  894. vllm/model_executor/models/mpt.py +331 -0
  895. vllm/model_executor/models/nemotron.py +508 -0
  896. vllm/model_executor/models/nemotron_h.py +573 -0
  897. vllm/model_executor/models/nemotron_nas.py +484 -0
  898. vllm/model_executor/models/nvlm_d.py +216 -0
  899. vllm/model_executor/models/olmo.py +389 -0
  900. vllm/model_executor/models/olmo2.py +414 -0
  901. vllm/model_executor/models/olmoe.py +468 -0
  902. vllm/model_executor/models/opt.py +412 -0
  903. vllm/model_executor/models/orion.py +349 -0
  904. vllm/model_executor/models/ovis.py +567 -0
  905. vllm/model_executor/models/paligemma.py +398 -0
  906. vllm/model_executor/models/persimmon.py +344 -0
  907. vllm/model_executor/models/phi.py +356 -0
  908. vllm/model_executor/models/phi3.py +19 -0
  909. vllm/model_executor/models/phi3_small.py +465 -0
  910. vllm/model_executor/models/phi3v.py +723 -0
  911. vllm/model_executor/models/phi4mm.py +1246 -0
  912. vllm/model_executor/models/phi4mm_audio.py +1233 -0
  913. vllm/model_executor/models/phi4mm_utils.py +1884 -0
  914. vllm/model_executor/models/phimoe.py +665 -0
  915. vllm/model_executor/models/pixtral.py +1316 -0
  916. vllm/model_executor/models/plamo2.py +738 -0
  917. vllm/model_executor/models/prithvi_geospatial_mae.py +232 -0
  918. vllm/model_executor/models/qwen.py +362 -0
  919. vllm/model_executor/models/qwen2.py +497 -0
  920. vllm/model_executor/models/qwen2_5_omni_thinker.py +904 -0
  921. vllm/model_executor/models/qwen2_5_vl.py +1166 -0
  922. vllm/model_executor/models/qwen2_audio.py +410 -0
  923. vllm/model_executor/models/qwen2_moe.py +540 -0
  924. vllm/model_executor/models/qwen2_rm.py +132 -0
  925. vllm/model_executor/models/qwen2_vl.py +1405 -0
  926. vllm/model_executor/models/qwen3.py +321 -0
  927. vllm/model_executor/models/qwen3_moe.py +535 -0
  928. vllm/model_executor/models/qwen_vl.py +785 -0
  929. vllm/model_executor/models/registry.py +622 -0
  930. vllm/model_executor/models/roberta.py +276 -0
  931. vllm/model_executor/models/siglip.py +524 -0
  932. vllm/model_executor/models/skyworkr1v.py +951 -0
  933. vllm/model_executor/models/smolvlm.py +52 -0
  934. vllm/model_executor/models/solar.py +506 -0
  935. vllm/model_executor/models/stablelm.py +343 -0
  936. vllm/model_executor/models/starcoder2.py +356 -0
  937. vllm/model_executor/models/tarsier.py +643 -0
  938. vllm/model_executor/models/telechat2.py +140 -0
  939. vllm/model_executor/models/teleflm.py +79 -0
  940. vllm/model_executor/models/transformers.py +508 -0
  941. vllm/model_executor/models/ultravox.py +656 -0
  942. vllm/model_executor/models/utils.py +731 -0
  943. vllm/model_executor/models/vision.py +147 -0
  944. vllm/model_executor/models/whisper.py +747 -0
  945. vllm/model_executor/models/zamba2.py +1009 -0
  946. vllm/model_executor/parameter.py +459 -0
  947. vllm/model_executor/pooling_metadata.py +72 -0
  948. vllm/model_executor/sampling_metadata.py +597 -0
  949. vllm/model_executor/utils.py +77 -0
  950. vllm/multimodal/__init__.py +33 -0
  951. vllm/multimodal/audio.py +106 -0
  952. vllm/multimodal/base.py +219 -0
  953. vllm/multimodal/hasher.py +118 -0
  954. vllm/multimodal/image.py +97 -0
  955. vllm/multimodal/inputs.py +876 -0
  956. vllm/multimodal/parse.py +461 -0
  957. vllm/multimodal/processing.py +1895 -0
  958. vllm/multimodal/profiling.py +258 -0
  959. vllm/multimodal/registry.py +331 -0
  960. vllm/multimodal/utils.py +436 -0
  961. vllm/multimodal/video.py +198 -0
  962. vllm/outputs.py +512 -0
  963. vllm/platforms/__init__.py +291 -0
  964. vllm/platforms/cpu.py +266 -0
  965. vllm/platforms/cuda.py +526 -0
  966. vllm/platforms/hpu.py +106 -0
  967. vllm/platforms/interface.py +538 -0
  968. vllm/platforms/neuron.py +150 -0
  969. vllm/platforms/rocm.py +435 -0
  970. vllm/platforms/tpu.py +216 -0
  971. vllm/platforms/xpu.py +156 -0
  972. vllm/plugins/__init__.py +94 -0
  973. vllm/plugins/lora_resolvers/README.md +15 -0
  974. vllm/plugins/lora_resolvers/__init__.py +0 -0
  975. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  976. vllm/pooling_params.py +54 -0
  977. vllm/profiler/__init__.py +0 -0
  978. vllm/profiler/layerwise_profile.py +375 -0
  979. vllm/profiler/utils.py +148 -0
  980. vllm/prompt_adapter/__init__.py +0 -0
  981. vllm/prompt_adapter/layers.py +83 -0
  982. vllm/prompt_adapter/models.py +358 -0
  983. vllm/prompt_adapter/request.py +37 -0
  984. vllm/prompt_adapter/utils.py +98 -0
  985. vllm/prompt_adapter/worker_manager.py +179 -0
  986. vllm/py.typed +2 -0
  987. vllm/reasoning/__init__.py +15 -0
  988. vllm/reasoning/abs_reasoning_parsers.py +192 -0
  989. vllm/reasoning/deepseek_r1_reasoning_parser.py +173 -0
  990. vllm/reasoning/granite_reasoning_parser.py +363 -0
  991. vllm/reasoning/qwen3_reasoning_parser.py +151 -0
  992. vllm/sampling_params.py +602 -0
  993. vllm/scalar_type.py +347 -0
  994. vllm/scripts.py +15 -0
  995. vllm/sequence.py +1568 -0
  996. vllm/spec_decode/__init__.py +0 -0
  997. vllm/spec_decode/batch_expansion.py +506 -0
  998. vllm/spec_decode/draft_model_runner.py +349 -0
  999. vllm/spec_decode/interfaces.py +99 -0
  1000. vllm/spec_decode/medusa_worker.py +138 -0
  1001. vllm/spec_decode/metrics.py +213 -0
  1002. vllm/spec_decode/mlp_speculator_worker.py +94 -0
  1003. vllm/spec_decode/mqa_scorer.py +160 -0
  1004. vllm/spec_decode/multi_step_worker.py +423 -0
  1005. vllm/spec_decode/ngram_worker.py +196 -0
  1006. vllm/spec_decode/proposer_worker_base.py +59 -0
  1007. vllm/spec_decode/smaller_tp_proposer_worker.py +196 -0
  1008. vllm/spec_decode/spec_decode_worker.py +1326 -0
  1009. vllm/spec_decode/target_model_runner.py +45 -0
  1010. vllm/spec_decode/top1_proposer.py +275 -0
  1011. vllm/spec_decode/util.py +277 -0
  1012. vllm/test_utils.py +130 -0
  1013. vllm/third_party/__init__.py +0 -0
  1014. vllm/third_party/pynvml.py +6140 -0
  1015. vllm/tracing.py +131 -0
  1016. vllm/transformers_utils/__init__.py +24 -0
  1017. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1018. vllm/transformers_utils/chat_templates/registry.py +60 -0
  1019. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1020. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1021. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1022. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1023. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1024. vllm/transformers_utils/config.py +887 -0
  1025. vllm/transformers_utils/configs/__init__.py +61 -0
  1026. vllm/transformers_utils/configs/arctic.py +207 -0
  1027. vllm/transformers_utils/configs/chatglm.py +72 -0
  1028. vllm/transformers_utils/configs/cohere2.py +195 -0
  1029. vllm/transformers_utils/configs/dbrx.py +280 -0
  1030. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1031. vllm/transformers_utils/configs/eagle.py +85 -0
  1032. vllm/transformers_utils/configs/exaone.py +190 -0
  1033. vllm/transformers_utils/configs/falcon.py +90 -0
  1034. vllm/transformers_utils/configs/h2ovl.py +16 -0
  1035. vllm/transformers_utils/configs/internvl.py +54 -0
  1036. vllm/transformers_utils/configs/jais.py +238 -0
  1037. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1038. vllm/transformers_utils/configs/medusa.py +63 -0
  1039. vllm/transformers_utils/configs/minimax_text_01.py +70 -0
  1040. vllm/transformers_utils/configs/minimax_vl_01.py +71 -0
  1041. vllm/transformers_utils/configs/mllama.py +31 -0
  1042. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1043. vllm/transformers_utils/configs/moonvit.py +33 -0
  1044. vllm/transformers_utils/configs/mpt.py +180 -0
  1045. vllm/transformers_utils/configs/nemotron.py +205 -0
  1046. vllm/transformers_utils/configs/nemotron_h.py +258 -0
  1047. vllm/transformers_utils/configs/nvlm_d.py +15 -0
  1048. vllm/transformers_utils/configs/ovis.py +184 -0
  1049. vllm/transformers_utils/configs/skyworkr1v.py +54 -0
  1050. vllm/transformers_utils/configs/solar.py +247 -0
  1051. vllm/transformers_utils/configs/telechat2.py +64 -0
  1052. vllm/transformers_utils/configs/ultravox.py +108 -0
  1053. vllm/transformers_utils/detokenizer.py +168 -0
  1054. vllm/transformers_utils/detokenizer_utils.py +189 -0
  1055. vllm/transformers_utils/processor.py +221 -0
  1056. vllm/transformers_utils/processors/__init__.py +8 -0
  1057. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  1058. vllm/transformers_utils/processors/ovis.py +420 -0
  1059. vllm/transformers_utils/s3_utils.py +162 -0
  1060. vllm/transformers_utils/tokenizer.py +302 -0
  1061. vllm/transformers_utils/tokenizer_base.py +149 -0
  1062. vllm/transformers_utils/tokenizer_group.py +120 -0
  1063. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1064. vllm/transformers_utils/tokenizers/mistral.py +493 -0
  1065. vllm/transformers_utils/utils.py +99 -0
  1066. vllm/triton_utils/__init__.py +14 -0
  1067. vllm/triton_utils/importing.py +50 -0
  1068. vllm/usage/__init__.py +0 -0
  1069. vllm/usage/usage_lib.py +256 -0
  1070. vllm/utils.py +2910 -0
  1071. vllm/v1/__init__.py +0 -0
  1072. vllm/v1/attention/__init__.py +0 -0
  1073. vllm/v1/attention/backends/__init__.py +0 -0
  1074. vllm/v1/attention/backends/cpu_attn.py +163 -0
  1075. vllm/v1/attention/backends/flash_attn.py +869 -0
  1076. vllm/v1/attention/backends/flashinfer.py +651 -0
  1077. vllm/v1/attention/backends/flex_attention.py +477 -0
  1078. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1079. vllm/v1/attention/backends/mla/common.py +931 -0
  1080. vllm/v1/attention/backends/mla/cutlass_mla.py +97 -0
  1081. vllm/v1/attention/backends/mla/flashmla.py +152 -0
  1082. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +220 -0
  1083. vllm/v1/attention/backends/mla/triton_mla.py +120 -0
  1084. vllm/v1/attention/backends/pallas.py +240 -0
  1085. vllm/v1/attention/backends/triton_attn.py +285 -0
  1086. vllm/v1/attention/backends/utils.py +52 -0
  1087. vllm/v1/core/__init__.py +0 -0
  1088. vllm/v1/core/block_pool.py +349 -0
  1089. vllm/v1/core/encoder_cache_manager.py +150 -0
  1090. vllm/v1/core/kv_cache_coordinator.py +363 -0
  1091. vllm/v1/core/kv_cache_manager.py +392 -0
  1092. vllm/v1/core/kv_cache_utils.py +996 -0
  1093. vllm/v1/core/sched/__init__.py +0 -0
  1094. vllm/v1/core/sched/interface.py +150 -0
  1095. vllm/v1/core/sched/output.py +154 -0
  1096. vllm/v1/core/sched/scheduler.py +1044 -0
  1097. vllm/v1/core/sched/utils.py +23 -0
  1098. vllm/v1/core/single_type_kv_cache_manager.py +403 -0
  1099. vllm/v1/engine/__init__.py +173 -0
  1100. vllm/v1/engine/async_llm.py +558 -0
  1101. vllm/v1/engine/coordinator.py +253 -0
  1102. vllm/v1/engine/core.py +961 -0
  1103. vllm/v1/engine/core_client.py +1129 -0
  1104. vllm/v1/engine/detokenizer.py +261 -0
  1105. vllm/v1/engine/exceptions.py +17 -0
  1106. vllm/v1/engine/llm_engine.py +317 -0
  1107. vllm/v1/engine/logprobs.py +199 -0
  1108. vllm/v1/engine/mm_input_cache.py +91 -0
  1109. vllm/v1/engine/output_processor.py +428 -0
  1110. vllm/v1/engine/parallel_sampling.py +133 -0
  1111. vllm/v1/engine/processor.py +407 -0
  1112. vllm/v1/executor/__init__.py +0 -0
  1113. vllm/v1/executor/abstract.py +113 -0
  1114. vllm/v1/executor/multiproc_executor.py +537 -0
  1115. vllm/v1/executor/ray_distributed_executor.py +62 -0
  1116. vllm/v1/kv_cache_interface.py +194 -0
  1117. vllm/v1/metrics/__init__.py +0 -0
  1118. vllm/v1/metrics/loggers.py +523 -0
  1119. vllm/v1/metrics/prometheus.py +82 -0
  1120. vllm/v1/metrics/ray_wrappers.py +131 -0
  1121. vllm/v1/metrics/reader.py +246 -0
  1122. vllm/v1/metrics/stats.py +239 -0
  1123. vllm/v1/outputs.py +116 -0
  1124. vllm/v1/request.py +193 -0
  1125. vllm/v1/sample/__init__.py +0 -0
  1126. vllm/v1/sample/metadata.py +44 -0
  1127. vllm/v1/sample/ops/__init__.py +0 -0
  1128. vllm/v1/sample/ops/bad_words.py +39 -0
  1129. vllm/v1/sample/ops/penalties.py +59 -0
  1130. vllm/v1/sample/ops/topk_topp_sampler.py +293 -0
  1131. vllm/v1/sample/rejection_sampler.py +631 -0
  1132. vllm/v1/sample/sampler.py +286 -0
  1133. vllm/v1/sample/tpu/__init__.py +0 -0
  1134. vllm/v1/sample/tpu/metadata.py +124 -0
  1135. vllm/v1/sample/tpu/sampler.py +145 -0
  1136. vllm/v1/serial_utils.py +315 -0
  1137. vllm/v1/spec_decode/__init__.py +0 -0
  1138. vllm/v1/spec_decode/eagle.py +432 -0
  1139. vllm/v1/spec_decode/medusa.py +62 -0
  1140. vllm/v1/spec_decode/metadata.py +62 -0
  1141. vllm/v1/spec_decode/metrics.py +178 -0
  1142. vllm/v1/spec_decode/ngram_proposer.py +132 -0
  1143. vllm/v1/spec_decode/utils.py +46 -0
  1144. vllm/v1/structured_output/__init__.py +222 -0
  1145. vllm/v1/structured_output/backend_guidance.py +245 -0
  1146. vllm/v1/structured_output/backend_types.py +134 -0
  1147. vllm/v1/structured_output/backend_xgrammar.py +318 -0
  1148. vllm/v1/structured_output/request.py +86 -0
  1149. vllm/v1/structured_output/utils.py +175 -0
  1150. vllm/v1/utils.py +743 -0
  1151. vllm/v1/worker/__init__.py +0 -0
  1152. vllm/v1/worker/block_table.py +142 -0
  1153. vllm/v1/worker/cpu_model_runner.py +86 -0
  1154. vllm/v1/worker/cpu_worker.py +152 -0
  1155. vllm/v1/worker/gpu_input_batch.py +681 -0
  1156. vllm/v1/worker/gpu_model_runner.py +2320 -0
  1157. vllm/v1/worker/gpu_worker.py +393 -0
  1158. vllm/v1/worker/lora_model_runner_mixin.py +173 -0
  1159. vllm/v1/worker/tpu_model_runner.py +1673 -0
  1160. vllm/v1/worker/tpu_worker.py +299 -0
  1161. vllm/v1/worker/utils.py +111 -0
  1162. vllm/v1/worker/worker_base.py +65 -0
  1163. vllm/version.py +41 -0
  1164. vllm/vllm_flash_attn/.gitkeep +0 -0
  1165. vllm/worker/__init__.py +0 -0
  1166. vllm/worker/cache_engine.py +145 -0
  1167. vllm/worker/cpu_enc_dec_model_runner.py +326 -0
  1168. vllm/worker/cpu_model_runner.py +671 -0
  1169. vllm/worker/cpu_pooling_model_runner.py +125 -0
  1170. vllm/worker/cpu_worker.py +450 -0
  1171. vllm/worker/enc_dec_model_runner.py +555 -0
  1172. vllm/worker/hpu_model_runner.py +2320 -0
  1173. vllm/worker/hpu_worker.py +484 -0
  1174. vllm/worker/model_runner.py +2178 -0
  1175. vllm/worker/model_runner_base.py +282 -0
  1176. vllm/worker/multi_step_hpu_worker.py +123 -0
  1177. vllm/worker/multi_step_model_runner.py +911 -0
  1178. vllm/worker/multi_step_neuron_model_runner.py +84 -0
  1179. vllm/worker/multi_step_neuronx_distributed_model_runner.py +63 -0
  1180. vllm/worker/multi_step_tpu_worker.py +108 -0
  1181. vllm/worker/multi_step_worker.py +197 -0
  1182. vllm/worker/neuron_model_runner.py +460 -0
  1183. vllm/worker/neuron_worker.py +193 -0
  1184. vllm/worker/neuronx_distributed_model_runner.py +294 -0
  1185. vllm/worker/pooling_model_runner.py +211 -0
  1186. vllm/worker/tpu_model_runner.py +909 -0
  1187. vllm/worker/tpu_worker.py +337 -0
  1188. vllm/worker/utils.py +53 -0
  1189. vllm/worker/worker.py +577 -0
  1190. vllm/worker/worker_base.py +646 -0
  1191. vllm/worker/xpu_model_runner.py +606 -0
  1192. vllm/worker/xpu_worker.py +186 -0
  1193. vllm_cpu_amxbf16-0.9.1.dist-info/METADATA +305 -0
  1194. vllm_cpu_amxbf16-0.9.1.dist-info/RECORD +1197 -0
  1195. vllm_cpu_amxbf16-0.9.1.dist-info/WHEEL +5 -0
  1196. vllm_cpu_amxbf16-0.9.1.dist-info/entry_points.txt +5 -0
  1197. vllm_cpu_amxbf16-0.9.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1913 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+ # Adapted from
+ # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
+ import json
+ import time
+ from http import HTTPStatus
+ from typing import Annotated, Any, ClassVar, Literal, Optional, Union
+
+ import regex as re
+ import torch
+ from fastapi import HTTPException, UploadFile
+ from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
+                       ValidationInfo, field_validator, model_validator)
+ from typing_extensions import TypeAlias
+
+ from vllm import envs
+ from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
+                                          random_tool_call_id)
+ from vllm.logger import init_logger
+ from vllm.pooling_params import PoolingParams
+ from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
+                                   RequestOutputKind, SamplingParams)
+ from vllm.sequence import Logprob
+ from vllm.utils import random_uuid, resolve_obj_by_qualname
+
+ logger = init_logger(__name__)
+
+ _LONG_INFO = torch.iinfo(torch.long)
+
+
+ class OpenAIBaseModel(BaseModel):
+     # OpenAI API does allow extra fields
+     model_config = ConfigDict(extra="allow")
+
+     # Cache class field names
+     field_names: ClassVar[Optional[set[str]]] = None
+
+     @model_validator(mode="wrap")
+     @classmethod
+     def __log_extra_fields__(cls, data, handler):
+         result = handler(data)
+         if not isinstance(data, dict):
+             return result
+         field_names = cls.field_names
+         if field_names is None:
+             # Get all class field names and their potential aliases
+             field_names = set()
+             for field_name, field in cls.model_fields.items():
+                 field_names.add(field_name)
+                 if alias := getattr(field, "alias", None):
+                     field_names.add(alias)
+             cls.field_names = field_names
+
+         # Compare against both field names and aliases
+         if any(k not in field_names for k in data):
+             logger.warning(
+                 "The following fields were present in the request "
+                 "but ignored: %s",
+                 data.keys() - field_names,
+             )
+         return result
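
The wrap-mode validator above accepts unknown request fields (extra="allow") but logs which keys were ignored. A minimal sketch of that behavior, using a made-up subclass (`Demo` is not part of the package):

    # Hypothetical subclass, for illustration only.
    class Demo(OpenAIBaseModel):
        known: int = 0

    # "surprise" is neither a declared field nor an alias: validation still
    # succeeds, but __log_extra_fields__ emits a warning naming the ignored key.
    req = Demo.model_validate({"known": 1, "surprise": 2})
    assert req.known == 1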
+
+
+ class ErrorResponse(OpenAIBaseModel):
+     object: str = "error"
+     message: str
+     type: str
+     param: Optional[str] = None
+     code: int
+
+
+ class ModelPermission(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
+     object: str = "model_permission"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     allow_create_engine: bool = False
+     allow_sampling: bool = True
+     allow_logprobs: bool = True
+     allow_search_indices: bool = False
+     allow_view: bool = True
+     allow_fine_tuning: bool = False
+     organization: str = "*"
+     group: Optional[str] = None
+     is_blocking: bool = False
+
+
+ class ModelCard(OpenAIBaseModel):
+     id: str
+     object: str = "model"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     owned_by: str = "vllm"
+     root: Optional[str] = None
+     parent: Optional[str] = None
+     max_model_len: Optional[int] = None
+     permission: list[ModelPermission] = Field(default_factory=list)
+
+
+ class ModelList(OpenAIBaseModel):
+     object: str = "list"
+     data: list[ModelCard] = Field(default_factory=list)
+
+
+ class PromptTokenUsageInfo(OpenAIBaseModel):
+     cached_tokens: Optional[int] = None
+
+
+ class UsageInfo(OpenAIBaseModel):
+     prompt_tokens: int = 0
+     total_tokens: int = 0
+     completion_tokens: Optional[int] = 0
+     prompt_tokens_details: Optional[PromptTokenUsageInfo] = None
+
+
+ class RequestResponseMetadata(BaseModel):
+     request_id: str
+     final_usage_info: Optional[UsageInfo] = None
+
+
+ class JsonSchemaResponseFormat(OpenAIBaseModel):
+     name: str
+     description: Optional[str] = None
+     # schema is the field in openai but that causes conflicts with pydantic so
+     # instead use json_schema with an alias
+     json_schema: Optional[dict[str, Any]] = Field(default=None, alias='schema')
+     strict: Optional[bool] = None
+
+
+ class StructuralTag(OpenAIBaseModel):
+     begin: str
+     # schema is the field, but that causes conflicts with pydantic so
+     # instead use structural_tag_schema with an alias
+     structural_tag_schema: Optional[dict[str, Any]] = Field(default=None,
+                                                             alias="schema")
+     end: str
+
+
+ class StructuralTagResponseFormat(OpenAIBaseModel):
+     type: Literal["structural_tag"]
+     structures: list[StructuralTag]
+     triggers: list[str]
+
+
+ class ResponseFormat(OpenAIBaseModel):
+     # type must be "json_schema", "json_object", or "text"
+     type: Literal["text", "json_object", "json_schema"]
+     json_schema: Optional[JsonSchemaResponseFormat] = None
+
+
+ AnyResponseFormat = Union[ResponseFormat, StructuralTagResponseFormat]
+
+
+ class StreamOptions(OpenAIBaseModel):
+     include_usage: Optional[bool] = True
+     continuous_usage_stats: Optional[bool] = False
+
+
+ class FunctionDefinition(OpenAIBaseModel):
+     name: str
+     description: Optional[str] = None
+     parameters: Optional[dict[str, Any]] = None
+
+
+ class ChatCompletionToolsParam(OpenAIBaseModel):
+     type: Literal["function"] = "function"
+     function: FunctionDefinition
+
+
+ class ChatCompletionNamedFunction(OpenAIBaseModel):
+     name: str
+
+
+ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
+     function: ChatCompletionNamedFunction
+     type: Literal["function"] = "function"
+
+
+ # extra="forbid" is a workaround to have kwargs as a field,
+ # see https://github.com/pydantic/pydantic/issues/3125
+ class LogitsProcessorConstructor(BaseModel):
+     qualname: str
+     args: Optional[list[Any]] = None
+     kwargs: Optional[dict[str, Any]] = None
+
+     model_config = ConfigDict(extra="forbid")
+
+
+ LogitsProcessors = list[Union[str, LogitsProcessorConstructor]]
+
+
+ def get_logits_processors(processors: Optional[LogitsProcessors],
+                           pattern: Optional[str]) -> Optional[list[Any]]:
+     if processors and pattern:
+         logits_processors = []
+         for processor in processors:
+             qualname = processor if isinstance(processor,
+                                                str) else processor.qualname
+             if not re.match(pattern, qualname):
+                 raise ValueError(
+                     f"Logits processor '{qualname}' is not allowed by this "
+                     "server. See --logits-processor-pattern engine argument "
+                     "for more information.")
+             try:
+                 logits_processor = resolve_obj_by_qualname(qualname)
+             except Exception as e:
+                 raise ValueError(
+                     f"Logits processor '{qualname}' could not be resolved: {e}"
+                 ) from e
+             if isinstance(processor, LogitsProcessorConstructor):
+                 logits_processor = logits_processor(*processor.args or [],
+                                                     **processor.kwargs or {})
+             logits_processors.append(logits_processor)
+         return logits_processors
+     elif processors:
+         raise ValueError(
+             "The `logits_processors` argument is not supported by this "
+             "server. See --logits-processor-pattern engine argument "
+             "for more information.")
+     return None
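
As a rough sketch (not part of the wheel's contents), this is how a request payload could exercise the two accepted forms above, a plain qualified name or a constructor object; the module and class names are invented, mirroring the example given in the `logits_processors` field description further down:

    # Hypothetical request body; "my_module.MyLogitsProcessor" is a made-up name.
    payload = {
        "model": "example-model",
        "messages": [{"role": "user", "content": "Hello"}],
        "logits_processors": [
            # Form 1: a plain qualified name, resolved via resolve_obj_by_qualname.
            "my_module.MyLogitsProcessor",
            # Form 2: a LogitsProcessorConstructor object with args/kwargs.
            {"qualname": "my_module.MyLogitsProcessor",
             "args": [1, 2],
             "kwargs": {"param": "value"}},
        ],
    }

Either form is only honored when the server was started with a matching --logits-processor-pattern; otherwise the `elif processors:` branch above rejects the request.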
+
+
+ class ChatCompletionRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/chat/create
+     messages: list[ChatCompletionMessageParam]
+     model: Optional[str] = None
+     frequency_penalty: Optional[float] = 0.0
+     logit_bias: Optional[dict[str, float]] = None
+     logprobs: Optional[bool] = False
+     top_logprobs: Optional[int] = 0
+     # TODO(#9845): remove max_tokens when field is removed from OpenAI API
+     max_tokens: Optional[int] = Field(
+         default=None,
+         deprecated=
+         'max_tokens is deprecated in favor of the max_completion_tokens field')
+     max_completion_tokens: Optional[int] = None
+     n: Optional[int] = 1
+     presence_penalty: Optional[float] = 0.0
+     response_format: Optional[AnyResponseFormat] = None
+     seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+     stop: Optional[Union[str, list[str]]] = []
+     stream: Optional[bool] = False
+     stream_options: Optional[StreamOptions] = None
+     temperature: Optional[float] = None
+     top_p: Optional[float] = None
+     tools: Optional[list[ChatCompletionToolsParam]] = None
+     tool_choice: Optional[Union[
+         Literal["none"],
+         Literal["auto"],
+         Literal["required"],
+         ChatCompletionNamedToolChoiceParam,
+     ]] = "none"
+
+     # NOTE this will be ignored by vLLM -- the model determines the behavior
+     parallel_tool_calls: Optional[bool] = False
+     user: Optional[str] = None
+
+     # --8<-- [start:chat-completion-sampling-params]
+     best_of: Optional[int] = None
+     use_beam_search: bool = False
+     top_k: Optional[int] = None
+     min_p: Optional[float] = None
+     repetition_penalty: Optional[float] = None
+     length_penalty: float = 1.0
+     stop_token_ids: Optional[list[int]] = []
+     include_stop_str_in_output: bool = False
+     ignore_eos: bool = False
+     min_tokens: int = 0
+     skip_special_tokens: bool = True
+     spaces_between_special_tokens: bool = True
+     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
+     prompt_logprobs: Optional[int] = None
+     allowed_token_ids: Optional[list[int]] = None
+     # --8<-- [end:chat-completion-sampling-params]
+
+     # --8<-- [start:chat-completion-extra-params]
+     echo: bool = Field(
+         default=False,
+         description=(
+             "If true, the new message will be prepended with the last message "
+             "if they belong to the same role."),
+     )
+     add_generation_prompt: bool = Field(
+         default=True,
+         description=
+         ("If true, the generation prompt will be added to the chat template. "
+          "This is a parameter used by chat template in tokenizer config of the "
+          "model."),
+     )
+     continue_final_message: bool = Field(
+         default=False,
+         description=
+         ("If this is set, the chat will be formatted so that the final "
+          "message in the chat is open-ended, without any EOS tokens. The "
+          "model will continue this message rather than starting a new one. "
+          "This allows you to \"prefill\" part of the model's response for it. "
+          "Cannot be used at the same time as `add_generation_prompt`."),
+     )
+     add_special_tokens: bool = Field(
+         default=False,
+         description=(
+             "If true, special tokens (e.g. BOS) will be added to the prompt "
+             "on top of what is added by the chat template. "
+             "For most models, the chat template takes care of adding the "
+             "special tokens so this should be set to false (as is the "
+             "default)."),
+     )
+     documents: Optional[list[dict[str, str]]] = Field(
+         default=None,
+         description=
+         ("A list of dicts representing documents that will be accessible to "
+          "the model if it is performing RAG (retrieval-augmented generation)."
+          " If the template does not support RAG, this argument will have no "
+          "effect. We recommend that each document should be a dict containing "
+          "\"title\" and \"text\" keys."),
+     )
+     chat_template: Optional[str] = Field(
+         default=None,
+         description=(
+             "A Jinja template to use for this conversion. "
+             "As of transformers v4.44, default chat template is no longer "
+             "allowed, so you must provide a chat template if the tokenizer "
+             "does not define one."),
+     )
+     chat_template_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=("Additional kwargs to pass to the template renderer. "
+                      "Will be accessible by the chat template."),
+     )
+     mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     guided_json: Optional[Union[str, dict, BaseModel]] = Field(
+         default=None,
+         description=("If specified, the output will follow the JSON schema."),
+     )
+     guided_regex: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the output will follow the regex pattern."),
+     )
+     guided_choice: Optional[list[str]] = Field(
+         default=None,
+         description=(
+             "If specified, the output will be exactly one of the choices."),
+     )
+     guided_grammar: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the output will follow the context free grammar."),
+     )
+     structural_tag: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the output will follow the structural tag schema."),
+     )
+     guided_decoding_backend: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, will override the default guided decoding backend "
+             "of the server for this specific request. If set, must be either "
+             "'outlines' / 'lm-format-enforcer'"),
+     )
+     guided_whitespace_pattern: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, will override the default whitespace pattern "
+             "for guided json decoding."),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+     request_id: str = Field(
+         default_factory=lambda: f"{random_uuid()}",
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the response."),
+     )
+     logits_processors: Optional[LogitsProcessors] = Field(
+         default=None,
+         description=(
+             "A list of either qualified names of logits processors, or "
+             "constructor objects, to apply when sampling. A constructor is "
+             "a JSON object with a required 'qualname' field specifying the "
+             "qualified name of the processor class/factory, and optional "
+             "'args' and 'kwargs' fields containing positional and keyword "
+             "arguments. For example: {'qualname': "
+             "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
+             "{'param': 'value'}}."))
+     return_tokens_as_token_ids: Optional[bool] = Field(
+         default=None,
+         description=(
+             "If specified with 'logprobs', tokens are represented "
+             " as strings of the form 'token_id:{token_id}' so that tokens "
+             "that are not JSON-encodable can be identified."))
+     cache_salt: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the prefix cache will be salted with the provided "
+             "string to prevent an attacker from guessing prompts in multi-user "
+             "environments. The salt should be random, protected from "
+             "access by 3rd parties, and long enough to be "
+             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+             "to 256 bit). Not supported by vLLM engine V0."))
+     kv_transfer_params: Optional[dict[str, Any]] = Field(
+         default=None,
+         description="KVTransfer parameters used for disaggregated serving.")
+
+     # --8<-- [end:chat-completion-extra-params]
+
+     # Default sampling parameters for chat completion requests
+     _DEFAULT_SAMPLING_PARAMS: dict = {
+         "repetition_penalty": 1.0,
+         "temperature": 1.0,
+         "top_p": 1.0,
+         "top_k": 0,
+         "min_p": 0.0,
+     }
+
+     def to_beam_search_params(
+             self,
+             default_max_tokens: int,
+             default_sampling_params: Optional[dict] = None
+     ) -> BeamSearchParams:
+         # TODO(#9845): remove max_tokens when field is removed from OpenAI API
+         max_tokens = self.max_completion_tokens or self.max_tokens
+
+         if default_sampling_params is None:
+             default_sampling_params = {}
+         n = self.n if self.n is not None else 1
+
+         # Use minimum of context window, user request & server limit.
+         max_tokens = min(
+             val for val in (default_max_tokens, max_tokens,
+                             default_sampling_params.get("max_tokens", None))
+             if val is not None)
+
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+
+         return BeamSearchParams(
+             beam_width=n,
+             max_tokens=max_tokens,
+             ignore_eos=self.ignore_eos,
+             temperature=temperature,
+             length_penalty=self.length_penalty,
+             include_stop_str_in_output=self.include_stop_str_in_output,
+         )
+
458
+ def to_sampling_params(
459
+ self,
460
+ default_max_tokens: int,
461
+ logits_processor_pattern: Optional[str],
462
+ default_sampling_params: Optional[dict] = None,
463
+ ) -> SamplingParams:
464
+ # TODO(#9845): remove max_tokens when field is removed from OpenAI API
465
+ max_tokens = self.max_completion_tokens or self.max_tokens
466
+
467
+ if default_sampling_params is None:
468
+ default_sampling_params = {}
469
+
470
+ # Use minimum of context window, user request & server limit.
471
+ max_tokens = min(
472
+ val for val in (default_max_tokens, max_tokens,
473
+ default_sampling_params.get("max_tokens", None))
474
+ if val is not None)
475
+
476
+ # Default parameters
477
+ if (repetition_penalty := self.repetition_penalty) is None:
478
+ repetition_penalty = default_sampling_params.get(
479
+ "repetition_penalty",
480
+ self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
481
+ )
482
+ if (temperature := self.temperature) is None:
483
+ temperature = default_sampling_params.get(
484
+ "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
485
+ if (top_p := self.top_p) is None:
486
+ top_p = default_sampling_params.get(
487
+ "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
488
+ if (top_k := self.top_k) is None:
489
+ top_k = default_sampling_params.get(
490
+ "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
491
+ if (min_p := self.min_p) is None:
492
+ min_p = default_sampling_params.get(
493
+ "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
494
+
495
+ prompt_logprobs = self.prompt_logprobs
496
+ if prompt_logprobs is None and self.echo:
497
+ prompt_logprobs = self.top_logprobs
498
+
499
+ guided_json_object = None
500
+ if self.response_format is not None:
501
+ if self.response_format.type == "json_object":
502
+ guided_json_object = True
503
+ elif self.response_format.type == "json_schema":
504
+ json_schema = self.response_format.json_schema
505
+ assert json_schema is not None
506
+ self.guided_json = json_schema.json_schema
507
+ elif self.response_format.type == "structural_tag":
508
+ structural_tag = self.response_format
509
+ assert structural_tag is not None and isinstance(
510
+ structural_tag, StructuralTagResponseFormat)
511
+ s_tag_obj = structural_tag.model_dump(by_alias=True)
512
+ self.structural_tag = json.dumps(s_tag_obj)
513
+
514
+ guided_decoding = GuidedDecodingParams.from_optional(
515
+ json=self._get_guided_json_from_tool() or self.guided_json,
516
+ regex=self.guided_regex,
517
+ choice=self.guided_choice,
518
+ grammar=self.guided_grammar,
519
+ json_object=guided_json_object,
520
+ backend=self.guided_decoding_backend,
521
+ whitespace_pattern=self.guided_whitespace_pattern,
522
+ structural_tag=self.structural_tag,
523
+ )
524
+
525
+ return SamplingParams.from_optional(
526
+ n=self.n,
527
+ best_of=self.best_of,
528
+ presence_penalty=self.presence_penalty,
529
+ frequency_penalty=self.frequency_penalty,
530
+ repetition_penalty=repetition_penalty,
531
+ temperature=temperature,
532
+ top_p=top_p,
533
+ top_k=top_k,
534
+ min_p=min_p,
535
+ seed=self.seed,
536
+ stop=self.stop,
537
+ stop_token_ids=self.stop_token_ids,
538
+ logprobs=self.top_logprobs if self.logprobs else None,
539
+ prompt_logprobs=prompt_logprobs,
540
+ ignore_eos=self.ignore_eos,
541
+ max_tokens=max_tokens,
542
+ min_tokens=self.min_tokens,
543
+ skip_special_tokens=self.skip_special_tokens,
544
+ spaces_between_special_tokens=self.spaces_between_special_tokens,
545
+ logits_processors=get_logits_processors(self.logits_processors,
546
+ logits_processor_pattern),
547
+ include_stop_str_in_output=self.include_stop_str_in_output,
548
+ truncate_prompt_tokens=self.truncate_prompt_tokens,
549
+ output_kind=RequestOutputKind.DELTA if self.stream \
550
+ else RequestOutputKind.FINAL_ONLY,
551
+ guided_decoding=guided_decoding,
552
+ logit_bias=self.logit_bias,
553
+ allowed_token_ids=self.allowed_token_ids,
554
+ extra_args=({"kv_transfer_params": self.kv_transfer_params}
555
+ if self.kv_transfer_params else None))
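
The fallback chain above resolves each sampling parameter from three layers: an explicit value on the request wins, then the server-supplied default_sampling_params mapping, then the class-level _DEFAULT_SAMPLING_PARAMS. A standalone sketch of the same resolution order (the helper name is illustrative, not part of this module):

    _CLASS_DEFAULTS = {"temperature": 1.0, "top_p": 1.0, "top_k": 0}

    def resolve(name, request_value, server_defaults):
        # Request value first, then server default, then class default.
        if request_value is not None:
            return request_value
        return server_defaults.get(name, _CLASS_DEFAULTS[name])

    # Unset on the request, so the server default applies:
    assert resolve("temperature", None, {"temperature": 0.7}) == 0.7
    # Set on the request, so it wins over the server default:
    assert resolve("top_p", 0.9, {"top_p": 0.5}) == 0.9
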
+
+    def _get_guided_json_from_tool(
+            self) -> Optional[Union[str, dict, BaseModel]]:
+        # user has chosen to not use any tool
+        if self.tool_choice == "none" or self.tools is None:
+            return None
+
+        # user has chosen to use a named tool
+        if type(self.tool_choice) is ChatCompletionNamedToolChoiceParam:
+            tool_name = self.tool_choice.function.name
+            tools = {tool.function.name: tool.function for tool in self.tools}
+            if tool_name not in tools:
+                raise ValueError(
+                    f"Tool '{tool_name}' has not been passed in `tools`.")
+            tool = tools[tool_name]
+            return tool.parameters
+
+        if self.tool_choice == "required":
+            # Pydantic schema generation cannot be used since the JSON schema
+            # has to be constructed for a specific instantiation of a tool list
+            # so that parameters of a function are correctly generated
+            # based on the chosen function name
+            def get_tool_schema(tool: ChatCompletionToolsParam) -> dict:
+                return {
+                    "properties": {
+                        "name": {
+                            "type": "string",
+                            "enum": [tool.function.name]
+                        },
+                        # parameters are always generated as '{}' in the final
+                        # output if they are missing from the request
+                        # (i.e. are None or '{}') so the schema is
+                        # updated to produce an empty object in that case
+                        "parameters": tool.function.parameters
+                        if tool.function.parameters else {
+                            "type": "object",
+                            "properties": {}
+                        }
+                    },
+                    "required": ["name", "parameters"]
+                }
+
+            json_schema = {
+                "type": "array",
+                "minItems": 1,
+                "items": {
+                    "type": "object",
+                    "anyOf": [get_tool_schema(tool) for tool in self.tools]
+                }
+            }
+            return json_schema
+
+        return None
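
For tool_choice="required" with a single hypothetical get_weather tool whose parameters schema declares one string field, the construction above yields roughly the following schema, which forces the model to emit a non-empty JSON array of named calls:

    json_schema = {
        "type": "array",
        "minItems": 1,
        "items": {
            "type": "object",
            "anyOf": [{
                "properties": {
                    "name": {"type": "string", "enum": ["get_weather"]},
                    "parameters": {
                        "type": "object",
                        "properties": {"city": {"type": "string"}},
                    },
                },
                "required": ["name", "parameters"],
            }],
        },
    }
    # A conforming output would be:
    # [{"name": "get_weather", "parameters": {"city": "Oslo"}}]
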
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_stream_options(cls, data):
+        if data.get("stream_options") and not data.get("stream"):
+            raise ValueError(
+                "Stream options can only be defined when `stream=True`.")
+
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_logprobs(cls, data):
+        if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
+            if data.get("stream") and prompt_logprobs > 0:
+                raise ValueError(
+                    "`prompt_logprobs` are not available when `stream=True`.")
+
+            if prompt_logprobs < 0:
+                raise ValueError(
+                    "`prompt_logprobs` must be a non-negative value.")
+
+        if (top_logprobs := data.get("top_logprobs")) is not None:
+            if top_logprobs < 0:
+                raise ValueError(
+                    "`top_logprobs` must be a non-negative value.")
+
+            if top_logprobs > 0 and not data.get("logprobs"):
+                raise ValueError(
+                    "when using `top_logprobs`, `logprobs` must be set to true."
+                )
+
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_guided_decoding_count(cls, data):
+        if isinstance(data, ValueError):
+            raise data
+
+        guide_count = sum([
+            "guided_json" in data and data["guided_json"] is not None,
+            "guided_regex" in data and data["guided_regex"] is not None,
+            "guided_choice" in data and data["guided_choice"] is not None
+        ])
+        # you can only use one kind of guided decoding
+        if guide_count > 1:
+            raise ValueError(
+                "You can only use one kind of guided decoding "
+                "('guided_json', 'guided_regex' or 'guided_choice').")
+        # you can only either use guided decoding or tools, not both
+        if guide_count > 1 and data.get("tool_choice", "none") not in (
+                "none",
+                "auto",
+                "required",
+        ):
+            raise ValueError(
+                "You can only either use guided decoding or tools, not both.")
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_tool_usage(cls, data):
+
+        # if "tool_choice" is not specified but tools are provided,
+        # default to "auto" tool_choice
+        if "tool_choice" not in data and data.get("tools"):
+            data["tool_choice"] = "auto"
+
+        # if "tool_choice" is "none" -- ignore tools if present
+        if "tool_choice" in data and data["tool_choice"] == "none":
+            # ensure that no tools are present
+            data.pop("tools", None)
+            return data
+
+        # if "tool_choice" is specified -- validation
+        if "tool_choice" in data:
+
+            # ensure that if "tool_choice" is specified, tools are present
+            if "tools" not in data or data["tools"] is None:
+                raise ValueError(
+                    "When using `tool_choice`, `tools` must be set.")
+
+            # make sure that tool choice is either a named tool
+            # OR that it's set to "auto" or "required"
+            if data["tool_choice"] not in [
+                    "auto", "required"
+            ] and not isinstance(data["tool_choice"], dict):
+                raise NotImplementedError(
+                    f'Invalid value for `tool_choice`: {data["tool_choice"]}! '\
+                    'Only named tools, "none", "auto" or "required" '\
+                    'are supported.'
+                )
+
+            # ensure that if "tool_choice" is specified as an object,
+            # it matches a valid tool
+            if isinstance(data["tool_choice"], dict):
+                valid_tool = False
+                specified_function = data["tool_choice"].get("function")
+                if not specified_function:
+                    raise ValueError(
+                        "Expected field `function` in `tool_choice`."
+                        " Correct usage: `{\"type\": \"function\","
+                        " \"function\": {\"name\": \"my_function\"}}`")
+                specified_function_name = specified_function.get("name")
+                if not specified_function_name:
+                    raise ValueError(
+                        "Expected field `name` in `function` in `tool_choice`."
+                        " Correct usage: `{\"type\": \"function\", "
+                        "\"function\": {\"name\": \"my_function\"}}`")
+                for tool in data["tools"]:
+                    if tool["function"]["name"] == specified_function_name:
+                        valid_tool = True
+                        break
+                if not valid_tool:
+                    raise ValueError(
+                        "The tool specified in `tool_choice` does not match any"
+                        " of the specified `tools`")
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_generation_prompt(cls, data):
+        if data.get("continue_final_message") and data.get(
+                "add_generation_prompt"):
+            raise ValueError("Cannot set both `continue_final_message` and "
+                             "`add_generation_prompt` to True.")
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_cache_salt_support(cls, data):
+        if data.get("cache_salt") is not None:
+            if not envs.VLLM_USE_V1:
+                raise ValueError(
+                    "Parameter 'cache_salt' is not supported with "
+                    "this instance of vLLM, which uses engine V0.")
+            if not isinstance(data["cache_salt"],
+                              str) or not data["cache_salt"]:
+                raise ValueError("Parameter 'cache_salt' must be a "
+                                 "non-empty string if provided.")
+        return data
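
Putting the extra parameters to use, a minimal client-side sketch (assuming a vLLM OpenAI-compatible server on http://localhost:8000 and a served model named my-model; both names are placeholders):

    import requests

    resp = requests.post(
        "http://localhost:8000/v1/chat/completions",
        json={
            "model": "my-model",
            "messages": [{"role": "user", "content": "Red, green or blue?"}],
            # Non-OpenAI extension defined by ChatCompletionRequest above:
            "guided_choice": ["red", "green", "blue"],
        },
    )
    print(resp.json()["choices"][0]["message"]["content"])
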
+
+
+class CompletionRequest(OpenAIBaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/completions/create
+    model: Optional[str] = None
+    prompt: Optional[Union[list[int], list[list[int]], str, list[str]]] = None
+    prompt_embeds: Optional[Union[bytes, list[bytes]]] = None
+    best_of: Optional[int] = None
+    echo: Optional[bool] = False
+    frequency_penalty: Optional[float] = 0.0
+    logit_bias: Optional[dict[str, float]] = None
+    logprobs: Optional[int] = None
+    max_tokens: Optional[int] = 16
+    n: int = 1
+    presence_penalty: Optional[float] = 0.0
+    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    stop: Optional[Union[str, list[str]]] = []
+    stream: Optional[bool] = False
+    stream_options: Optional[StreamOptions] = None
+    suffix: Optional[str] = None
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
+    user: Optional[str] = None
+
+    # --8<-- [start:completion-sampling-params]
+    use_beam_search: bool = False
+    top_k: Optional[int] = None
+    min_p: Optional[float] = None
+    repetition_penalty: Optional[float] = None
+    length_penalty: float = 1.0
+    stop_token_ids: Optional[list[int]] = []
+    include_stop_str_in_output: bool = False
+    ignore_eos: bool = False
+    min_tokens: int = 0
+    skip_special_tokens: bool = True
+    spaces_between_special_tokens: bool = True
+    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
+    allowed_token_ids: Optional[list[int]] = None
+    prompt_logprobs: Optional[int] = None
+    # --8<-- [end:completion-sampling-params]
+
+    # --8<-- [start:completion-extra-params]
+    add_special_tokens: bool = Field(
+        default=True,
+        description=(
+            "If true (the default), special tokens (e.g. BOS) will be added to "
+            "the prompt."),
+    )
+    response_format: Optional[AnyResponseFormat] = Field(
+        default=None,
+        description=(
+            "Similar to chat completion, this parameter specifies the format "
+            "of output. Only {'type': 'json_object'}, {'type': 'json_schema'}"
+            ", {'type': 'structural_tag'}, or {'type': 'text'} is supported."
+        ),
+    )
+    guided_json: Optional[Union[str, dict, BaseModel]] = Field(
+        default=None,
+        description="If specified, the output will follow the JSON schema.",
+    )
+    guided_regex: Optional[str] = Field(
+        default=None,
+        description=(
+            "If specified, the output will follow the regex pattern."),
+    )
+    guided_choice: Optional[list[str]] = Field(
+        default=None,
+        description=(
+            "If specified, the output will be exactly one of the choices."),
+    )
+    guided_grammar: Optional[str] = Field(
+        default=None,
+        description=(
+            "If specified, the output will follow the context-free grammar."),
+    )
+    guided_decoding_backend: Optional[str] = Field(
+        default=None,
+        description=(
+            "If specified, will override the default guided decoding backend "
+            "of the server for this specific request. If set, must be one of "
+            "'outlines' / 'lm-format-enforcer'"),
+    )
+    guided_whitespace_pattern: Optional[str] = Field(
+        default=None,
+        description=(
+            "If specified, will override the default whitespace pattern "
+            "for guided JSON decoding."),
+    )
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."),
+    )
+    logits_processors: Optional[LogitsProcessors] = Field(
+        default=None,
+        description=(
+            "A list of either qualified names of logits processors, or "
+            "constructor objects, to apply when sampling. A constructor is "
+            "a JSON object with a required 'qualname' field specifying the "
+            "qualified name of the processor class/factory, and optional "
+            "'args' and 'kwargs' fields containing positional and keyword "
+            "arguments. For example: {'qualname': "
+            "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
+            "{'param': 'value'}}."))
+
+    return_tokens_as_token_ids: Optional[bool] = Field(
+        default=None,
+        description=(
+            "If specified with 'logprobs', tokens are represented "
+            "as strings of the form 'token_id:{token_id}' so that tokens "
+            "that are not JSON-encodable can be identified."))
+
+    kv_transfer_params: Optional[dict[str, Any]] = Field(
+        default=None,
+        description="KVTransfer parameters used for disaggregated serving.")
+
+    # --8<-- [end:completion-extra-params]
+
+    # Default sampling parameters for completion requests
+    _DEFAULT_SAMPLING_PARAMS: dict = {
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": 0,
+        "min_p": 0.0,
+    }
+
+    def to_beam_search_params(
+        self,
+        default_max_tokens: int,
+        default_sampling_params: Optional[dict] = None
+    ) -> BeamSearchParams:
+        max_tokens = self.max_tokens
+
+        if default_sampling_params is None:
+            default_sampling_params = {}
+        n = self.n if self.n is not None else 1
+
+        # Use minimum of context window, user request & server limit.
+        max_tokens = min(
+            val for val in (default_max_tokens, max_tokens,
+                            default_sampling_params.get("max_tokens", None))
+            if val is not None)
+
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get("temperature", 1.0)
+
+        return BeamSearchParams(
+            beam_width=n,
+            max_tokens=max_tokens,
+            ignore_eos=self.ignore_eos,
+            temperature=temperature,
+            length_penalty=self.length_penalty,
+            include_stop_str_in_output=self.include_stop_str_in_output,
+        )
+
+    def to_sampling_params(
+        self,
+        default_max_tokens: int,
+        logits_processor_pattern: Optional[str],
+        default_sampling_params: Optional[dict] = None,
+    ) -> SamplingParams:
+        max_tokens = self.max_tokens
+
+        if default_sampling_params is None:
+            default_sampling_params = {}
+
+        # Use minimum of context window, user request & server limit.
+        max_tokens = min(
+            val for val in (default_max_tokens, max_tokens,
+                            default_sampling_params.get("max_tokens", None))
+            if val is not None)
+
+        # Default parameters
+        if (repetition_penalty := self.repetition_penalty) is None:
+            repetition_penalty = default_sampling_params.get(
+                "repetition_penalty",
+                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+            )
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+        if (top_p := self.top_p) is None:
+            top_p = default_sampling_params.get(
+                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+        if (top_k := self.top_k) is None:
+            top_k = default_sampling_params.get(
+                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+        if (min_p := self.min_p) is None:
+            min_p = default_sampling_params.get(
+                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+
+        prompt_logprobs = self.prompt_logprobs
+        if prompt_logprobs is None and self.echo:
+            prompt_logprobs = self.logprobs
+
+        echo_without_generation = self.echo and self.max_tokens == 0
+
+        guided_json_object = None
+        if (self.response_format is not None
+                and self.response_format.type == "json_object"):
+            guided_json_object = True
+
+        guided_decoding = GuidedDecodingParams.from_optional(
+            json=self.guided_json,
+            regex=self.guided_regex,
+            choice=self.guided_choice,
+            grammar=self.guided_grammar,
+            json_object=guided_json_object,
+            backend=self.guided_decoding_backend,
+            whitespace_pattern=self.guided_whitespace_pattern,
+        )
+
+        return SamplingParams.from_optional(
+            n=self.n,
+            best_of=self.best_of,
+            presence_penalty=self.presence_penalty,
+            frequency_penalty=self.frequency_penalty,
+            repetition_penalty=repetition_penalty,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            seed=self.seed,
+            stop=self.stop,
+            stop_token_ids=self.stop_token_ids,
+            logprobs=self.logprobs,
+            ignore_eos=self.ignore_eos,
+            max_tokens=max_tokens if not echo_without_generation else 1,
+            min_tokens=self.min_tokens,
+            prompt_logprobs=prompt_logprobs,
+            skip_special_tokens=self.skip_special_tokens,
+            spaces_between_special_tokens=self.spaces_between_special_tokens,
+            include_stop_str_in_output=self.include_stop_str_in_output,
+            logits_processors=get_logits_processors(self.logits_processors,
+                                                    logits_processor_pattern),
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            output_kind=RequestOutputKind.DELTA if self.stream \
+            else RequestOutputKind.FINAL_ONLY,
+            guided_decoding=guided_decoding,
+            logit_bias=self.logit_bias,
+            allowed_token_ids=self.allowed_token_ids,
+            extra_args=({"kv_transfer_params": self.kv_transfer_params}
+                        if self.kv_transfer_params else None))
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_guided_decoding_count(cls, data):
+        guide_count = sum([
+            "guided_json" in data and data["guided_json"] is not None,
+            "guided_regex" in data and data["guided_regex"] is not None,
+            "guided_choice" in data and data["guided_choice"] is not None
+        ])
+        if guide_count > 1:
+            raise ValueError(
+                "You can only use one kind of guided decoding "
+                "('guided_json', 'guided_regex' or 'guided_choice').")
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_logprobs(cls, data):
+        if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
+            if data.get("stream") and prompt_logprobs > 0:
+                raise ValueError(
+                    "`prompt_logprobs` are not available when `stream=True`.")
+
+            if prompt_logprobs < 0:
+                raise ValueError(
+                    "`prompt_logprobs` must be a non-negative value.")
+
+        if (logprobs := data.get("logprobs")) is not None and logprobs < 0:
+            raise ValueError("`logprobs` must be a non-negative value.")
+
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_stream_options(cls, data):
+        if data.get("stream_options") and not data.get("stream"):
+            raise ValueError(
+                "Stream options can only be defined when `stream=True`.")
+
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_prompt_and_prompt_embeds(cls, data):
+        if data.get("prompt") is None and data.get("prompt_embeds") is None:
+            raise ValueError(
+                "At least one of `prompt` or `prompt_embeds` must be set.")
+        return data
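
The legacy completions endpoint takes the same style of payload; note the OpenAI-inherited default of max_tokens=16 above, which usually needs raising. A sketch under the same placeholder server and model assumptions:

    import requests

    resp = requests.post(
        "http://localhost:8000/v1/completions",
        json={
            "model": "my-model",
            "prompt": "The capital of France is",
            "max_tokens": 32,  # the schema default is only 16
            "stop": ["\n"],
            "logprobs": 1,  # an int here, unlike the chat endpoint's bool
        },
    )
    print(resp.json()["choices"][0]["text"])
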
+
+
+class EmbeddingCompletionRequest(OpenAIBaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/embeddings
+    model: Optional[str] = None
+    input: Union[list[int], list[list[int]], str, list[str]]
+    encoding_format: Literal["float", "base64"] = "float"
+    dimensions: Optional[int] = None
+    user: Optional[str] = None
+    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+
+    # --8<-- [start:embedding-pooling-params]
+    additional_data: Optional[Any] = None
+    # --8<-- [end:embedding-pooling-params]
+
+    # --8<-- [start:embedding-extra-params]
+    add_special_tokens: bool = Field(
+        default=True,
+        description=(
+            "If true (the default), special tokens (e.g. BOS) will be added to "
+            "the prompt."),
+    )
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."),
+    )
+
+    # --8<-- [end:embedding-extra-params]
+
+    def to_pooling_params(self):
+        return PoolingParams(dimensions=self.dimensions,
+                             additional_data=self.additional_data)
+
+
+class EmbeddingChatRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    messages: list[ChatCompletionMessageParam]
+
+    encoding_format: Literal["float", "base64"] = "float"
+    dimensions: Optional[int] = None
+    user: Optional[str] = None
+    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+
+    # --8<-- [start:chat-embedding-pooling-params]
+    additional_data: Optional[Any] = None
+    # --8<-- [end:chat-embedding-pooling-params]
+
+    # --8<-- [start:chat-embedding-extra-params]
+    add_special_tokens: bool = Field(
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens so this should be set to false (as is the "
+            "default)."),
+    )
+    chat_template: Optional[str] = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "As of transformers v4.44, the default chat template is no longer "
+            "allowed, so you must provide a chat template if the tokenizer "
+            "does not define one."),
+    )
+    chat_template_kwargs: Optional[dict[str, Any]] = Field(
+        default=None,
+        description=("Additional kwargs to pass to the template renderer. "
+                     "Will be accessible by the chat template."),
+    )
+    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+        default=None,
+        description=("Additional kwargs to pass to the HF processor."),
+    )
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."),
+    )
+    # --8<-- [end:chat-embedding-extra-params]
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_generation_prompt(cls, data):
+        if data.get("continue_final_message") and data.get(
+                "add_generation_prompt"):
+            raise ValueError("Cannot set both `continue_final_message` and "
+                             "`add_generation_prompt` to True.")
+        return data
+
+    def to_pooling_params(self):
+        return PoolingParams(dimensions=self.dimensions,
+                             additional_data=self.additional_data)
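
Both embedding request shapes resolve to the same PoolingParams; a sketch of the completion-style variant (placeholder server and model names as before):

    import requests

    resp = requests.post(
        "http://localhost:8000/v1/embeddings",
        json={
            "model": "my-embedding-model",
            "input": ["first sentence", "second sentence"],
            "encoding_format": "float",
        },
    )
    vectors = [item["embedding"] for item in resp.json()["data"]]
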
+
+
+EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest]
+
+PoolingCompletionRequest = EmbeddingCompletionRequest
+PoolingChatRequest = EmbeddingChatRequest
+PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest]
+
+
+class ScoreRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    text_1: Union[list[str], str]
+    text_2: Union[list[str], str]
+    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+
+    # --8<-- [start:score-pooling-params]
+    additional_data: Optional[Any] = None
+    # --8<-- [end:score-pooling-params]
+
+    # --8<-- [start:score-extra-params]
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."),
+    )
+
+    # --8<-- [end:score-extra-params]
+
+    def to_pooling_params(self):
+        return PoolingParams(additional_data=self.additional_data)
+
+
+class RerankRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    query: str
+    documents: list[str]
+    top_n: int = Field(default_factory=lambda: 0)
+    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+
+    # --8<-- [start:rerank-pooling-params]
+    additional_data: Optional[Any] = None
+    # --8<-- [end:rerank-pooling-params]
+
+    # --8<-- [start:rerank-extra-params]
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."),
+    )
+
+    # --8<-- [end:rerank-extra-params]
+
+    def to_pooling_params(self):
+        return PoolingParams(additional_data=self.additional_data)
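
A rerank call pairs one query with a list of documents and returns a relevance score per document, as modelled by RerankResponse below. A sketch (the /rerank path matches the suffix check used for batch requests later in this file; server and model names are placeholders):

    import requests

    resp = requests.post(
        "http://localhost:8000/rerank",
        json={
            "model": "my-reranker",
            "query": "What is the capital of France?",
            "documents": [
                "Paris is the capital of France.",
                "Berlin is the capital of Germany.",
            ],
            "top_n": 1,
        },
    )
    for result in resp.json()["results"]:
        print(result["index"], result["relevance_score"])
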
+
+
+class RerankDocument(BaseModel):
+    text: str
+
+
+class RerankResult(BaseModel):
+    index: int
+    document: RerankDocument
+    relevance_score: float
+
+
+class RerankUsage(BaseModel):
+    total_tokens: int
+
+
+class RerankResponse(OpenAIBaseModel):
+    id: str
+    model: str
+    usage: RerankUsage
+    results: list[RerankResult]
+
+
+class CompletionLogProbs(OpenAIBaseModel):
+    text_offset: list[int] = Field(default_factory=list)
+    token_logprobs: list[Optional[float]] = Field(default_factory=list)
+    tokens: list[str] = Field(default_factory=list)
+    top_logprobs: list[Optional[dict[str,
+                                     float]]] = Field(default_factory=list)
+
+
+class CompletionResponseChoice(OpenAIBaseModel):
+    index: int
+    text: str
+    logprobs: Optional[CompletionLogProbs] = None
+    finish_reason: Optional[str] = None
+    stop_reason: Optional[Union[int, str]] = Field(
+        default=None,
+        description=(
+            "The stop string or token id that caused the completion "
+            "to stop, None if the completion finished for some other reason "
+            "including encountering the EOS token"),
+    )
+    prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
+
+
+class CompletionResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[CompletionResponseChoice]
+    usage: UsageInfo
+    kv_transfer_params: Optional[dict[str, Any]] = Field(
+        default=None, description="KVTransfer parameters.")
+
+
+class CompletionResponseStreamChoice(OpenAIBaseModel):
+    index: int
+    text: str
+    logprobs: Optional[CompletionLogProbs] = None
+    finish_reason: Optional[str] = None
+    stop_reason: Optional[Union[int, str]] = Field(
+        default=None,
+        description=(
+            "The stop string or token id that caused the completion "
+            "to stop, None if the completion finished for some other reason "
+            "including encountering the EOS token"),
+    )
+
+
+class CompletionStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[CompletionResponseStreamChoice]
+    usage: Optional[UsageInfo] = Field(default=None)
+
+
+class EmbeddingResponseData(OpenAIBaseModel):
+    index: int
+    object: str = "embedding"
+    embedding: Union[list[float], str]
+
+
+class EmbeddingResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
+    object: str = "list"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    data: list[EmbeddingResponseData]
+    usage: UsageInfo
+
+
+class PoolingResponseData(OpenAIBaseModel):
+    index: int
+    object: str = "pooling"
+    data: Union[list[list[float]], list[float], str]
+
+
+class PoolingResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"pool-{random_uuid()}")
+    object: str = "list"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    data: list[PoolingResponseData]
+    usage: UsageInfo
+
+
+class ScoreResponseData(OpenAIBaseModel):
+    index: int
+    object: str = "score"
+    score: float
+
+
+class ScoreResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
+    object: str = "list"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    data: list[ScoreResponseData]
+    usage: UsageInfo
+
+
+class ClassificationRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    input: Union[list[str], str]
+    truncate_prompt_tokens: Optional[int] = None
+    user: Optional[str] = None
+
+    # --8<-- [start:classification-pooling-params]
+    additional_data: Optional[Any] = None
+    # --8<-- [end:classification-pooling-params]
+
+    # --8<-- [start:classification-extra-params]
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."),
+    )
+
+    # --8<-- [end:classification-extra-params]
+
+    def to_pooling_params(self):
+        return PoolingParams(additional_data=self.additional_data)
+
+
+class ClassificationData(OpenAIBaseModel):
+    index: int
+    label: Optional[str]
+    probs: list[float]
+    num_classes: int
+
+
+class ClassificationResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"classify-{random_uuid()}")
+    object: str = "list"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    data: list[ClassificationData]
+    usage: UsageInfo
+
+
+class FunctionCall(OpenAIBaseModel):
+    name: str
+    arguments: str
+
+
+class ToolCall(OpenAIBaseModel):
+    id: str = Field(default_factory=random_tool_call_id)
+    type: Literal["function"] = "function"
+    function: FunctionCall
+
+
+class DeltaFunctionCall(BaseModel):
+    name: Optional[str] = None
+    arguments: Optional[str] = None
+
+
+# a tool call delta where everything is optional
+class DeltaToolCall(OpenAIBaseModel):
+    id: Optional[str] = None
+    type: Optional[Literal["function"]] = None
+    index: int
+    function: Optional[DeltaFunctionCall] = None
+
+
+class ExtractedToolCallInformation(BaseModel):
+    # indicate if tools were called
+    tools_called: bool
+
+    # extracted tool calls
+    tool_calls: list[ToolCall]
+
+    # content - per the OpenAI spec, content AND tool calls are only rarely
+    # returned together, but some models will do this intentionally
+    content: Optional[str] = None
+
+
+class ChatMessage(OpenAIBaseModel):
+    role: str
+    reasoning_content: Optional[str] = None
+    content: Optional[str] = None
+    tool_calls: list[ToolCall] = Field(default_factory=list)
+
+
+class ChatCompletionLogProb(OpenAIBaseModel):
+    token: str
+    logprob: float = -9999.0
+    bytes: Optional[list[int]] = None
+
+
+class ChatCompletionLogProbsContent(ChatCompletionLogProb):
+    # Workaround: redefine the field-names cache so that it is not
+    # shared with the super class.
+    field_names: ClassVar[Optional[set[str]]] = None
+    top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)
+
+
+class ChatCompletionLogProbs(OpenAIBaseModel):
+    content: Optional[list[ChatCompletionLogProbsContent]] = None
+
+
+class ChatCompletionResponseChoice(OpenAIBaseModel):
+    index: int
+    message: ChatMessage
+    logprobs: Optional[ChatCompletionLogProbs] = None
+    # per OpenAI spec this is the default
+    finish_reason: Optional[str] = "stop"
+    # not part of the OpenAI spec but included in vLLM for legacy reasons
+    stop_reason: Optional[Union[int, str]] = None
+
+
+class ChatCompletionResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: Literal["chat.completion"] = "chat.completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[ChatCompletionResponseChoice]
+    usage: UsageInfo
+    prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
+    kv_transfer_params: Optional[dict[str, Any]] = Field(
+        default=None, description="KVTransfer parameters.")
+
+
+class DeltaMessage(OpenAIBaseModel):
+    role: Optional[str] = None
+    content: Optional[str] = None
+    reasoning_content: Optional[str] = None
+    tool_calls: list[DeltaToolCall] = Field(default_factory=list)
+
+
+class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
+    index: int
+    delta: DeltaMessage
+    logprobs: Optional[ChatCompletionLogProbs] = None
+    finish_reason: Optional[str] = None
+    stop_reason: Optional[Union[int, str]] = None
+
+
+class ChatCompletionStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[ChatCompletionResponseStreamChoice]
+    usage: Optional[UsageInfo] = Field(default=None)
+
+
+class TranscriptionResponseStreamChoice(OpenAIBaseModel):
+    delta: DeltaMessage
+    finish_reason: Optional[str] = None
+    stop_reason: Optional[Union[int, str]] = None
+
+
+class TranscriptionStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"trsc-{random_uuid()}")
+    object: Literal["transcription.chunk"] = "transcription.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[TranscriptionResponseStreamChoice]
+    usage: Optional[UsageInfo] = Field(default=None)
+
+
+BatchRequestInputBody = Union[ChatCompletionRequest, EmbeddingRequest,
+                              ScoreRequest, RerankRequest]
+
+
+class BatchRequestInput(OpenAIBaseModel):
+    """
+    The per-line object of the batch input file.
+
+    NOTE: Currently only the `/v1/chat/completions` endpoint is supported.
+    """
+
+    # A developer-provided per-request id that will be used to match outputs to
+    # inputs. Must be unique for each request in a batch.
+    custom_id: str
+
+    # The HTTP method to be used for the request. Currently only POST is
+    # supported.
+    method: str
+
+    # The OpenAI API relative URL to be used for the request. Currently
+    # /v1/chat/completions is supported.
+    url: str
+
+    # The parameters of the request.
+    body: BatchRequestInputBody
+
+    @field_validator('body', mode='plain')
+    @classmethod
+    def check_type_for_url(cls, value: Any, info: ValidationInfo):
+        # Use url to disambiguate models
+        url: str = info.data["url"]
+        if url == "/v1/chat/completions":
+            return ChatCompletionRequest.model_validate(value)
+        if url == "/v1/embeddings":
+            return TypeAdapter(EmbeddingRequest).validate_python(value)
+        if url.endswith("/score"):
+            return ScoreRequest.model_validate(value)
+        if url.endswith("/rerank"):
+            return RerankRequest.model_validate(value)
+        return TypeAdapter(BatchRequestInputBody).validate_python(value)
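
Each line of a batch input file is one BatchRequestInput serialized as JSON, with the url field steering which request model the body is validated against. A hypothetical chat line:

    import json

    line = json.dumps({
        "custom_id": "request-1",       # must be unique within the batch
        "method": "POST",               # only POST is supported
        "url": "/v1/chat/completions",  # selects ChatCompletionRequest
        "body": {
            "model": "my-model",
            "messages": [{"role": "user", "content": "Hello!"}],
        },
    })
    # One such object per line makes up the batch input file.
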
+
+
+class BatchResponseData(OpenAIBaseModel):
+    # HTTP status code of the response.
+    status_code: int = 200
+
+    # A unique identifier for the API request.
+    request_id: str
+
+    # The body of the response.
+    body: Optional[Union[ChatCompletionResponse, EmbeddingResponse,
+                         ScoreResponse, RerankResponse]] = None
+
+
+class BatchRequestOutput(OpenAIBaseModel):
+    """
+    The per-line object of the batch output and error files
+    """
+
+    id: str
+
+    # A developer-provided per-request id that will be used to match outputs to
+    # inputs.
+    custom_id: str
+
+    response: Optional[BatchResponseData]
+
+    # For requests that failed with a non-HTTP error, this will contain more
+    # information on the cause of the failure.
+    error: Optional[Any]
+
+
+class TokenizeCompletionRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    prompt: str
+
+    add_special_tokens: bool = Field(
+        default=True,
+        description=(
+            "If true (the default), special tokens (e.g. BOS) will be added to "
+            "the prompt."),
+    )
+    return_token_strs: Optional[bool] = Field(
+        default=False,
+        description=("If true, also return the token strings "
+                     "corresponding to the token ids."),
+    )
+
+
+class TokenizeChatRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    messages: list[ChatCompletionMessageParam]
+
+    add_generation_prompt: bool = Field(
+        default=True,
+        description=
+        ("If true, the generation prompt will be added to the chat template. "
+         "This is a parameter used by the chat template in the tokenizer "
+         "config of the model."),
+    )
+    return_token_strs: Optional[bool] = Field(
+        default=False,
+        description=("If true, also return the token strings "
+                     "corresponding to the token ids."),
+    )
+    continue_final_message: bool = Field(
+        default=False,
+        description=
+        ("If this is set, the chat will be formatted so that the final "
+         "message in the chat is open-ended, without any EOS tokens. The "
+         "model will continue this message rather than starting a new one. "
+         "This allows you to \"prefill\" part of the model's response for it. "
+         "Cannot be used at the same time as `add_generation_prompt`."),
+    )
+    add_special_tokens: bool = Field(
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens so this should be set to false (as is the "
+            "default)."),
+    )
+    chat_template: Optional[str] = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "As of transformers v4.44, the default chat template is no longer "
+            "allowed, so you must provide a chat template if the tokenizer "
+            "does not define one."),
+    )
+    chat_template_kwargs: Optional[dict[str, Any]] = Field(
+        default=None,
+        description=("Additional kwargs to pass to the template renderer. "
+                     "Will be accessible by the chat template."),
+    )
+    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+        default=None,
+        description=("Additional kwargs to pass to the HF processor."),
+    )
+    tools: Optional[list[ChatCompletionToolsParam]] = Field(
+        default=None,
+        description=("A list of tools the model may call."),
+    )
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_generation_prompt(cls, data):
+        if data.get("continue_final_message") and data.get(
+                "add_generation_prompt"):
+            raise ValueError("Cannot set both `continue_final_message` and "
+                             "`add_generation_prompt` to True.")
+        return data
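
A sketch of the chat-style tokenize call, which renders the chat template before tokenizing (assuming the server exposes it at /tokenize; server and model names are placeholders):

    import requests

    resp = requests.post(
        "http://localhost:8000/tokenize",
        json={
            "model": "my-model",
            "messages": [{"role": "user", "content": "Hi there"}],
            "add_generation_prompt": True,
            "return_token_strs": True,
        },
    )
    body = resp.json()
    print(body["count"], body["tokens"], body.get("token_strs"))
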
+
+
+TokenizeRequest = Union[TokenizeCompletionRequest, TokenizeChatRequest]
+
+
+class TokenizeResponse(OpenAIBaseModel):
+    count: int
+    max_model_len: int
+    tokens: list[int]
+    token_strs: Optional[list[str]] = None
+
+
+class DetokenizeRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    tokens: list[int]
+
+
+class DetokenizeResponse(OpenAIBaseModel):
+    prompt: str
+
+
+class LoadLoRAAdapterRequest(BaseModel):
+    lora_name: str
+    lora_path: str
+
+
+class UnloadLoRAAdapterRequest(BaseModel):
+    lora_name: str
+    lora_int_id: Optional[int] = Field(default=None)
+
+
+## Protocols for Audio
+AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json",
+                                         "vtt"]
+
+
+class TranscriptionRequest(OpenAIBaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/audio/createTranscription
+
+    file: UploadFile
+    """
+    The audio file object (not file name) to transcribe, in one of these
+    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+    """
+
+    model: Optional[str] = None
+    """ID of the model to use.
+    """
+
+    language: Optional[str] = None
+    """The language of the input audio.
+
+    Supplying the input language in
+    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
+    will improve accuracy and latency.
+    """
+
+    prompt: str = Field(default="")
+    """An optional text to guide the model's style or continue a previous audio
+    segment.
+
+    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+    should match the audio language.
+    """
+
+    response_format: AudioResponseFormat = Field(default="json")
+    """
+    The format of the output, in one of these options: `json`, `text`, `srt`,
+    `verbose_json`, or `vtt`.
+    """
+
+    ## TODO (varun) : Support if set to 0, certain thresholds are met !!
+
+    timestamp_granularities: list[Literal["word", "segment"]] = Field(
+        alias="timestamp_granularities[]", default=[])
+    """The timestamp granularities to populate for this transcription.
+
+    `response_format` must be set to `verbose_json` to use timestamp
+    granularities. Either or both of these options are supported: `word`, or
+    `segment`. Note: There is no additional latency for segment timestamps,
+    but generating word timestamps incurs additional latency.
+    """
+
+    # --8<-- [start:transcription-extra-params]
+    stream: Optional[bool] = False
+    """Custom field not present in the original OpenAI definition. When set,
+    it will enable output to be streamed in a similar fashion as the Chat
+    Completion endpoint.
+    """
+    # Flattened stream options to simplify form data.
+    stream_include_usage: Optional[bool] = False
+    stream_continuous_usage_stats: Optional[bool] = False
+    # --8<-- [end:transcription-extra-params]
+
+    # --8<-- [start:transcription-sampling-params]
+    temperature: float = Field(default=0.0)
+    """The sampling temperature, between 0 and 1.
+
+    Higher values like 0.8 will make the output more random, while lower values
+    like 0.2 will make it more focused / deterministic. If set to 0, the model
+    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
+    to automatically increase the temperature until certain thresholds are hit.
+    """
+
+    top_p: Optional[float] = None
+    """Enables nucleus (top-p) sampling, where tokens are selected from the
+    smallest possible set whose cumulative probability exceeds `p`.
+    """
+
+    top_k: Optional[int] = None
+    """Limits sampling to the `k` most probable tokens at each step."""
+
+    min_p: Optional[float] = None
+    """Filters out tokens with a probability lower than `min_p`, ensuring a
+    minimum likelihood threshold during sampling.
+    """
+
+    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    """The seed to use for sampling."""
+
+    frequency_penalty: Optional[float] = 0.0
+    """The frequency penalty to use for sampling."""
+
+    repetition_penalty: Optional[float] = None
+    """The repetition penalty to use for sampling."""
+
+    presence_penalty: Optional[float] = 0.0
+    """The presence penalty to use for sampling."""
+    # --8<-- [end:transcription-sampling-params]
+
+    # Default sampling parameters for transcription requests.
+    _DEFAULT_SAMPLING_PARAMS: dict = {
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": 0,
+        "min_p": 0.0,
+    }
+
+    def to_sampling_params(
+            self,
+            default_max_tokens: int,
+            default_sampling_params: Optional[dict] = None) -> SamplingParams:
+        # TODO(#9845): remove max_tokens when field is removed from OpenAI API
+        max_tokens = default_max_tokens
+
+        if default_sampling_params is None:
+            default_sampling_params = {}
+
+        # Default parameters
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+        if (top_p := self.top_p) is None:
+            top_p = default_sampling_params.get(
+                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+        if (top_k := self.top_k) is None:
+            top_k = default_sampling_params.get(
+                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+        if (min_p := self.min_p) is None:
+            min_p = default_sampling_params.get(
+                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+
+        if (repetition_penalty := self.repetition_penalty) is None:
+            repetition_penalty = default_sampling_params.get(
+                "repetition_penalty",
+                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"])
+
+        return SamplingParams.from_optional(
+            temperature=temperature,
+            max_tokens=max_tokens,
+            seed=self.seed,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            frequency_penalty=self.frequency_penalty,
+            repetition_penalty=repetition_penalty,
+            presence_penalty=self.presence_penalty,
+            output_kind=RequestOutputKind.DELTA if self.stream \
+            else RequestOutputKind.FINAL_ONLY)
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_transcription_request(cls, data):
+        if isinstance(data.get("file"), str):
+            raise HTTPException(
+                status_code=HTTPStatus.UNPROCESSABLE_ENTITY,
+                detail="Expected 'file' to be a file-like object, not 'str'.",
+            )
+
+        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
+        stream = data.get("stream", False)
+        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
+            raise ValueError(
+                "Stream options can only be defined when `stream=True`.")
+
+        return data
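
Because of the UploadFile field (and the validator above, which rejects plain strings), transcription requests are multipart form uploads rather than JSON bodies. A sketch (placeholder server, model and file name):

    import requests

    with open("sample.wav", "rb") as f:
        resp = requests.post(
            "http://localhost:8000/v1/audio/transcriptions",
            files={"file": ("sample.wav", f, "audio/wav")},
            data={
                "model": "my-whisper-model",
                "language": "en",
                "response_format": "json",
            },
        )
    print(resp.json()["text"])
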
+
+
+# Transcription response objects
+class TranscriptionResponse(OpenAIBaseModel):
+    text: str
+    """The transcribed text."""
+
+
+class TranscriptionWord(OpenAIBaseModel):
+    end: float
+    """End time of the word in seconds."""
+
+    start: float
+    """Start time of the word in seconds."""
+
+    word: str
+    """The text content of the word."""
+
+
+class TranscriptionSegment(OpenAIBaseModel):
+    id: int
+    """Unique identifier of the segment."""
+
+    avg_logprob: float
+    """Average logprob of the segment.
+
+    If the value is lower than -1, consider the logprobs failed.
+    """
+
+    compression_ratio: float
+    """Compression ratio of the segment.
+
+    If the value is greater than 2.4, consider the compression failed.
+    """
+
+    end: float
+    """End time of the segment in seconds."""
+
+    no_speech_prob: float
+    """Probability of no speech in the segment.
+
+    If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
+    this segment silent.
+    """
+
+    seek: int
+    """Seek offset of the segment."""
+
+    start: float
+    """Start time of the segment in seconds."""
+
+    temperature: float
+    """Temperature parameter used for generating the segment."""
+
+    text: str
+    """Text content of the segment."""
+
+    tokens: list[int]
+    """Array of token IDs for the text content."""
+
+
+class TranscriptionResponseVerbose(OpenAIBaseModel):
+    duration: str
+    """The duration of the input audio."""
+
+    language: str
+    """The language of the input audio."""
+
+    text: str
+    """The transcribed text."""
+
+    segments: Optional[list[TranscriptionSegment]] = None
+    """Segments of the transcribed text and their corresponding details."""
+
+    words: Optional[list[TranscriptionWord]] = None
+    """Extracted words and their corresponding timestamps."""