vllm_cpu-0.8.5.post2-cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of vllm-cpu might be problematic.

Files changed (1103)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +170 -0
  3. vllm/_custom_ops.py +1536 -0
  4. vllm/_ipex_ops.py +241 -0
  5. vllm/_version.py +34 -0
  6. vllm/adapter_commons/__init__.py +0 -0
  7. vllm/adapter_commons/layers.py +16 -0
  8. vllm/adapter_commons/models.py +105 -0
  9. vllm/adapter_commons/request.py +25 -0
  10. vllm/adapter_commons/utils.py +92 -0
  11. vllm/adapter_commons/worker_manager.py +38 -0
  12. vllm/assets/__init__.py +0 -0
  13. vllm/assets/audio.py +38 -0
  14. vllm/assets/base.py +40 -0
  15. vllm/assets/image.py +31 -0
  16. vllm/assets/video.py +103 -0
  17. vllm/attention/__init__.py +19 -0
  18. vllm/attention/backends/__init__.py +0 -0
  19. vllm/attention/backends/abstract.py +306 -0
  20. vllm/attention/backends/blocksparse_attn.py +457 -0
  21. vllm/attention/backends/cpu_mla.py +303 -0
  22. vllm/attention/backends/flash_attn.py +999 -0
  23. vllm/attention/backends/flashinfer.py +1092 -0
  24. vllm/attention/backends/flashmla.py +242 -0
  25. vllm/attention/backends/hpu_attn.py +301 -0
  26. vllm/attention/backends/ipex_attn.py +396 -0
  27. vllm/attention/backends/mla/__init__.py +0 -0
  28. vllm/attention/backends/mla/common.py +1444 -0
  29. vllm/attention/backends/pallas.py +346 -0
  30. vllm/attention/backends/placeholder_attn.py +399 -0
  31. vllm/attention/backends/rocm_aiter_mla.py +412 -0
  32. vllm/attention/backends/rocm_flash_attn.py +969 -0
  33. vllm/attention/backends/torch_sdpa.py +691 -0
  34. vllm/attention/backends/triton_mla.py +113 -0
  35. vllm/attention/backends/utils.py +609 -0
  36. vllm/attention/backends/xformers.py +798 -0
  37. vllm/attention/layer.py +443 -0
  38. vllm/attention/ops/__init__.py +0 -0
  39. vllm/attention/ops/blocksparse_attention/__init__.py +0 -0
  40. vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +432 -0
  41. vllm/attention/ops/blocksparse_attention/interface.py +238 -0
  42. vllm/attention/ops/blocksparse_attention/utils.py +244 -0
  43. vllm/attention/ops/chunked_prefill_paged_decode.py +366 -0
  44. vllm/attention/ops/flashmla.py +115 -0
  45. vllm/attention/ops/hpu_paged_attn.py +105 -0
  46. vllm/attention/ops/ipex_attn.py +193 -0
  47. vllm/attention/ops/merge_attn_states.py +42 -0
  48. vllm/attention/ops/nki_flash_attn.py +905 -0
  49. vllm/attention/ops/paged_attn.py +255 -0
  50. vllm/attention/ops/prefix_prefill.py +902 -0
  51. vllm/attention/ops/rocm_aiter_mla.py +42 -0
  52. vllm/attention/ops/rocm_aiter_paged_attn.py +101 -0
  53. vllm/attention/ops/triton_decode_attention.py +675 -0
  54. vllm/attention/ops/triton_flash_attention.py +1375 -0
  55. vllm/attention/ops/triton_merge_attn_states.py +96 -0
  56. vllm/attention/selector.py +186 -0
  57. vllm/attention/utils/fa_utils.py +54 -0
  58. vllm/beam_search.py +82 -0
  59. vllm/benchmarks/__init__.py +0 -0
  60. vllm/benchmarks/datasets.py +831 -0
  61. vllm/benchmarks/endpoint_request_func.py +160 -0
  62. vllm/benchmarks/latency.py +181 -0
  63. vllm/benchmarks/serve.py +925 -0
  64. vllm/benchmarks/throughput.py +608 -0
  65. vllm/benchmarks/utils.py +69 -0
  66. vllm/collect_env.py +795 -0
  67. vllm/compilation/__init__.py +0 -0
  68. vllm/compilation/backends.py +715 -0
  69. vllm/compilation/compiler_interface.py +437 -0
  70. vllm/compilation/counter.py +33 -0
  71. vllm/compilation/decorators.py +249 -0
  72. vllm/compilation/fix_functionalization.py +182 -0
  73. vllm/compilation/fusion.py +617 -0
  74. vllm/compilation/fx_utils.py +60 -0
  75. vllm/compilation/inductor_pass.py +114 -0
  76. vllm/compilation/monitor.py +38 -0
  77. vllm/compilation/multi_output_match.py +108 -0
  78. vllm/compilation/noop_elimination.py +135 -0
  79. vllm/compilation/pass_manager.py +74 -0
  80. vllm/compilation/sequence_parallelism.py +266 -0
  81. vllm/compilation/torch25_custom_graph_pass.py +41 -0
  82. vllm/compilation/vllm_inductor_pass.py +68 -0
  83. vllm/compilation/wrapper.py +129 -0
  84. vllm/config.py +4179 -0
  85. vllm/connections.py +170 -0
  86. vllm/core/__init__.py +0 -0
  87. vllm/core/block/__init__.py +0 -0
  88. vllm/core/block/block_table.py +398 -0
  89. vllm/core/block/common.py +370 -0
  90. vllm/core/block/cpu_gpu_block_allocator.py +440 -0
  91. vllm/core/block/interfaces.py +318 -0
  92. vllm/core/block/naive_block.py +465 -0
  93. vllm/core/block/prefix_caching_block.py +1134 -0
  94. vllm/core/block/utils.py +27 -0
  95. vllm/core/block_manager.py +520 -0
  96. vllm/core/evictor.py +156 -0
  97. vllm/core/interfaces.py +134 -0
  98. vllm/core/placeholder_block_space_manager.py +99 -0
  99. vllm/core/scheduler.py +2060 -0
  100. vllm/device_allocator/__init__.py +0 -0
  101. vllm/device_allocator/cumem.py +280 -0
  102. vllm/distributed/__init__.py +5 -0
  103. vllm/distributed/communication_op.py +40 -0
  104. vllm/distributed/device_communicators/__init__.py +0 -0
  105. vllm/distributed/device_communicators/base_device_communicator.py +151 -0
  106. vllm/distributed/device_communicators/cpu_communicator.py +139 -0
  107. vllm/distributed/device_communicators/cuda_communicator.py +131 -0
  108. vllm/distributed/device_communicators/cuda_wrapper.py +179 -0
  109. vllm/distributed/device_communicators/custom_all_reduce.py +301 -0
  110. vllm/distributed/device_communicators/custom_all_reduce_utils.py +257 -0
  111. vllm/distributed/device_communicators/hpu_communicator.py +45 -0
  112. vllm/distributed/device_communicators/neuron_communicator.py +19 -0
  113. vllm/distributed/device_communicators/pynccl.py +217 -0
  114. vllm/distributed/device_communicators/pynccl_wrapper.py +340 -0
  115. vllm/distributed/device_communicators/shm_broadcast.py +557 -0
  116. vllm/distributed/device_communicators/tpu_communicator.py +93 -0
  117. vllm/distributed/device_communicators/xpu_communicator.py +54 -0
  118. vllm/distributed/kv_transfer/README.md +29 -0
  119. vllm/distributed/kv_transfer/__init__.py +11 -0
  120. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  121. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  122. vllm/distributed/kv_transfer/kv_connector/base.py +127 -0
  123. vllm/distributed/kv_transfer/kv_connector/factory.py +107 -0
  124. vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +98 -0
  125. vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +201 -0
  126. vllm/distributed/kv_transfer/kv_connector/simple_connector.py +328 -0
  127. vllm/distributed/kv_transfer/kv_connector/utils.py +90 -0
  128. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +8 -0
  129. vllm/distributed/kv_transfer/kv_connector/v1/base.py +209 -0
  130. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +131 -0
  131. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +383 -0
  132. vllm/distributed/kv_transfer/kv_connector_agent.py +76 -0
  133. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  134. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +174 -0
  135. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +160 -0
  136. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +236 -0
  137. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  138. vllm/distributed/kv_transfer/kv_pipe/base.py +66 -0
  139. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +279 -0
  140. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +279 -0
  141. vllm/distributed/kv_transfer/kv_transfer_state.py +70 -0
  142. vllm/distributed/parallel_state.py +1209 -0
  143. vllm/distributed/utils.py +366 -0
  144. vllm/engine/__init__.py +0 -0
  145. vllm/engine/arg_utils.py +1724 -0
  146. vllm/engine/async_llm_engine.py +1261 -0
  147. vllm/engine/async_timeout.py +191 -0
  148. vllm/engine/llm_engine.py +2150 -0
  149. vllm/engine/metrics.py +717 -0
  150. vllm/engine/metrics_types.py +96 -0
  151. vllm/engine/multiprocessing/__init__.py +183 -0
  152. vllm/engine/multiprocessing/client.py +745 -0
  153. vllm/engine/multiprocessing/engine.py +450 -0
  154. vllm/engine/output_processor/__init__.py +0 -0
  155. vllm/engine/output_processor/interfaces.py +74 -0
  156. vllm/engine/output_processor/multi_step.py +210 -0
  157. vllm/engine/output_processor/single_step.py +136 -0
  158. vllm/engine/output_processor/stop_checker.py +130 -0
  159. vllm/engine/output_processor/util.py +27 -0
  160. vllm/engine/protocol.py +302 -0
  161. vllm/entrypoints/__init__.py +0 -0
  162. vllm/entrypoints/api_server.py +177 -0
  163. vllm/entrypoints/chat_utils.py +1259 -0
  164. vllm/entrypoints/cli/__init__.py +0 -0
  165. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  166. vllm/entrypoints/cli/benchmark/base.py +38 -0
  167. vllm/entrypoints/cli/benchmark/latency.py +29 -0
  168. vllm/entrypoints/cli/benchmark/main.py +53 -0
  169. vllm/entrypoints/cli/benchmark/serve.py +29 -0
  170. vllm/entrypoints/cli/benchmark/throughput.py +29 -0
  171. vllm/entrypoints/cli/collect_env.py +35 -0
  172. vllm/entrypoints/cli/main.py +59 -0
  173. vllm/entrypoints/cli/openai.py +175 -0
  174. vllm/entrypoints/cli/serve.py +59 -0
  175. vllm/entrypoints/cli/types.py +24 -0
  176. vllm/entrypoints/launcher.py +146 -0
  177. vllm/entrypoints/llm.py +1450 -0
  178. vllm/entrypoints/logger.py +44 -0
  179. vllm/entrypoints/openai/__init__.py +0 -0
  180. vllm/entrypoints/openai/api_server.py +1130 -0
  181. vllm/entrypoints/openai/cli_args.py +296 -0
  182. vllm/entrypoints/openai/logits_processors.py +89 -0
  183. vllm/entrypoints/openai/protocol.py +1806 -0
  184. vllm/entrypoints/openai/run_batch.py +439 -0
  185. vllm/entrypoints/openai/serving_chat.py +1210 -0
  186. vllm/entrypoints/openai/serving_completion.py +557 -0
  187. vllm/entrypoints/openai/serving_embedding.py +245 -0
  188. vllm/entrypoints/openai/serving_engine.py +569 -0
  189. vllm/entrypoints/openai/serving_models.py +314 -0
  190. vllm/entrypoints/openai/serving_pooling.py +237 -0
  191. vllm/entrypoints/openai/serving_score.py +439 -0
  192. vllm/entrypoints/openai/serving_tokenization.py +147 -0
  193. vllm/entrypoints/openai/serving_transcription.py +421 -0
  194. vllm/entrypoints/openai/tool_parsers/__init__.py +19 -0
  195. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +163 -0
  196. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +254 -0
  197. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +232 -0
  198. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +370 -0
  199. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +211 -0
  200. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +303 -0
  201. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +262 -0
  202. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +342 -0
  203. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +110 -0
  204. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +292 -0
  205. vllm/entrypoints/openai/tool_parsers/utils.py +123 -0
  206. vllm/entrypoints/score_utils.py +49 -0
  207. vllm/entrypoints/ssl.py +74 -0
  208. vllm/entrypoints/utils.py +136 -0
  209. vllm/env_override.py +34 -0
  210. vllm/envs.py +800 -0
  211. vllm/executor/__init__.py +0 -0
  212. vllm/executor/executor_base.py +400 -0
  213. vllm/executor/mp_distributed_executor.py +243 -0
  214. vllm/executor/msgspec_utils.py +29 -0
  215. vllm/executor/multiproc_worker_utils.py +312 -0
  216. vllm/executor/ray_distributed_executor.py +700 -0
  217. vllm/executor/ray_utils.py +400 -0
  218. vllm/executor/uniproc_executor.py +141 -0
  219. vllm/forward_context.py +159 -0
  220. vllm/inputs/__init__.py +37 -0
  221. vllm/inputs/data.py +248 -0
  222. vllm/inputs/parse.py +121 -0
  223. vllm/inputs/preprocess.py +745 -0
  224. vllm/inputs/registry.py +212 -0
  225. vllm/jsontree.py +79 -0
  226. vllm/logger.py +210 -0
  227. vllm/logging_utils/__init__.py +7 -0
  228. vllm/logging_utils/formatter.py +17 -0
  229. vllm/logits_process.py +121 -0
  230. vllm/lora/__init__.py +0 -0
  231. vllm/lora/fully_sharded_layers.py +335 -0
  232. vllm/lora/layers.py +1263 -0
  233. vllm/lora/lora.py +198 -0
  234. vllm/lora/models.py +802 -0
  235. vllm/lora/ops/__init__.py +0 -0
  236. vllm/lora/ops/torch_ops/__init__.py +15 -0
  237. vllm/lora/ops/torch_ops/lora_ops.py +115 -0
  238. vllm/lora/ops/triton_ops/__init__.py +11 -0
  239. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  240. vllm/lora/ops/triton_ops/lora_expand.py +293 -0
  241. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +147 -0
  242. vllm/lora/ops/triton_ops/lora_shrink.py +247 -0
  243. vllm/lora/ops/triton_ops/utils.py +121 -0
  244. vllm/lora/peft_helper.py +115 -0
  245. vllm/lora/punica_wrapper/__init__.py +9 -0
  246. vllm/lora/punica_wrapper/punica_base.py +483 -0
  247. vllm/lora/punica_wrapper/punica_cpu.py +348 -0
  248. vllm/lora/punica_wrapper/punica_gpu.py +289 -0
  249. vllm/lora/punica_wrapper/punica_hpu.py +144 -0
  250. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  251. vllm/lora/punica_wrapper/utils.py +161 -0
  252. vllm/lora/request.py +97 -0
  253. vllm/lora/resolver.py +83 -0
  254. vllm/lora/utils.py +237 -0
  255. vllm/lora/worker_manager.py +251 -0
  256. vllm/model_executor/__init__.py +15 -0
  257. vllm/model_executor/custom_op.py +153 -0
  258. vllm/model_executor/guided_decoding/__init__.py +180 -0
  259. vllm/model_executor/guided_decoding/guidance_decoding.py +63 -0
  260. vllm/model_executor/guided_decoding/guidance_logits_processors.py +85 -0
  261. vllm/model_executor/guided_decoding/guided_fields.py +42 -0
  262. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +66 -0
  263. vllm/model_executor/guided_decoding/outlines_decoding.py +154 -0
  264. vllm/model_executor/guided_decoding/outlines_logits_processors.py +271 -0
  265. vllm/model_executor/guided_decoding/reasoner/__init__.py +35 -0
  266. vllm/model_executor/guided_decoding/utils.py +241 -0
  267. vllm/model_executor/guided_decoding/xgrammar_decoding.py +425 -0
  268. vllm/model_executor/layers/__init__.py +0 -0
  269. vllm/model_executor/layers/activation.py +368 -0
  270. vllm/model_executor/layers/fused_moe/__init__.py +51 -0
  271. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  272. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  273. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  274. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  275. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  276. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  277. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  278. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  279. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  280. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  281. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  282. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  283. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  284. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  285. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  286. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  287. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  288. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  289. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  290. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  291. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  292. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  293. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  294. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  295. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  296. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  297. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  298. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  299. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  300. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  301. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  302. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  303. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  304. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  305. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  306. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  307. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  308. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  309. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  310. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  311. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  312. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  313. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  426. vllm/model_executor/layers/fused_moe/cutlass_moe.py +180 -0
  427. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +294 -0
  428. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +374 -0
  429. vllm/model_executor/layers/fused_moe/fused_moe.py +1539 -0
  430. vllm/model_executor/layers/fused_moe/layer.py +949 -0
  431. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +243 -0
  432. vllm/model_executor/layers/fused_moe/moe_pallas.py +64 -0
  433. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +59 -0
  434. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +416 -0
  435. vllm/model_executor/layers/fused_moe/utils.py +48 -0
  436. vllm/model_executor/layers/layernorm.py +277 -0
  437. vllm/model_executor/layers/lightning_attn.py +651 -0
  438. vllm/model_executor/layers/linear.py +1518 -0
  439. vllm/model_executor/layers/logits_processor.py +196 -0
  440. vllm/model_executor/layers/mamba/__init__.py +0 -0
  441. vllm/model_executor/layers/mamba/mamba2_metadata.py +109 -0
  442. vllm/model_executor/layers/mamba/mamba_mixer.py +244 -0
  443. vllm/model_executor/layers/mamba/mamba_mixer2.py +538 -0
  444. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  445. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +104 -0
  446. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +415 -0
  447. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +261 -0
  448. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +588 -0
  449. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +750 -0
  450. vllm/model_executor/layers/mamba/ops/ssd_combined.py +231 -0
  451. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +205 -0
  452. vllm/model_executor/layers/pooler.py +336 -0
  453. vllm/model_executor/layers/quantization/__init__.py +153 -0
  454. vllm/model_executor/layers/quantization/aqlm.py +374 -0
  455. vllm/model_executor/layers/quantization/awq.py +184 -0
  456. vllm/model_executor/layers/quantization/awq_marlin.py +518 -0
  457. vllm/model_executor/layers/quantization/awq_triton.py +319 -0
  458. vllm/model_executor/layers/quantization/base_config.py +145 -0
  459. vllm/model_executor/layers/quantization/bitblas.py +459 -0
  460. vllm/model_executor/layers/quantization/bitsandbytes.py +396 -0
  461. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  462. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +624 -0
  463. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1100 -0
  464. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +20 -0
  465. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +357 -0
  466. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +54 -0
  467. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +159 -0
  468. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +119 -0
  469. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +149 -0
  470. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +110 -0
  471. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +200 -0
  472. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +205 -0
  473. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +213 -0
  474. vllm/model_executor/layers/quantization/deepspeedfp.py +193 -0
  475. vllm/model_executor/layers/quantization/experts_int8.py +194 -0
  476. vllm/model_executor/layers/quantization/fbgemm_fp8.py +168 -0
  477. vllm/model_executor/layers/quantization/fp8.py +832 -0
  478. vllm/model_executor/layers/quantization/gguf.py +408 -0
  479. vllm/model_executor/layers/quantization/gptq.py +276 -0
  480. vllm/model_executor/layers/quantization/gptq_bitblas.py +438 -0
  481. vllm/model_executor/layers/quantization/gptq_marlin.py +643 -0
  482. vllm/model_executor/layers/quantization/gptq_marlin_24.py +295 -0
  483. vllm/model_executor/layers/quantization/hqq_marlin.py +328 -0
  484. vllm/model_executor/layers/quantization/ipex_quant.py +250 -0
  485. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  486. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +89 -0
  487. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +82 -0
  488. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  489. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +299 -0
  490. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +142 -0
  491. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +119 -0
  492. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +132 -0
  493. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +66 -0
  494. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +86 -0
  495. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +119 -0
  496. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +136 -0
  497. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +40 -0
  498. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
  499. vllm/model_executor/layers/quantization/kv_cache.py +137 -0
  500. vllm/model_executor/layers/quantization/marlin.py +259 -0
  501. vllm/model_executor/layers/quantization/modelopt.py +410 -0
  502. vllm/model_executor/layers/quantization/moe_wna16.py +447 -0
  503. vllm/model_executor/layers/quantization/neuron_quant.py +67 -0
  504. vllm/model_executor/layers/quantization/ptpc_fp8.py +125 -0
  505. vllm/model_executor/layers/quantization/qqq.py +273 -0
  506. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  507. vllm/model_executor/layers/quantization/quark/quark.py +385 -0
  508. vllm/model_executor/layers/quantization/quark/quark_moe.py +236 -0
  509. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +7 -0
  510. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +54 -0
  511. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +142 -0
  512. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +121 -0
  513. vllm/model_executor/layers/quantization/quark/utils.py +102 -0
  514. vllm/model_executor/layers/quantization/schema.py +85 -0
  515. vllm/model_executor/layers/quantization/torchao.py +127 -0
  516. vllm/model_executor/layers/quantization/tpu_int8.py +119 -0
  517. vllm/model_executor/layers/quantization/utils/__init__.py +5 -0
  518. vllm/model_executor/layers/quantization/utils/allspark_utils.py +51 -0
  519. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +198 -0
  520. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  521. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  522. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  523. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  524. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  525. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  526. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  527. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  528. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  529. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  530. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  531. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  532. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  533. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  534. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  535. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  536. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  537. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  538. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  539. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  540. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  541. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  542. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  543. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  544. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  545. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  546. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  547. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  548. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  549. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  550. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  551. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  552. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  553. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  554. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  555. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  556. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  557. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  558. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  559. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  560. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  561. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  562. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  563. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  564. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  565. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  566. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  567. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  568. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  569. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  570. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  571. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  572. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  573. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  574. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  575. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  576. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  577. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  578. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  579. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  580. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  581. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  582. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  583. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  584. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  585. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  586. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  587. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  588. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  589. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  590. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  591. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  592. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  593. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  594. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  595. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  596. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  597. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  598. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  599. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  600. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  601. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  602. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  603. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  604. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  605. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  606. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  607. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  608. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  609. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  610. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  611. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  612. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  613. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  614. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  615. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  616. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  617. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  618. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  619. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  620. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  621. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  622. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  623. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  624. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  625. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  626. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  627. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  628. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  629. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  630. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  631. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  632. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  633. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  634. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  635. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  636. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  637. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  638. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  639. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  640. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  641. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  642. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  643. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  644. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  645. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  646. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  647. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  648. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  649. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  650. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  651. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  652. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  653. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  654. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  655. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  656. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  657. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  658. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  659. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  660. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  661. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  662. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  663. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  664. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  665. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  666. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  667. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  668. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  669. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  670. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  671. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  672. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  673. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  674. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  675. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  676. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  677. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  722. vllm/model_executor/layers/quantization/utils/fp8_utils.py +523 -0
  723. vllm/model_executor/layers/quantization/utils/gptq_utils.py +94 -0
  724. vllm/model_executor/layers/quantization/utils/int8_utils.py +459 -0
  725. vllm/model_executor/layers/quantization/utils/layer_utils.py +39 -0
  726. vllm/model_executor/layers/quantization/utils/machete_utils.py +32 -0
  727. vllm/model_executor/layers/quantization/utils/marlin_utils.py +413 -0
  728. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +110 -0
  729. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +164 -0
  730. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  731. vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +127 -0
  732. vllm/model_executor/layers/quantization/utils/quant_utils.py +571 -0
  733. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +404 -0
  734. vllm/model_executor/layers/rejection_sampler.py +400 -0
  735. vllm/model_executor/layers/resampler.py +269 -0
  736. vllm/model_executor/layers/rotary_embedding.py +1598 -0
  737. vllm/model_executor/layers/sampler.py +1221 -0
  738. vllm/model_executor/layers/spec_decode_base_sampler.py +258 -0
  739. vllm/model_executor/layers/typical_acceptance_sampler.py +172 -0
  740. vllm/model_executor/layers/utils.py +99 -0
  741. vllm/model_executor/layers/vocab_parallel_embedding.py +485 -0
  742. vllm/model_executor/model_loader/__init__.py +20 -0
  743. vllm/model_executor/model_loader/loader.py +1542 -0
  744. vllm/model_executor/model_loader/neuron.py +243 -0
  745. vllm/model_executor/model_loader/tensorizer.py +468 -0
  746. vllm/model_executor/model_loader/utils.py +171 -0
  747. vllm/model_executor/model_loader/weight_utils.py +749 -0
  748. vllm/model_executor/models/__init__.py +27 -0
  749. vllm/model_executor/models/adapters.py +247 -0
  750. vllm/model_executor/models/arctic.py +559 -0
  751. vllm/model_executor/models/aria.py +656 -0
  752. vllm/model_executor/models/aya_vision.py +461 -0
  753. vllm/model_executor/models/baichuan.py +469 -0
  754. vllm/model_executor/models/bamba.py +542 -0
  755. vllm/model_executor/models/bart.py +936 -0
  756. vllm/model_executor/models/bert.py +725 -0
  757. vllm/model_executor/models/blip.py +337 -0
  758. vllm/model_executor/models/blip2.py +717 -0
  759. vllm/model_executor/models/bloom.py +358 -0
  760. vllm/model_executor/models/chameleon.py +1135 -0
  761. vllm/model_executor/models/chatglm.py +476 -0
  762. vllm/model_executor/models/clip.py +410 -0
  763. vllm/model_executor/models/commandr.py +466 -0
  764. vllm/model_executor/models/constant_size_cache.py +136 -0
  765. vllm/model_executor/models/dbrx.py +469 -0
  766. vllm/model_executor/models/deepseek.py +484 -0
  767. vllm/model_executor/models/deepseek_mtp.py +266 -0
  768. vllm/model_executor/models/deepseek_v2.py +830 -0
  769. vllm/model_executor/models/deepseek_vl2.py +647 -0
  770. vllm/model_executor/models/eagle.py +247 -0
  771. vllm/model_executor/models/exaone.py +548 -0
  772. vllm/model_executor/models/fairseq2_llama.py +153 -0
  773. vllm/model_executor/models/falcon.py +508 -0
  774. vllm/model_executor/models/florence2.py +1102 -0
  775. vllm/model_executor/models/fuyu.py +388 -0
  776. vllm/model_executor/models/gemma.py +423 -0
  777. vllm/model_executor/models/gemma2.py +423 -0
  778. vllm/model_executor/models/gemma3.py +531 -0
  779. vllm/model_executor/models/gemma3_mm.py +716 -0
  780. vllm/model_executor/models/glm.py +22 -0
  781. vllm/model_executor/models/glm4.py +303 -0
  782. vllm/model_executor/models/glm4v.py +647 -0
  783. vllm/model_executor/models/gpt2.py +313 -0
  784. vllm/model_executor/models/gpt_bigcode.py +336 -0
  785. vllm/model_executor/models/gpt_j.py +337 -0
  786. vllm/model_executor/models/gpt_neox.py +330 -0
  787. vllm/model_executor/models/granite.py +494 -0
  788. vllm/model_executor/models/granite_speech.py +777 -0
  789. vllm/model_executor/models/granitemoe.py +435 -0
  790. vllm/model_executor/models/granitemoeshared.py +339 -0
  791. vllm/model_executor/models/gritlm.py +245 -0
  792. vllm/model_executor/models/grok1.py +560 -0
  793. vllm/model_executor/models/h2ovl.py +542 -0
  794. vllm/model_executor/models/idefics2_vision_model.py +387 -0
  795. vllm/model_executor/models/idefics3.py +767 -0
  796. vllm/model_executor/models/interfaces.py +569 -0
  797. vllm/model_executor/models/interfaces_base.py +163 -0
  798. vllm/model_executor/models/intern_vit.py +476 -0
  799. vllm/model_executor/models/internlm2.py +453 -0
  800. vllm/model_executor/models/internlm2_ve.py +146 -0
  801. vllm/model_executor/models/internvl.py +945 -0
  802. vllm/model_executor/models/jais.py +371 -0
  803. vllm/model_executor/models/jamba.py +590 -0
  804. vllm/model_executor/models/kimi_vl.py +577 -0
  805. vllm/model_executor/models/llama.py +619 -0
  806. vllm/model_executor/models/llama4.py +530 -0
  807. vllm/model_executor/models/llama_eagle.py +152 -0
  808. vllm/model_executor/models/llama_eagle3.py +232 -0
  809. vllm/model_executor/models/llava.py +869 -0
  810. vllm/model_executor/models/llava_next.py +582 -0
  811. vllm/model_executor/models/llava_next_video.py +470 -0
  812. vllm/model_executor/models/llava_onevision.py +954 -0
  813. vllm/model_executor/models/mamba.py +271 -0
  814. vllm/model_executor/models/mamba2.py +302 -0
  815. vllm/model_executor/models/mamba_cache.py +76 -0
  816. vllm/model_executor/models/medusa.py +210 -0
  817. vllm/model_executor/models/minicpm.py +592 -0
  818. vllm/model_executor/models/minicpm3.py +229 -0
  819. vllm/model_executor/models/minicpmo.py +725 -0
  820. vllm/model_executor/models/minicpmv.py +1287 -0
  821. vllm/model_executor/models/minimax_cache.py +35 -0
  822. vllm/model_executor/models/minimax_text_01.py +1261 -0
  823. vllm/model_executor/models/mistral3.py +598 -0
  824. vllm/model_executor/models/mixtral.py +485 -0
  825. vllm/model_executor/models/mixtral_quant.py +447 -0
  826. vllm/model_executor/models/mllama.py +1623 -0
  827. vllm/model_executor/models/mllama4.py +838 -0
  828. vllm/model_executor/models/mlp_speculator.py +205 -0
  829. vllm/model_executor/models/modernbert.py +325 -0
  830. vllm/model_executor/models/module_mapping.py +71 -0
  831. vllm/model_executor/models/molmo.py +1567 -0
  832. vllm/model_executor/models/moonvit.py +628 -0
  833. vllm/model_executor/models/mpt.py +329 -0
  834. vllm/model_executor/models/nemotron.py +506 -0
  835. vllm/model_executor/models/nemotron_nas.py +446 -0
  836. vllm/model_executor/models/nvlm_d.py +212 -0
  837. vllm/model_executor/models/olmo.py +390 -0
  838. vllm/model_executor/models/olmo2.py +412 -0
  839. vllm/model_executor/models/olmoe.py +449 -0
  840. vllm/model_executor/models/opt.py +410 -0
  841. vllm/model_executor/models/orion.py +356 -0
  842. vllm/model_executor/models/paligemma.py +397 -0
  843. vllm/model_executor/models/persimmon.py +342 -0
  844. vllm/model_executor/models/phi.py +354 -0
  845. vllm/model_executor/models/phi3.py +18 -0
  846. vllm/model_executor/models/phi3_small.py +463 -0
  847. vllm/model_executor/models/phi3v.py +722 -0
  848. vllm/model_executor/models/phi4mm.py +1263 -0
  849. vllm/model_executor/models/phi4mm_audio.py +1232 -0
  850. vllm/model_executor/models/phi4mm_utils.py +1883 -0
  851. vllm/model_executor/models/phimoe.py +666 -0
  852. vllm/model_executor/models/pixtral.py +1281 -0
  853. vllm/model_executor/models/plamo2.py +736 -0
  854. vllm/model_executor/models/prithvi_geospatial_mae.py +231 -0
  855. vllm/model_executor/models/qwen.py +360 -0
  856. vllm/model_executor/models/qwen2.py +552 -0
  857. vllm/model_executor/models/qwen2_5_omni_thinker.py +901 -0
  858. vllm/model_executor/models/qwen2_5_vl.py +1136 -0
  859. vllm/model_executor/models/qwen2_audio.py +402 -0
  860. vllm/model_executor/models/qwen2_moe.py +531 -0
  861. vllm/model_executor/models/qwen2_rm.py +130 -0
  862. vllm/model_executor/models/qwen2_vl.py +1409 -0
  863. vllm/model_executor/models/qwen3.py +319 -0
  864. vllm/model_executor/models/qwen3_moe.py +528 -0
  865. vllm/model_executor/models/qwen_vl.py +784 -0
  866. vllm/model_executor/models/registry.py +611 -0
  867. vllm/model_executor/models/roberta.py +332 -0
  868. vllm/model_executor/models/siglip.py +522 -0
  869. vllm/model_executor/models/skyworkr1v.py +949 -0
  870. vllm/model_executor/models/smolvlm.py +51 -0
  871. vllm/model_executor/models/solar.py +504 -0
  872. vllm/model_executor/models/stablelm.py +349 -0
  873. vllm/model_executor/models/starcoder2.py +355 -0
  874. vllm/model_executor/models/telechat2.py +139 -0
  875. vllm/model_executor/models/teleflm.py +78 -0
  876. vllm/model_executor/models/transformers.py +442 -0
  877. vllm/model_executor/models/ultravox.py +655 -0
  878. vllm/model_executor/models/utils.py +714 -0
  879. vllm/model_executor/models/vision.py +149 -0
  880. vllm/model_executor/models/whisper.py +746 -0
  881. vllm/model_executor/models/zamba2.py +1008 -0
  882. vllm/model_executor/parameter.py +458 -0
  883. vllm/model_executor/pooling_metadata.py +71 -0
  884. vllm/model_executor/sampling_metadata.py +596 -0
  885. vllm/model_executor/utils.py +53 -0
  886. vllm/multimodal/__init__.py +31 -0
  887. vllm/multimodal/audio.py +105 -0
  888. vllm/multimodal/base.py +218 -0
  889. vllm/multimodal/hasher.py +103 -0
  890. vllm/multimodal/image.py +77 -0
  891. vllm/multimodal/inputs.py +843 -0
  892. vllm/multimodal/parse.py +454 -0
  893. vllm/multimodal/processing.py +1760 -0
  894. vllm/multimodal/profiling.py +274 -0
  895. vllm/multimodal/registry.py +321 -0
  896. vllm/multimodal/utils.py +386 -0
  897. vllm/multimodal/video.py +166 -0
  898. vllm/outputs.py +521 -0
  899. vllm/platforms/__init__.py +286 -0
  900. vllm/platforms/cpu.py +182 -0
  901. vllm/platforms/cuda.py +463 -0
  902. vllm/platforms/hpu.py +94 -0
  903. vllm/platforms/interface.py +427 -0
  904. vllm/platforms/neuron.py +69 -0
  905. vllm/platforms/rocm.py +346 -0
  906. vllm/platforms/tpu.py +174 -0
  907. vllm/platforms/xpu.py +142 -0
  908. vllm/plugins/__init__.py +82 -0
  909. vllm/pooling_params.py +53 -0
  910. vllm/profiler/__init__.py +7 -0
  911. vllm/profiler/layerwise_profile.py +374 -0
  912. vllm/profiler/utils.py +147 -0
  913. vllm/prompt_adapter/__init__.py +0 -0
  914. vllm/prompt_adapter/layers.py +82 -0
  915. vllm/prompt_adapter/models.py +357 -0
  916. vllm/prompt_adapter/request.py +36 -0
  917. vllm/prompt_adapter/utils.py +97 -0
  918. vllm/prompt_adapter/worker_manager.py +178 -0
  919. vllm/py.typed +2 -0
  920. vllm/reasoning/__init__.py +12 -0
  921. vllm/reasoning/abs_reasoning_parsers.py +189 -0
  922. vllm/reasoning/deepseek_r1_reasoning_parser.py +172 -0
  923. vllm/reasoning/granite_reasoning_parser.py +362 -0
  924. vllm/sampling_params.py +598 -0
  925. vllm/scalar_type.py +335 -0
  926. vllm/scripts.py +14 -0
  927. vllm/sequence.py +1486 -0
  928. vllm/spec_decode/__init__.py +0 -0
  929. vllm/spec_decode/batch_expansion.py +505 -0
  930. vllm/spec_decode/draft_model_runner.py +335 -0
  931. vllm/spec_decode/interfaces.py +98 -0
  932. vllm/spec_decode/medusa_worker.py +137 -0
  933. vllm/spec_decode/metrics.py +212 -0
  934. vllm/spec_decode/mlp_speculator_worker.py +93 -0
  935. vllm/spec_decode/mqa_scorer.py +159 -0
  936. vllm/spec_decode/multi_step_worker.py +416 -0
  937. vllm/spec_decode/ngram_worker.py +195 -0
  938. vllm/spec_decode/proposer_worker_base.py +58 -0
  939. vllm/spec_decode/smaller_tp_proposer_worker.py +194 -0
  940. vllm/spec_decode/spec_decode_worker.py +1324 -0
  941. vllm/spec_decode/target_model_runner.py +44 -0
  942. vllm/spec_decode/top1_proposer.py +274 -0
  943. vllm/spec_decode/util.py +276 -0
  944. vllm/test_utils.py +129 -0
  945. vllm/third_party/__init__.py +0 -0
  946. vllm/third_party/pynvml.py +6139 -0
  947. vllm/tracing.py +130 -0
  948. vllm/transformers_utils/__init__.py +19 -0
  949. vllm/transformers_utils/config.py +813 -0
  950. vllm/transformers_utils/configs/__init__.py +52 -0
  951. vllm/transformers_utils/configs/arctic.py +206 -0
  952. vllm/transformers_utils/configs/chatglm.py +71 -0
  953. vllm/transformers_utils/configs/cohere2.py +194 -0
  954. vllm/transformers_utils/configs/dbrx.py +280 -0
  955. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  956. vllm/transformers_utils/configs/eagle.py +65 -0
  957. vllm/transformers_utils/configs/exaone.py +191 -0
  958. vllm/transformers_utils/configs/falcon.py +89 -0
  959. vllm/transformers_utils/configs/h2ovl.py +15 -0
  960. vllm/transformers_utils/configs/internvl.py +53 -0
  961. vllm/transformers_utils/configs/jais.py +237 -0
  962. vllm/transformers_utils/configs/kimi_vl.py +36 -0
  963. vllm/transformers_utils/configs/medusa.py +62 -0
  964. vllm/transformers_utils/configs/mllama.py +30 -0
  965. vllm/transformers_utils/configs/mlp_speculator.py +67 -0
  966. vllm/transformers_utils/configs/moonvit.py +32 -0
  967. vllm/transformers_utils/configs/mpt.py +179 -0
  968. vllm/transformers_utils/configs/nemotron.py +204 -0
  969. vllm/transformers_utils/configs/nvlm_d.py +14 -0
  970. vllm/transformers_utils/configs/skyworkr1v.py +53 -0
  971. vllm/transformers_utils/configs/solar.py +246 -0
  972. vllm/transformers_utils/configs/telechat2.py +63 -0
  973. vllm/transformers_utils/configs/ultravox.py +107 -0
  974. vllm/transformers_utils/detokenizer.py +167 -0
  975. vllm/transformers_utils/detokenizer_utils.py +188 -0
  976. vllm/transformers_utils/processor.py +210 -0
  977. vllm/transformers_utils/processors/__init__.py +6 -0
  978. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  979. vllm/transformers_utils/s3_utils.py +161 -0
  980. vllm/transformers_utils/tokenizer.py +291 -0
  981. vllm/transformers_utils/tokenizer_base.py +146 -0
  982. vllm/transformers_utils/tokenizer_group.py +110 -0
  983. vllm/transformers_utils/tokenizers/__init__.py +9 -0
  984. vllm/transformers_utils/tokenizers/mistral.py +483 -0
  985. vllm/transformers_utils/utils.py +98 -0
  986. vllm/triton_utils/__init__.py +5 -0
  987. vllm/triton_utils/importing.py +53 -0
  988. vllm/usage/__init__.py +0 -0
  989. vllm/usage/usage_lib.py +255 -0
  990. vllm/utils.py +2692 -0
  991. vllm/v1/__init__.py +0 -0
  992. vllm/v1/attention/__init__.py +0 -0
  993. vllm/v1/attention/backends/__init__.py +0 -0
  994. vllm/v1/attention/backends/flash_attn.py +783 -0
  995. vllm/v1/attention/backends/flashinfer.py +638 -0
  996. vllm/v1/attention/backends/mla/__init__.py +0 -0
  997. vllm/v1/attention/backends/mla/common.py +974 -0
  998. vllm/v1/attention/backends/mla/flashmla.py +149 -0
  999. vllm/v1/attention/backends/mla/triton_mla.py +118 -0
  1000. vllm/v1/attention/backends/pallas.py +221 -0
  1001. vllm/v1/attention/backends/triton_attn.py +198 -0
  1002. vllm/v1/core/__init__.py +0 -0
  1003. vllm/v1/core/block_pool.py +281 -0
  1004. vllm/v1/core/encoder_cache_manager.py +149 -0
  1005. vllm/v1/core/kv_cache_manager.py +385 -0
  1006. vllm/v1/core/kv_cache_utils.py +744 -0
  1007. vllm/v1/core/sched/__init__.py +0 -0
  1008. vllm/v1/core/sched/interface.py +134 -0
  1009. vllm/v1/core/sched/output.py +126 -0
  1010. vllm/v1/core/sched/scheduler.py +838 -0
  1011. vllm/v1/core/sched/utils.py +22 -0
  1012. vllm/v1/core/specialized_manager.py +161 -0
  1013. vllm/v1/engine/__init__.py +166 -0
  1014. vllm/v1/engine/async_llm.py +532 -0
  1015. vllm/v1/engine/core.py +701 -0
  1016. vllm/v1/engine/core_client.py +942 -0
  1017. vllm/v1/engine/detokenizer.py +260 -0
  1018. vllm/v1/engine/exceptions.py +16 -0
  1019. vllm/v1/engine/llm_engine.py +285 -0
  1020. vllm/v1/engine/logprobs.py +198 -0
  1021. vllm/v1/engine/mm_input_cache.py +82 -0
  1022. vllm/v1/engine/output_processor.py +420 -0
  1023. vllm/v1/engine/parallel_sampling.py +132 -0
  1024. vllm/v1/engine/processor.py +387 -0
  1025. vllm/v1/executor/__init__.py +0 -0
  1026. vllm/v1/executor/abstract.py +112 -0
  1027. vllm/v1/executor/multiproc_executor.py +480 -0
  1028. vllm/v1/executor/ray_distributed_executor.py +61 -0
  1029. vllm/v1/kv_cache_interface.py +166 -0
  1030. vllm/v1/metrics/__init__.py +0 -0
  1031. vllm/v1/metrics/loggers.py +498 -0
  1032. vllm/v1/metrics/stats.py +238 -0
  1033. vllm/v1/outputs.py +111 -0
  1034. vllm/v1/request.py +178 -0
  1035. vllm/v1/sample/__init__.py +0 -0
  1036. vllm/v1/sample/metadata.py +43 -0
  1037. vllm/v1/sample/ops/__init__.py +0 -0
  1038. vllm/v1/sample/ops/bad_words.py +38 -0
  1039. vllm/v1/sample/ops/penalties.py +58 -0
  1040. vllm/v1/sample/ops/topk_topp_sampler.py +315 -0
  1041. vllm/v1/sample/rejection_sampler.py +631 -0
  1042. vllm/v1/sample/sampler.py +270 -0
  1043. vllm/v1/sample/tpu/__init__.py +0 -0
  1044. vllm/v1/sample/tpu/metadata.py +118 -0
  1045. vllm/v1/sample/tpu/sampler.py +154 -0
  1046. vllm/v1/serial_utils.py +274 -0
  1047. vllm/v1/spec_decode/__init__.py +0 -0
  1048. vllm/v1/spec_decode/eagle.py +318 -0
  1049. vllm/v1/spec_decode/metadata.py +61 -0
  1050. vllm/v1/spec_decode/metrics.py +164 -0
  1051. vllm/v1/spec_decode/ngram_proposer.py +131 -0
  1052. vllm/v1/spec_decode/utils.py +18 -0
  1053. vllm/v1/stats/__init__.py +0 -0
  1054. vllm/v1/stats/common.py +453 -0
  1055. vllm/v1/structured_output/__init__.py +113 -0
  1056. vllm/v1/structured_output/backend_guidance.py +215 -0
  1057. vllm/v1/structured_output/backend_types.py +96 -0
  1058. vllm/v1/structured_output/backend_xgrammar.py +299 -0
  1059. vllm/v1/structured_output/request.py +84 -0
  1060. vllm/v1/structured_output/utils.py +174 -0
  1061. vllm/v1/utils.py +249 -0
  1062. vllm/v1/worker/__init__.py +0 -0
  1063. vllm/v1/worker/block_table.py +87 -0
  1064. vllm/v1/worker/gpu_input_batch.py +677 -0
  1065. vllm/v1/worker/gpu_model_runner.py +1776 -0
  1066. vllm/v1/worker/gpu_worker.py +349 -0
  1067. vllm/v1/worker/lora_model_runner_mixin.py +145 -0
  1068. vllm/v1/worker/tpu_model_runner.py +1419 -0
  1069. vllm/v1/worker/tpu_worker.py +260 -0
  1070. vllm/v1/worker/utils.py +74 -0
  1071. vllm/v1/worker/worker_base.py +64 -0
  1072. vllm/version.py +40 -0
  1073. vllm/vllm_flash_attn/.gitkeep +0 -0
  1074. vllm/worker/__init__.py +0 -0
  1075. vllm/worker/cache_engine.py +144 -0
  1076. vllm/worker/cpu_enc_dec_model_runner.py +323 -0
  1077. vllm/worker/cpu_model_runner.py +668 -0
  1078. vllm/worker/cpu_pooling_model_runner.py +122 -0
  1079. vllm/worker/cpu_worker.py +400 -0
  1080. vllm/worker/enc_dec_model_runner.py +542 -0
  1081. vllm/worker/hpu_model_runner.py +2221 -0
  1082. vllm/worker/hpu_worker.py +483 -0
  1083. vllm/worker/model_runner.py +2056 -0
  1084. vllm/worker/model_runner_base.py +281 -0
  1085. vllm/worker/multi_step_hpu_worker.py +122 -0
  1086. vllm/worker/multi_step_model_runner.py +908 -0
  1087. vllm/worker/multi_step_tpu_worker.py +107 -0
  1088. vllm/worker/multi_step_worker.py +196 -0
  1089. vllm/worker/neuron_model_runner.py +336 -0
  1090. vllm/worker/neuron_worker.py +138 -0
  1091. vllm/worker/pooling_model_runner.py +200 -0
  1092. vllm/worker/tpu_model_runner.py +908 -0
  1093. vllm/worker/tpu_worker.py +332 -0
  1094. vllm/worker/utils.py +52 -0
  1095. vllm/worker/worker.py +570 -0
  1096. vllm/worker/worker_base.py +644 -0
  1097. vllm/worker/xpu_model_runner.py +603 -0
  1098. vllm/worker/xpu_worker.py +185 -0
  1099. vllm_cpu-0.8.5.post2.dist-info/METADATA +309 -0
  1100. vllm_cpu-0.8.5.post2.dist-info/RECORD +1103 -0
  1101. vllm_cpu-0.8.5.post2.dist-info/WHEEL +5 -0
  1102. vllm_cpu-0.8.5.post2.dist-info/entry_points.txt +2 -0
  1103. vllm_cpu-0.8.5.post2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1806 @@
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Adapted from
+ # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
+ import json
+ import re
+ import time
+ from argparse import Namespace
+ from typing import Annotated, Any, ClassVar, Literal, Optional, Union
+
+ import torch
+ from fastapi import UploadFile
+ from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
+                       ValidationInfo, field_validator, model_validator)
+ from typing_extensions import TypeAlias
+
+ from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
+ from vllm.logger import init_logger
+ from vllm.pooling_params import PoolingParams
+ from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
+                                   RequestOutputKind, SamplingParams)
+ from vllm.sequence import Logprob
+ from vllm.utils import random_uuid, resolve_obj_by_qualname
+
+ logger = init_logger(__name__)
+
+ # torch is mocked during docs generation,
+ # so we have to provide the values as literals
+ _MOCK_LONG_INFO = Namespace(min=-9223372036854775808, max=9223372036854775807)
+ _LONG_INFO: Union["torch.iinfo", Namespace]
+
+ try:
+     from sphinx.ext.autodoc.mock import _MockModule
+
+     if isinstance(torch, _MockModule):
+         _LONG_INFO = _MOCK_LONG_INFO
+     else:
+         _LONG_INFO = torch.iinfo(torch.long)
+ except ModuleNotFoundError:
+     _LONG_INFO = torch.iinfo(torch.long)
+
+ assert _LONG_INFO.min == _MOCK_LONG_INFO.min
+ assert _LONG_INFO.max == _MOCK_LONG_INFO.max
+
+
+ class OpenAIBaseModel(BaseModel):
+     # OpenAI API does allow extra fields
+     model_config = ConfigDict(extra="allow")
+
+     # Cache class field names
+     field_names: ClassVar[Optional[set[str]]] = None
+
+     @model_validator(mode="wrap")
+     @classmethod
+     def __log_extra_fields__(cls, data, handler):
+         result = handler(data)
+         if not isinstance(data, dict):
+             return result
+         field_names = cls.field_names
+         if field_names is None:
+             # Get all class field names and their potential aliases
+             field_names = set()
+             for field_name, field in cls.model_fields.items():
+                 field_names.add(field_name)
+                 if alias := getattr(field, "alias", None):
+                     field_names.add(alias)
+             cls.field_names = field_names
+
+         # Compare against both field names and aliases
+         if any(k not in field_names for k in data):
+             logger.warning(
+                 "The following fields were present in the request "
+                 "but ignored: %s",
+                 data.keys() - field_names,
+             )
+         return result
+
+
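# --- editor's sketch (not part of the diff) ---------------------------------
# What it shows: the wrap-mode validator in OpenAIBaseModel above. Unknown
# request keys are accepted (extra="allow") but logged as ignored. The
# subclass `DemoRequest` is hypothetical, and the import path assumes this
# module is vllm.entrypoints.openai.protocol (the usual location).
from vllm.entrypoints.openai.protocol import OpenAIBaseModel

class DemoRequest(OpenAIBaseModel):
    known: int = 0

req = DemoRequest.model_validate({"known": 1, "surprise": "?"})
assert req.known == 1        # parsed normally
assert req.surprise == "?"   # kept via extra="allow", but logged as ignored
# -----------------------------------------------------------------------------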
+ class ErrorResponse(OpenAIBaseModel):
+     object: str = "error"
+     message: str
+     type: str
+     param: Optional[str] = None
+     code: int
+
+
+ class ModelPermission(OpenAIBaseModel):
+     id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
+     object: str = "model_permission"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     allow_create_engine: bool = False
+     allow_sampling: bool = True
+     allow_logprobs: bool = True
+     allow_search_indices: bool = False
+     allow_view: bool = True
+     allow_fine_tuning: bool = False
+     organization: str = "*"
+     group: Optional[str] = None
+     is_blocking: bool = False
+
+
+ class ModelCard(OpenAIBaseModel):
+     id: str
+     object: str = "model"
+     created: int = Field(default_factory=lambda: int(time.time()))
+     owned_by: str = "vllm"
+     root: Optional[str] = None
+     parent: Optional[str] = None
+     max_model_len: Optional[int] = None
+     permission: list[ModelPermission] = Field(default_factory=list)
+
+
+ class ModelList(OpenAIBaseModel):
+     object: str = "list"
+     data: list[ModelCard] = Field(default_factory=list)
+
+
+ class PromptTokenUsageInfo(OpenAIBaseModel):
+     cached_tokens: Optional[int] = None
+
+
+ class UsageInfo(OpenAIBaseModel):
+     prompt_tokens: int = 0
+     total_tokens: int = 0
+     completion_tokens: Optional[int] = 0
+     prompt_tokens_details: Optional[PromptTokenUsageInfo] = None
+
+
+ class RequestResponseMetadata(BaseModel):
+     request_id: str
+     final_usage_info: Optional[UsageInfo] = None
+
+
+ class JsonSchemaResponseFormat(OpenAIBaseModel):
+     name: str
+     description: Optional[str] = None
+     # schema is the field in openai but that causes conflicts with pydantic so
+     # instead use json_schema with an alias
+     json_schema: Optional[dict[str, Any]] = Field(default=None, alias='schema')
+     strict: Optional[bool] = None
+
+
+ class StructuralTag(OpenAIBaseModel):
+     begin: str
+     # schema is the field, but that causes conflicts with pydantic so
+     # instead use structural_tag_schema with an alias
+     structural_tag_schema: Optional[dict[str, Any]] = Field(default=None,
+                                                             alias="schema")
+     end: str
+
+
+ class StructuralTagResponseFormat(OpenAIBaseModel):
+     type: Literal["structural_tag"]
+     structures: list[StructuralTag]
+     triggers: list[str]
+
+
+ class ResponseFormat(OpenAIBaseModel):
+     # type must be "json_schema", "json_object", or "text"
+     type: Literal["text", "json_object", "json_schema"]
+     json_schema: Optional[JsonSchemaResponseFormat] = None
+
+
+ AnyResponseFormat = Union[ResponseFormat, StructuralTagResponseFormat]
+
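# --- editor's sketch (not part of the diff) ---------------------------------
# What it shows: the `schema` alias on JsonSchemaResponseFormat. OpenAI's
# payload key is "schema", which conflicts with pydantic, so the field is
# stored as `json_schema` and validated through the alias. Import path
# assumed as in the sketch above.
from vllm.entrypoints.openai.protocol import ResponseFormat

rf = ResponseFormat.model_validate({
    "type": "json_schema",
    "json_schema": {
        "name": "answer",
        "schema": {"type": "object"},  # OpenAI spelling, accepted via alias
    },
})
assert rf.json_schema is not None
assert rf.json_schema.json_schema == {"type": "object"}
# Serializing for the wire restores the alias:
assert rf.json_schema.model_dump(by_alias=True)["schema"] == {"type": "object"}
# -----------------------------------------------------------------------------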
+
+ class StreamOptions(OpenAIBaseModel):
+     include_usage: Optional[bool] = True
+     continuous_usage_stats: Optional[bool] = False
+
+
+ class FunctionDefinition(OpenAIBaseModel):
+     name: str
+     description: Optional[str] = None
+     parameters: Optional[dict[str, Any]] = None
+
+
+ class ChatCompletionToolsParam(OpenAIBaseModel):
+     type: Literal["function"] = "function"
+     function: FunctionDefinition
+
+
+ class ChatCompletionNamedFunction(OpenAIBaseModel):
+     name: str
+
+
+ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
+     function: ChatCompletionNamedFunction
+     type: Literal["function"] = "function"
+
+
+ class LogitsProcessorConstructor(BaseModel):
+     qualname: str
+     args: Optional[list[Any]] = None
+     kwargs: Optional[dict[str, Any]] = None
+
+
+ LogitsProcessors = list[Union[str, LogitsProcessorConstructor]]
+
+
+ def get_logits_processors(processors: Optional[LogitsProcessors],
+                           pattern: Optional[str]) -> Optional[list[Any]]:
+     if processors and pattern:
+         logits_processors = []
+         for processor in processors:
+             qualname = processor if isinstance(processor,
+                                                str) else processor.qualname
+             if not re.match(pattern, qualname):
+                 raise ValueError(
+                     f"Logits processor '{qualname}' is not allowed by this "
+                     "server. See --logits-processor-pattern engine argument "
+                     "for more information.")
+             try:
+                 logits_processor = resolve_obj_by_qualname(qualname)
+             except Exception as e:
+                 raise ValueError(
+                     f"Logits processor '{qualname}' could not be resolved: {e}"
+                 ) from e
+             if isinstance(processor, LogitsProcessorConstructor):
+                 logits_processor = logits_processor(*processor.args or [],
+                                                     **processor.kwargs or {})
+             logits_processors.append(logits_processor)
+         return logits_processors
+     elif processors:
+         raise ValueError(
+             "The `logits_processors` argument is not supported by this "
+             "server. See --logits-processor-pattern engine argument "
+             "for more information.")
+     return None
+
+
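# --- editor's sketch (not part of the diff) ---------------------------------
# What it shows: the allow-list gate in get_logits_processors above. Entries
# may be bare qualified names or LogitsProcessorConstructor specs; any
# qualname failing re.match(pattern, ...) is rejected before import. The
# module path "my_pkg.procs" is hypothetical; the protocol module's import
# path is assumed as in the sketches above.
from vllm.entrypoints.openai.protocol import get_logits_processors

try:
    # The pattern (from --logits-processor-pattern) does not match the
    # qualname, so this raises before any import of "my_pkg" is attempted.
    get_logits_processors(processors=["my_pkg.procs.scale_logits"],
                          pattern=r"other_pkg\..*")
except ValueError as exc:
    print(exc)  # "... is not allowed by this server ..."
# -----------------------------------------------------------------------------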
+ class ChatCompletionRequest(OpenAIBaseModel):
+     # Ordered by official OpenAI API documentation
+     # https://platform.openai.com/docs/api-reference/chat/create
+     messages: list[ChatCompletionMessageParam]
+     model: Optional[str] = None
+     frequency_penalty: Optional[float] = 0.0
+     logit_bias: Optional[dict[str, float]] = None
+     logprobs: Optional[bool] = False
+     top_logprobs: Optional[int] = 0
+     # TODO(#9845): remove max_tokens when field is removed from OpenAI API
+     max_tokens: Optional[int] = Field(
+         default=None,
+         deprecated=
+         'max_tokens is deprecated in favor of the max_completion_tokens field')
+     max_completion_tokens: Optional[int] = None
+     n: Optional[int] = 1
+     presence_penalty: Optional[float] = 0.0
+     response_format: Optional[AnyResponseFormat] = None
+     seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+     stop: Optional[Union[str, list[str]]] = Field(default_factory=list)
+     stream: Optional[bool] = False
+     stream_options: Optional[StreamOptions] = None
+     temperature: Optional[float] = None
+     top_p: Optional[float] = None
+     tools: Optional[list[ChatCompletionToolsParam]] = None
+     tool_choice: Optional[Union[
+         Literal["none"],
+         Literal["auto"],
+         Literal["required"],
+         ChatCompletionNamedToolChoiceParam,
+     ]] = "none"
+
+     # NOTE this will be ignored by vLLM -- the model determines the behavior
+     parallel_tool_calls: Optional[bool] = False
+     user: Optional[str] = None
+
+     # doc: begin-chat-completion-sampling-params
+     best_of: Optional[int] = None
+     use_beam_search: bool = False
+     top_k: Optional[int] = None
+     min_p: Optional[float] = None
+     repetition_penalty: Optional[float] = None
+     length_penalty: float = 1.0
+     stop_token_ids: Optional[list[int]] = Field(default_factory=list)
+     include_stop_str_in_output: bool = False
+     ignore_eos: bool = False
+     min_tokens: int = 0
+     skip_special_tokens: bool = True
+     spaces_between_special_tokens: bool = True
+     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
+     prompt_logprobs: Optional[int] = None
+     # doc: end-chat-completion-sampling-params
+
+     # doc: begin-chat-completion-extra-params
+     echo: bool = Field(
+         default=False,
+         description=(
+             "If true, the new message will be prepended with the last message "
+             "if they belong to the same role."),
+     )
+     add_generation_prompt: bool = Field(
+         default=True,
+         description=
+         ("If true, the generation prompt will be added to the chat template. "
+          "This is a parameter used by chat template in tokenizer config of the "
+          "model."),
+     )
+     continue_final_message: bool = Field(
+         default=False,
+         description=
+         ("If this is set, the chat will be formatted so that the final "
+          "message in the chat is open-ended, without any EOS tokens. The "
+          "model will continue this message rather than starting a new one. "
+          "This allows you to \"prefill\" part of the model's response for it. "
+          "Cannot be used at the same time as `add_generation_prompt`."),
+     )
+     add_special_tokens: bool = Field(
+         default=False,
+         description=(
+             "If true, special tokens (e.g. BOS) will be added to the prompt "
+             "on top of what is added by the chat template. "
+             "For most models, the chat template takes care of adding the "
+             "special tokens so this should be set to false (as is the "
+             "default)."),
+     )
+     documents: Optional[list[dict[str, str]]] = Field(
+         default=None,
+         description=
+         ("A list of dicts representing documents that will be accessible to "
+          "the model if it is performing RAG (retrieval-augmented generation)."
+          " If the template does not support RAG, this argument will have no "
+          "effect. We recommend that each document should be a dict containing "
+          "\"title\" and \"text\" keys."),
+     )
+     chat_template: Optional[str] = Field(
+         default=None,
+         description=(
+             "A Jinja template to use for this conversion. "
+             "As of transformers v4.44, default chat template is no longer "
+             "allowed, so you must provide a chat template if the tokenizer "
+             "does not define one."),
+     )
+     chat_template_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=("Additional kwargs to pass to the template renderer. "
+                      "Will be accessible by the chat template."),
+     )
+     mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+         default=None,
+         description=("Additional kwargs to pass to the HF processor."),
+     )
+     guided_json: Optional[Union[str, dict, BaseModel]] = Field(
+         default=None,
+         description=("If specified, the output will follow the JSON schema."),
+     )
+     guided_regex: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the output will follow the regex pattern."),
+     )
+     guided_choice: Optional[list[str]] = Field(
+         default=None,
+         description=(
+             "If specified, the output will be exactly one of the choices."),
+     )
+     guided_grammar: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the output will follow the context free grammar."),
+     )
+     structural_tag: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, the output will follow the structural tag schema."),
+     )
+     guided_decoding_backend: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, will override the default guided decoding backend "
+             "of the server for this specific request. If set, must be either "
+             "'outlines' or 'lm-format-enforcer'."),
+     )
+     guided_whitespace_pattern: Optional[str] = Field(
+         default=None,
+         description=(
+             "If specified, will override the default whitespace pattern "
+             "for guided json decoding."),
+     )
+     priority: int = Field(
+         default=0,
+         description=(
+             "The priority of the request (lower means earlier handling; "
+             "default: 0). Any priority other than 0 will raise an error "
+             "if the served model does not use priority scheduling."),
+     )
+     request_id: str = Field(
+         default_factory=lambda: f"{random_uuid()}",
+         description=(
+             "The request_id related to this request. If the caller does "
+             "not set it, a random_uuid will be generated. This id is used "
+             "throughout the inference process and returned in the response."),
+     )
+     logits_processors: Optional[LogitsProcessors] = Field(
+         default=None,
+         description=(
+             "A list of either qualified names of logits processors, or "
+             "constructor objects, to apply when sampling. A constructor is "
+             "a JSON object with a required 'qualname' field specifying the "
+             "qualified name of the processor class/factory, and optional "
+             "'args' and 'kwargs' fields containing positional and keyword "
+             "arguments. For example: {'qualname': "
+             "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
+             "{'param': 'value'}}."))
+     return_tokens_as_token_ids: Optional[bool] = Field(
+         default=None,
+         description=(
+             "If specified with 'logprobs', tokens are represented "
+             "as strings of the form 'token_id:{token_id}' so that tokens "
+             "that are not JSON-encodable can be identified."))
+
+     # doc: end-chat-completion-extra-params
+
+     # Default sampling parameters for chat completion requests
+     _DEFAULT_SAMPLING_PARAMS: dict = {
+         "repetition_penalty": 1.0,
+         "temperature": 1.0,
+         "top_p": 1.0,
+         "top_k": -1,
+         "min_p": 0.0,
+     }
+
+     def to_beam_search_params(
+             self,
+             default_max_tokens: int,
+             default_sampling_params: Optional[dict] = None
+     ) -> BeamSearchParams:
+         # TODO(#9845): remove max_tokens when field is removed from OpenAI API
+         max_tokens = self.max_completion_tokens or self.max_tokens
+
+         if default_sampling_params is None:
+             default_sampling_params = {}
+         n = self.n if self.n is not None else 1
+
+         # Use minimum of context window, user request & server limit.
+         max_tokens = min(
+             val for val in (default_max_tokens, max_tokens,
+                             default_sampling_params.get("max_tokens", None))
+             if val is not None)
+
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+
+         return BeamSearchParams(
+             beam_width=n,
+             max_tokens=max_tokens,
+             ignore_eos=self.ignore_eos,
+             temperature=temperature,
+             length_penalty=self.length_penalty,
+             include_stop_str_in_output=self.include_stop_str_in_output,
+         )
+
+     def to_sampling_params(
+             self,
+             default_max_tokens: int,
+             logits_processor_pattern: Optional[str],
+             default_sampling_params: Optional[dict] = None,
+     ) -> SamplingParams:
+         # TODO(#9845): remove max_tokens when field is removed from OpenAI API
+         max_tokens = self.max_completion_tokens or self.max_tokens
+
+         if default_sampling_params is None:
+             default_sampling_params = {}
+
+         # Use minimum of context window, user request & server limit.
+         max_tokens = min(
+             val for val in (default_max_tokens, max_tokens,
+                             default_sampling_params.get("max_tokens", None))
+             if val is not None)
+
+         # Default parameters
+         if (repetition_penalty := self.repetition_penalty) is None:
+             repetition_penalty = default_sampling_params.get(
+                 "repetition_penalty",
+                 self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+             )
+         if (temperature := self.temperature) is None:
+             temperature = default_sampling_params.get(
+                 "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+         if (top_p := self.top_p) is None:
+             top_p = default_sampling_params.get(
+                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+         if (top_k := self.top_k) is None:
+             top_k = default_sampling_params.get(
+                 "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+         if (min_p := self.min_p) is None:
+             min_p = default_sampling_params.get(
+                 "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+
+         prompt_logprobs = self.prompt_logprobs
+         if prompt_logprobs is None and self.echo:
+             prompt_logprobs = self.top_logprobs
+
+         guided_json_object = None
+         if self.response_format is not None:
+             if self.response_format.type == "json_object":
+                 guided_json_object = True
+             elif self.response_format.type == "json_schema":
+                 json_schema = self.response_format.json_schema
+                 assert json_schema is not None
+                 self.guided_json = json_schema.json_schema
+             elif self.response_format.type == "structural_tag":
+                 structural_tag = self.response_format
+                 assert structural_tag is not None and isinstance(
+                     structural_tag, StructuralTagResponseFormat)
+                 s_tag_obj = structural_tag.model_dump(by_alias=True)
+                 self.structural_tag = json.dumps(s_tag_obj)
+
+         guided_decoding = GuidedDecodingParams.from_optional(
+             json=self._get_guided_json_from_tool() or self.guided_json,
+             regex=self.guided_regex,
+             choice=self.guided_choice,
+             grammar=self.guided_grammar,
+             json_object=guided_json_object,
+             backend=self.guided_decoding_backend,
+             whitespace_pattern=self.guided_whitespace_pattern,
+             structural_tag=self.structural_tag,
+         )
+
+         return SamplingParams.from_optional(
+             n=self.n,
+             best_of=self.best_of,
+             presence_penalty=self.presence_penalty,
+             frequency_penalty=self.frequency_penalty,
+             repetition_penalty=repetition_penalty,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             min_p=min_p,
+             seed=self.seed,
+             stop=self.stop,
+             stop_token_ids=self.stop_token_ids,
+             logprobs=self.top_logprobs if self.logprobs else None,
+             prompt_logprobs=prompt_logprobs,
+             ignore_eos=self.ignore_eos,
+             max_tokens=max_tokens,
+             min_tokens=self.min_tokens,
+             skip_special_tokens=self.skip_special_tokens,
+             spaces_between_special_tokens=self.spaces_between_special_tokens,
+             logits_processors=get_logits_processors(self.logits_processors,
+                                                     logits_processor_pattern),
+             include_stop_str_in_output=self.include_stop_str_in_output,
+             truncate_prompt_tokens=self.truncate_prompt_tokens,
+             output_kind=RequestOutputKind.DELTA if self.stream
+             else RequestOutputKind.FINAL_ONLY,
+             guided_decoding=guided_decoding,
+             logit_bias=self.logit_bias)
+
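# --- editor's sketch (not part of the diff) ---------------------------------
# What it shows: the precedence when to_sampling_params resolves max_tokens
# (smallest of context-window room, the request, and a server default) and
# how omitted sampling fields fall back to server defaults. Values are
# illustrative; import path assumed as in the sketches above.
from vllm.entrypoints.openai.protocol import ChatCompletionRequest

req = ChatCompletionRequest(messages=[{"role": "user", "content": "hi"}],
                            max_completion_tokens=512)
params = req.to_sampling_params(
    default_max_tokens=4096,     # tokens left in the context window
    logits_processor_pattern=None,
    default_sampling_params={"max_tokens": 1024, "temperature": 0.7},
)
assert params.max_tokens == 512    # min(4096, 512, 1024)
assert params.temperature == 0.7   # request omitted it -> server default wins
# -----------------------------------------------------------------------------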
+     def _get_guided_json_from_tool(
+             self) -> Optional[Union[str, dict, BaseModel]]:
+         # user has chosen to not use any tool
+         if self.tool_choice == "none" or self.tools is None:
+             return None
+
+         # user has chosen to use a named tool
+         if type(self.tool_choice) is ChatCompletionNamedToolChoiceParam:
+             tool_name = self.tool_choice.function.name
+             tools = {tool.function.name: tool.function for tool in self.tools}
+             if tool_name not in tools:
+                 raise ValueError(
+                     f"Tool '{tool_name}' has not been passed in `tools`.")
+             tool = tools[tool_name]
+             return tool.parameters
+
+         if self.tool_choice == "required":
+             # Pydantic schema generation cannot be used since the JSON schema
+             # has to be constructed for a specific instantiation of a tool list
+             # so that parameters of a function are correctly generated
+             # based on the chosen function name
+             def get_tool_schema(tool: ChatCompletionToolsParam) -> dict:
+                 return {
+                     "properties": {
+                         "name": {
+                             "type": "string",
+                             "enum": [tool.function.name]
+                         },
+                         # parameters are always generated as '{}' in the final
+                         # output if they are missing from the request
+                         # (i.e. are None or '{}') so the schema is
+                         # updated to produce an empty object in that case
+                         "parameters": tool.function.parameters
+                         if tool.function.parameters else {
+                             "type": "object",
+                             "properties": {}
+                         }
+                     },
+                     "required": ["name", "parameters"]
+                 }
+
+             json_schema = {
+                 "type": "array",
+                 "minItems": 1,
+                 "items": {
+                     "type": "object",
+                     "anyOf": [get_tool_schema(tool) for tool in self.tools]
+                 }
+             }
+             return json_schema
+
+         return None
+
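# --- editor's note (not part of the diff) -----------------------------------
# For tool_choice="required", _get_guided_json_from_tool synthesizes a JSON
# schema forcing a non-empty array of {name, parameters} tool calls, with one
# anyOf arm per declared tool. For a single hypothetical tool
# get_weather(city: str), the code above would produce:
#
#   {"type": "array", "minItems": 1,
#    "items": {"type": "object",
#              "anyOf": [{"properties": {
#                             "name": {"type": "string",
#                                      "enum": ["get_weather"]},
#                             "parameters": {"type": "object",
#                                            "properties": {
#                                                "city": {"type": "string"}}}},
#                         "required": ["name", "parameters"]}]}}
# -----------------------------------------------------------------------------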
603
+ @model_validator(mode="before")
604
+ @classmethod
605
+ def validate_stream_options(cls, data):
606
+ if data.get("stream_options") and not data.get("stream"):
607
+ raise ValueError(
608
+ "Stream options can only be defined when `stream=True`.")
609
+
610
+ return data
611
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_logprobs(cls, data):
+        if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
+            if data.get("stream") and prompt_logprobs > 0:
+                raise ValueError(
+                    "`prompt_logprobs` are not available when `stream=True`.")
+
+            if prompt_logprobs < 0:
+                raise ValueError(
+                    "`prompt_logprobs` must be a non-negative value.")
+
+        if (top_logprobs := data.get("top_logprobs")) is not None:
+            if top_logprobs < 0:
+                raise ValueError(
+                    "`top_logprobs` must be a non-negative value.")
+
+            if top_logprobs > 0 and not data.get("logprobs"):
+                raise ValueError(
+                    "When using `top_logprobs`, `logprobs` must be set to "
+                    "true.")
+
+        return data
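
A quick sketch of the validator above in action (editor's illustration,
not part of the diff; pydantic surfaces the message as a ValidationError,
which subclasses ValueError):

    try:
        ChatCompletionRequest(
            messages=[{"role": "user", "content": "Hi"}],
            top_logprobs=5,  # logprobs is left unset
        )
    except ValueError as exc:
        print(exc)  # mentions that `logprobs` must be set to true
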
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_guided_decoding_count(cls, data):
+        if isinstance(data, ValueError):
+            raise data
+
+        guide_count = sum([
+            "guided_json" in data and data["guided_json"] is not None,
+            "guided_regex" in data and data["guided_regex"] is not None,
+            "guided_choice" in data and data["guided_choice"] is not None
+        ])
+        # you can only use one kind of guided decoding
+        if guide_count > 1:
+            raise ValueError(
+                "You can only use one kind of guided decoding "
+                "('guided_json', 'guided_regex' or 'guided_choice').")
+        # you can only either use guided decoding or tools, not both
+        if guide_count >= 1 and data.get("tool_choice", "none") not in (
+                "none",
+                "auto",
+                "required",
+        ):
+            raise ValueError(
+                "You can only either use guided decoding or tools, not both.")
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_tool_usage(cls, data):
+
+        # if "tool_choice" is not specified but tools are provided,
+        # default to "auto" tool_choice
+        if "tool_choice" not in data and data.get("tools"):
+            data["tool_choice"] = "auto"
+
+        # if "tool_choice" is "none" -- ignore tools if present
+        if "tool_choice" in data and data["tool_choice"] == "none":
+            # ensure that no tools are present
+            data.pop("tools", None)
+            return data
+
+        # if "tool_choice" is specified -- validation
+        if "tool_choice" in data:
+
+            # ensure that if "tool_choice" is specified, tools are present
+            if "tools" not in data or data["tools"] is None:
+                raise ValueError(
+                    "When using `tool_choice`, `tools` must be set.")
+
+            # make sure that tool choice is either a named tool
+            # OR that it's set to "auto" or "required"
+            if data["tool_choice"] not in [
+                    "auto", "required"
+            ] and not isinstance(data["tool_choice"], dict):
+                raise NotImplementedError(
+                    f'Invalid value for `tool_choice`: {data["tool_choice"]}! '
+                    'Only named tools, "none", "auto" or "required" '
+                    'are supported.')
+
+            # ensure that if "tool_choice" is specified as an object,
+            # it matches a valid tool
+            if isinstance(data["tool_choice"], dict):
+                valid_tool = False
+                specified_function = data["tool_choice"].get("function")
+                if not specified_function:
+                    raise ValueError(
+                        "Expected field `function` in `tool_choice`."
+                        " Correct usage: `{\"type\": \"function\","
+                        " \"function\": {\"name\": \"my_function\"}}`")
+                specified_function_name = specified_function.get("name")
+                if not specified_function_name:
+                    raise ValueError(
+                        "Expected field `name` in `function` in `tool_choice`."
+                        " Correct usage: `{\"type\": \"function\", "
+                        "\"function\": {\"name\": \"my_function\"}}`")
+                for tool in data["tools"]:
+                    if tool["function"]["name"] == specified_function_name:
+                        valid_tool = True
+                        break
+                if not valid_tool:
+                    raise ValueError(
+                        "The tool specified in `tool_choice` does not match "
+                        "any of the specified `tools`")
+        return data
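
A sketch of the defaulting behavior above (editor's illustration, not part
of the diff; the tool payload shape is an assumption based on the OpenAI
tools format):

    req = ChatCompletionRequest(
        messages=[{"role": "user", "content": "What is 2+2?"}],
        tools=[{"type": "function", "function": {"name": "calculator"}}],
    )
    # "tool_choice" was absent, so check_tool_usage defaulted it to "auto".
    assert req.tool_choice == "auto"
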
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_generation_prompt(cls, data):
+        if data.get("continue_final_message") and data.get(
+                "add_generation_prompt"):
+            raise ValueError("Cannot set both `continue_final_message` and "
+                             "`add_generation_prompt` to True.")
+        return data
+
729
+
730
+ class CompletionRequest(OpenAIBaseModel):
731
+ # Ordered by official OpenAI API documentation
732
+ # https://platform.openai.com/docs/api-reference/completions/create
733
+ model: Optional[str] = None
734
+ prompt: Union[list[int], list[list[int]], str, list[str]]
735
+ best_of: Optional[int] = None
736
+ echo: Optional[bool] = False
737
+ frequency_penalty: Optional[float] = 0.0
738
+ logit_bias: Optional[dict[str, float]] = None
739
+ logprobs: Optional[int] = None
740
+ max_tokens: Optional[int] = 16
741
+ n: int = 1
742
+ presence_penalty: Optional[float] = 0.0
743
+ seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
744
+ stop: Optional[Union[str, list[str]]] = Field(default_factory=list)
745
+ stream: Optional[bool] = False
746
+ stream_options: Optional[StreamOptions] = None
747
+ suffix: Optional[str] = None
748
+ temperature: Optional[float] = None
749
+ top_p: Optional[float] = None
750
+ user: Optional[str] = None
751
+
752
+ # doc: begin-completion-sampling-params
753
+ use_beam_search: bool = False
754
+ top_k: Optional[int] = None
755
+ min_p: Optional[float] = None
756
+ repetition_penalty: Optional[float] = None
757
+ length_penalty: float = 1.0
758
+ stop_token_ids: Optional[list[int]] = Field(default_factory=list)
759
+ include_stop_str_in_output: bool = False
760
+ ignore_eos: bool = False
761
+ min_tokens: int = 0
762
+ skip_special_tokens: bool = True
763
+ spaces_between_special_tokens: bool = True
764
+ truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
765
+ allowed_token_ids: Optional[list[int]] = None
766
+ prompt_logprobs: Optional[int] = None
767
+ # doc: end-completion-sampling-params
768
+
769
+ # doc: begin-completion-extra-params
770
+ add_special_tokens: bool = Field(
771
+ default=True,
772
+ description=(
773
+ "If true (the default), special tokens (e.g. BOS) will be added to "
774
+ "the prompt."),
775
+ )
776
+ response_format: Optional[AnyResponseFormat] = Field(
777
+ default=None,
778
+ description=(
779
+ "Similar to chat completion, this parameter specifies the format "
780
+ "of output. Only {'type': 'json_object'}, {'type': 'json_schema'}"
781
+ ", {'type': 'structural_tag'}, or {'type': 'text' } is supported."
782
+ ),
783
+ )
784
+ guided_json: Optional[Union[str, dict, BaseModel]] = Field(
785
+ default=None,
786
+ description="If specified, the output will follow the JSON schema.",
787
+ )
788
+ guided_regex: Optional[str] = Field(
789
+ default=None,
790
+ description=(
791
+ "If specified, the output will follow the regex pattern."),
792
+ )
793
+ guided_choice: Optional[list[str]] = Field(
794
+ default=None,
795
+ description=(
796
+ "If specified, the output will be exactly one of the choices."),
797
+ )
798
+ guided_grammar: Optional[str] = Field(
799
+ default=None,
800
+ description=(
801
+ "If specified, the output will follow the context free grammar."),
802
+ )
803
+ guided_decoding_backend: Optional[str] = Field(
804
+ default=None,
805
+ description=(
806
+ "If specified, will override the default guided decoding backend "
807
+ "of the server for this specific request. If set, must be one of "
808
+ "'outlines' / 'lm-format-enforcer'"),
809
+ )
810
+ guided_whitespace_pattern: Optional[str] = Field(
811
+ default=None,
812
+ description=(
813
+ "If specified, will override the default whitespace pattern "
814
+ "for guided json decoding."),
815
+ )
816
+ priority: int = Field(
817
+ default=0,
818
+ description=(
819
+ "The priority of the request (lower means earlier handling; "
820
+ "default: 0). Any priority other than 0 will raise an error "
821
+ "if the served model does not use priority scheduling."),
822
+ )
823
+ logits_processors: Optional[LogitsProcessors] = Field(
824
+ default=None,
825
+ description=(
826
+ "A list of either qualified names of logits processors, or "
827
+ "constructor objects, to apply when sampling. A constructor is "
828
+ "a JSON object with a required 'qualname' field specifying the "
829
+ "qualified name of the processor class/factory, and optional "
830
+ "'args' and 'kwargs' fields containing positional and keyword "
831
+ "arguments. For example: {'qualname': "
832
+ "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
833
+ "{'param': 'value'}}."))
834
+
835
+ return_tokens_as_token_ids: Optional[bool] = Field(
836
+ default=None,
837
+ description=(
838
+ "If specified with 'logprobs', tokens are represented "
839
+ " as strings of the form 'token_id:{token_id}' so that tokens "
840
+ "that are not JSON-encodable can be identified."))
841
+
842
+ # doc: end-completion-extra-params
843
+
844
+ # Default sampling parameters for completion requests
845
+ _DEFAULT_SAMPLING_PARAMS: dict = {
846
+ "repetition_penalty": 1.0,
847
+ "temperature": 1.0,
848
+ "top_p": 1.0,
849
+ "top_k": -1,
850
+ "min_p": 0.0,
851
+ }
852
+
+    def to_beam_search_params(
+        self,
+        default_max_tokens: int,
+        default_sampling_params: Optional[dict] = None
+    ) -> BeamSearchParams:
+        max_tokens = self.max_tokens
+
+        if default_sampling_params is None:
+            default_sampling_params = {}
+        n = self.n if self.n is not None else 1
+
+        # Use minimum of context window, user request & server limit.
+        max_tokens = min(
+            val for val in (default_max_tokens, max_tokens,
+                            default_sampling_params.get("max_tokens", None))
+            if val is not None)
+
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get("temperature", 1.0)
+
+        return BeamSearchParams(
+            beam_width=n,
+            max_tokens=max_tokens,
+            ignore_eos=self.ignore_eos,
+            temperature=temperature,
+            length_penalty=self.length_penalty,
+            include_stop_str_in_output=self.include_stop_str_in_output,
+        )
+
+    def to_sampling_params(
+        self,
+        default_max_tokens: int,
+        logits_processor_pattern: Optional[str],
+        default_sampling_params: Optional[dict] = None,
+    ) -> SamplingParams:
+        max_tokens = self.max_tokens
+
+        if default_sampling_params is None:
+            default_sampling_params = {}
+
+        # Use minimum of context window, user request & server limit.
+        max_tokens = min(
+            val for val in (default_max_tokens, max_tokens,
+                            default_sampling_params.get("max_tokens", None))
+            if val is not None)
+
+        # Default parameters
+        if (repetition_penalty := self.repetition_penalty) is None:
+            repetition_penalty = default_sampling_params.get(
+                "repetition_penalty",
+                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+            )
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+        if (top_p := self.top_p) is None:
+            top_p = default_sampling_params.get(
+                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+        if (top_k := self.top_k) is None:
+            top_k = default_sampling_params.get(
+                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+        if (min_p := self.min_p) is None:
+            min_p = default_sampling_params.get(
+                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+
+        prompt_logprobs = self.prompt_logprobs
+        if prompt_logprobs is None and self.echo:
+            prompt_logprobs = self.logprobs
+
+        echo_without_generation = self.echo and self.max_tokens == 0
+
+        guided_json_object = None
+        if (self.response_format is not None
+                and self.response_format.type == "json_object"):
+            guided_json_object = True
+
+        guided_decoding = GuidedDecodingParams.from_optional(
+            json=self.guided_json,
+            regex=self.guided_regex,
+            choice=self.guided_choice,
+            grammar=self.guided_grammar,
+            json_object=guided_json_object,
+            backend=self.guided_decoding_backend,
+            whitespace_pattern=self.guided_whitespace_pattern,
+        )
+
+        return SamplingParams.from_optional(
+            n=self.n,
+            best_of=self.best_of,
+            presence_penalty=self.presence_penalty,
+            frequency_penalty=self.frequency_penalty,
+            repetition_penalty=repetition_penalty,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            seed=self.seed,
+            stop=self.stop,
+            stop_token_ids=self.stop_token_ids,
+            logprobs=self.logprobs,
+            ignore_eos=self.ignore_eos,
+            max_tokens=max_tokens if not echo_without_generation else 1,
+            min_tokens=self.min_tokens,
+            prompt_logprobs=prompt_logprobs,
+            skip_special_tokens=self.skip_special_tokens,
+            spaces_between_special_tokens=self.spaces_between_special_tokens,
+            include_stop_str_in_output=self.include_stop_str_in_output,
+            logits_processors=get_logits_processors(self.logits_processors,
+                                                     logits_processor_pattern),
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            output_kind=RequestOutputKind.DELTA if self.stream \
+            else RequestOutputKind.FINAL_ONLY,
+            guided_decoding=guided_decoding,
+            logit_bias=self.logit_bias,
+            allowed_token_ids=self.allowed_token_ids)
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_guided_decoding_count(cls, data):
+        guide_count = sum([
+            "guided_json" in data and data["guided_json"] is not None,
+            "guided_regex" in data and data["guided_regex"] is not None,
+            "guided_choice" in data and data["guided_choice"] is not None
+        ])
+        if guide_count > 1:
+            raise ValueError(
+                "You can only use one kind of guided decoding "
+                "('guided_json', 'guided_regex' or 'guided_choice').")
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_logprobs(cls, data):
+        if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
+            if data.get("stream") and prompt_logprobs > 0:
+                raise ValueError(
+                    "`prompt_logprobs` are not available when `stream=True`.")
+
+            if prompt_logprobs < 0:
+                raise ValueError(
+                    "`prompt_logprobs` must be a non-negative value.")
+
+        if (logprobs := data.get("logprobs")) is not None and logprobs < 0:
+            raise ValueError("`logprobs` must be a non-negative value.")
+
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_stream_options(cls, data):
+        if data.get("stream_options") and not data.get("stream"):
+            raise ValueError(
+                "Stream options can only be defined when `stream=True`.")
+
+        return data
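
A minimal sketch of how the pieces above compose (editor's illustration,
not part of the diff): the effective max_tokens is the minimum of the
request value and the server-side limit, and unset sampling fields fall
back to _DEFAULT_SAMPLING_PARAMS.

    req = CompletionRequest(prompt="Once upon a time", max_tokens=64)
    params = req.to_sampling_params(
        default_max_tokens=32,  # e.g. the remaining context window
        logits_processor_pattern=None,
    )
    assert params.max_tokens == 32
    assert params.temperature == 1.0  # from _DEFAULT_SAMPLING_PARAMS
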
+
+
+class EmbeddingCompletionRequest(OpenAIBaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/embeddings
+    model: Optional[str] = None
+    input: Union[list[int], list[list[int]], str, list[str]]
+    encoding_format: Literal["float", "base64"] = "float"
+    dimensions: Optional[int] = None
+    user: Optional[str] = None
+    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
+
+    # doc: begin-embedding-pooling-params
+    additional_data: Optional[Any] = None
+    # doc: end-embedding-pooling-params
+
+    # doc: begin-embedding-extra-params
+    add_special_tokens: bool = Field(
+        default=True,
+        description=(
+            "If true (the default), special tokens (e.g. BOS) will be added "
+            "to the prompt."),
+    )
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."),
+    )
+
+    # doc: end-embedding-extra-params
+
+    def to_pooling_params(self):
+        return PoolingParams(dimensions=self.dimensions,
+                             additional_data=self.additional_data)
+
+
+class EmbeddingChatRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    messages: list[ChatCompletionMessageParam]
+
+    encoding_format: Literal["float", "base64"] = "float"
+    dimensions: Optional[int] = None
+    user: Optional[str] = None
+    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
+
+    # doc: begin-chat-embedding-pooling-params
+    additional_data: Optional[Any] = None
+    # doc: end-chat-embedding-pooling-params
+
+    # doc: begin-chat-embedding-extra-params
+    add_special_tokens: bool = Field(
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens so this should be set to false (as is the "
+            "default)."),
+    )
+    chat_template: Optional[str] = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "As of transformers v4.44, default chat template is no longer "
+            "allowed, so you must provide a chat template if the tokenizer "
+            "does not define one."),
+    )
+    chat_template_kwargs: Optional[dict[str, Any]] = Field(
+        default=None,
+        description=("Additional kwargs to pass to the template renderer. "
+                     "Will be accessible by the chat template."),
+    )
+    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+        default=None,
+        description=("Additional kwargs to pass to the HF processor."),
+    )
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."),
+    )
+    # doc: end-chat-embedding-extra-params
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_generation_prompt(cls, data):
+        if data.get("continue_final_message") and data.get(
+                "add_generation_prompt"):
+            raise ValueError("Cannot set both `continue_final_message` and "
+                             "`add_generation_prompt` to True.")
+        return data
+
+    def to_pooling_params(self):
+        return PoolingParams(dimensions=self.dimensions,
+                             additional_data=self.additional_data)
+
+
+EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest]
+
+PoolingCompletionRequest = EmbeddingCompletionRequest
+PoolingChatRequest = EmbeddingChatRequest
+PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest]
+
+
+class ScoreRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    text_1: Union[list[str], str]
+    text_2: Union[list[str], str]
+    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
+
+    # doc: begin-score-pooling-params
+    additional_data: Optional[Any] = None
+    # doc: end-score-pooling-params
+
+    # doc: begin-score-extra-params
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."),
+    )
+
+    # doc: end-score-extra-params
+
+    def to_pooling_params(self):
+        return PoolingParams(additional_data=self.additional_data)
+
+
+class RerankRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    query: str
+    documents: list[str]
+    top_n: int = Field(default_factory=lambda: 0)
+    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
+
+    # doc: begin-rerank-pooling-params
+    additional_data: Optional[Any] = None
+    # doc: end-rerank-pooling-params
+
+    # doc: begin-rerank-extra-params
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."),
+    )
+
+    # doc: end-rerank-extra-params
+
+    def to_pooling_params(self):
+        return PoolingParams(additional_data=self.additional_data)
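
A usage sketch (editor's illustration, not part of the diff): a rerank
request pairs one query with several documents, and its pooling parameters
carry only the optional additional_data field.

    req = RerankRequest(
        query="What is the capital of France?",
        documents=["Paris is the capital of France.",
                   "Berlin is in Germany."],
        top_n=1,
    )
    pooling = req.to_pooling_params()
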
+
+
+class RerankDocument(BaseModel):
+    text: str
+
+
+class RerankResult(BaseModel):
+    index: int
+    document: RerankDocument
+    relevance_score: float
+
+
+class RerankUsage(BaseModel):
+    total_tokens: int
+
+
+class RerankResponse(OpenAIBaseModel):
+    id: str
+    model: str
+    usage: RerankUsage
+    results: list[RerankResult]
+
+
+class CompletionLogProbs(OpenAIBaseModel):
+    text_offset: list[int] = Field(default_factory=list)
+    token_logprobs: list[Optional[float]] = Field(default_factory=list)
+    tokens: list[str] = Field(default_factory=list)
+    top_logprobs: list[Optional[dict[str,
+                                     float]]] = Field(default_factory=list)
+
+
+class CompletionResponseChoice(OpenAIBaseModel):
+    index: int
+    text: str
+    logprobs: Optional[CompletionLogProbs] = None
+    finish_reason: Optional[str] = None
+    stop_reason: Optional[Union[int, str]] = Field(
+        default=None,
+        description=(
+            "The stop string or token id that caused the completion "
+            "to stop, None if the completion finished for some other reason "
+            "including encountering the EOS token"),
+    )
+    prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
+
+
+class CompletionResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[CompletionResponseChoice]
+    usage: UsageInfo
+
+
+class CompletionResponseStreamChoice(OpenAIBaseModel):
+    index: int
+    text: str
+    logprobs: Optional[CompletionLogProbs] = None
+    finish_reason: Optional[str] = None
+    stop_reason: Optional[Union[int, str]] = Field(
+        default=None,
+        description=(
+            "The stop string or token id that caused the completion "
+            "to stop, None if the completion finished for some other reason "
+            "including encountering the EOS token"),
+    )
+
+
+class CompletionStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[CompletionResponseStreamChoice]
+    usage: Optional[UsageInfo] = Field(default=None)
+
+
+class EmbeddingResponseData(OpenAIBaseModel):
+    index: int
+    object: str = "embedding"
+    embedding: Union[list[float], str]
+
+
+class EmbeddingResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
+    object: str = "list"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    data: list[EmbeddingResponseData]
+    usage: UsageInfo
+
+
+class PoolingResponseData(OpenAIBaseModel):
+    index: int
+    object: str = "pooling"
+    data: Union[list[list[float]], list[float], str]
+
+
+class PoolingResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"pool-{random_uuid()}")
+    object: str = "list"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    data: list[PoolingResponseData]
+    usage: UsageInfo
+
+
+class ScoreResponseData(OpenAIBaseModel):
+    index: int
+    object: str = "score"
+    score: float
+
+
+class ScoreResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
+    object: str = "list"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    data: list[ScoreResponseData]
+    usage: UsageInfo
+
+
+class FunctionCall(OpenAIBaseModel):
+    name: str
+    arguments: str
+
+
+class ToolCall(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-tool-{random_uuid()}")
+    type: Literal["function"] = "function"
+    function: FunctionCall
+
+
+class DeltaFunctionCall(BaseModel):
+    name: Optional[str] = None
+    arguments: Optional[str] = None
+
+
+# a tool call delta where everything is optional
+class DeltaToolCall(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-tool-{random_uuid()}")
+    type: Literal["function"] = "function"
+    index: int
+    function: Optional[DeltaFunctionCall] = None
+
+
+class ExtractedToolCallInformation(BaseModel):
+    # indicate if tools were called
+    tools_called: bool
+
+    # extracted tool calls
+    tool_calls: list[ToolCall]
+
+    # content - per OpenAI spec, content AND tool calls can be returned rarely
+    # But some models will do this intentionally
+    content: Optional[str] = None
+
+
+class ChatMessage(OpenAIBaseModel):
+    role: str
+    reasoning_content: Optional[str] = None
+    content: Optional[str] = None
+    tool_calls: list[ToolCall] = Field(default_factory=list)
+
+
+class ChatCompletionLogProb(OpenAIBaseModel):
+    token: str
+    logprob: float = -9999.0
+    bytes: Optional[list[int]] = None
+
+
+class ChatCompletionLogProbsContent(ChatCompletionLogProb):
+    # Workaround: redefine the field-name cache so that it's not
+    # shared with the superclass.
+    field_names: ClassVar[Optional[set[str]]] = None
+    top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)
+
+
+class ChatCompletionLogProbs(OpenAIBaseModel):
+    content: Optional[list[ChatCompletionLogProbsContent]] = None
+
+
+class ChatCompletionResponseChoice(OpenAIBaseModel):
+    index: int
+    message: ChatMessage
+    logprobs: Optional[ChatCompletionLogProbs] = None
+    # per OpenAI spec this is the default
+    finish_reason: Optional[str] = "stop"
+    # not part of the OpenAI spec but included in vLLM for legacy reasons
+    stop_reason: Optional[Union[int, str]] = None
+
+
+class ChatCompletionResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: Literal["chat.completion"] = "chat.completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[ChatCompletionResponseChoice]
+    usage: UsageInfo
+    prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
+
+
+class DeltaMessage(OpenAIBaseModel):
+    role: Optional[str] = None
+    content: Optional[str] = None
+    reasoning_content: Optional[str] = None
+    tool_calls: list[DeltaToolCall] = Field(default_factory=list)
+
+
+class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
+    index: int
+    delta: DeltaMessage
+    logprobs: Optional[ChatCompletionLogProbs] = None
+    finish_reason: Optional[str] = None
+    stop_reason: Optional[Union[int, str]] = None
+
+
+class ChatCompletionStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[ChatCompletionResponseStreamChoice]
+    usage: Optional[UsageInfo] = Field(default=None)
+
+
+class TranscriptionResponseStreamChoice(OpenAIBaseModel):
+    delta: DeltaMessage
+    finish_reason: Optional[str] = None
+    stop_reason: Optional[Union[int, str]] = None
+
+
+class TranscriptionStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"trsc-{random_uuid()}")
+    object: Literal["transcription.chunk"] = "transcription.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[TranscriptionResponseStreamChoice]
+    usage: Optional[UsageInfo] = Field(default=None)
+
+
+class BatchRequestInput(OpenAIBaseModel):
+    """
+    The per-line object of the batch input file.
+
+    NOTE: Currently the `/v1/chat/completions`, `/v1/embeddings`, and
+    `/v1/score` endpoints are supported.
+    """
+
+    # A developer-provided per-request id that will be used to match outputs to
+    # inputs. Must be unique for each request in a batch.
+    custom_id: str
+
+    # The HTTP method to be used for the request. Currently only POST is
+    # supported.
+    method: str
+
+    # The OpenAI API relative URL to be used for the request. Currently
+    # /v1/chat/completions, /v1/embeddings, and /v1/score are supported.
+    url: str
+
+    # The parameters of the request.
+    body: Union[ChatCompletionRequest, EmbeddingRequest, ScoreRequest]
+
+    @field_validator('body', mode='plain')
+    @classmethod
+    def check_type_for_url(cls, value: Any, info: ValidationInfo):
+        # Use url to disambiguate models
+        url = info.data['url']
+        if url == "/v1/chat/completions":
+            return ChatCompletionRequest.model_validate(value)
+        if url == "/v1/embeddings":
+            return TypeAdapter(EmbeddingRequest).validate_python(value)
+        if url == "/v1/score":
+            return ScoreRequest.model_validate(value)
+        return TypeAdapter(Union[ChatCompletionRequest, EmbeddingRequest,
+                                 ScoreRequest]).validate_python(value)
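
A sketch of one batch-input line (editor's illustration, not part of the
diff): check_type_for_url routes the body by `url`, so a chat-completions
line validates into a ChatCompletionRequest.

    line = {
        "custom_id": "request-1",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"messages": [{"role": "user", "content": "Hello"}]},
    }
    item = BatchRequestInput.model_validate(line)
    assert isinstance(item.body, ChatCompletionRequest)
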
+
+
+class BatchResponseData(OpenAIBaseModel):
+    # HTTP status code of the response.
+    status_code: int = 200
+
+    # A unique identifier for the API request.
+    request_id: str
+
+    # The body of the response.
+    body: Optional[Union[ChatCompletionResponse, EmbeddingResponse,
+                         ScoreResponse]] = None
+
+
+class BatchRequestOutput(OpenAIBaseModel):
+    """
+    The per-line object of the batch output and error files.
+    """
+
+    id: str
+
+    # A developer-provided per-request id that will be used to match outputs to
+    # inputs.
+    custom_id: str
+
+    response: Optional[BatchResponseData]
+
+    # For requests that failed with a non-HTTP error, this will contain more
+    # information on the cause of the failure.
+    error: Optional[Any]
+
+
+class TokenizeCompletionRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    prompt: str
+
+    add_special_tokens: bool = Field(
+        default=True,
+        description=(
+            "If true (the default), special tokens (e.g. BOS) will be added "
+            "to the prompt."),
+    )
+
+
+class TokenizeChatRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    messages: list[ChatCompletionMessageParam]
+
+    add_generation_prompt: bool = Field(
+        default=True,
+        description=
+        ("If true, the generation prompt will be added to the chat template. "
+         "This is a parameter used by the chat template in the tokenizer "
+         "config of the model."),
+    )
+    continue_final_message: bool = Field(
+        default=False,
+        description=
+        ("If this is set, the chat will be formatted so that the final "
+         "message in the chat is open-ended, without any EOS tokens. The "
+         "model will continue this message rather than starting a new one. "
+         "This allows you to \"prefill\" part of the model's response for it. "
+         "Cannot be used at the same time as `add_generation_prompt`."),
+    )
+    add_special_tokens: bool = Field(
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens so this should be set to false (as is the "
+            "default)."),
+    )
+    chat_template: Optional[str] = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "As of transformers v4.44, default chat template is no longer "
+            "allowed, so you must provide a chat template if the tokenizer "
+            "does not define one."),
+    )
+    chat_template_kwargs: Optional[dict[str, Any]] = Field(
+        default=None,
+        description=("Additional kwargs to pass to the template renderer. "
+                     "Will be accessible by the chat template."),
+    )
+    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+        default=None,
+        description=("Additional kwargs to pass to the HF processor."),
+    )
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_generation_prompt(cls, data):
+        if data.get("continue_final_message") and data.get(
+                "add_generation_prompt"):
+            raise ValueError("Cannot set both `continue_final_message` and "
+                             "`add_generation_prompt` to True.")
+        return data
+
+
+TokenizeRequest = Union[TokenizeCompletionRequest, TokenizeChatRequest]
+
+
+class TokenizeResponse(OpenAIBaseModel):
+    count: int
+    max_model_len: int
+    tokens: list[int]
+
+
+class DetokenizeRequest(OpenAIBaseModel):
+    model: Optional[str] = None
+    tokens: list[int]
+
+
+class DetokenizeResponse(OpenAIBaseModel):
+    prompt: str
+
+
+class LoadLoRAAdapterRequest(BaseModel):
+    lora_name: str
+    lora_path: str
+
+
+class UnloadLoRAAdapterRequest(BaseModel):
+    lora_name: str
+    lora_int_id: Optional[int] = Field(default=None)
+
+
+## Protocols for Audio
+AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json",
+                                         "vtt"]
+
+
+class TranscriptionRequest(OpenAIBaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/audio/createTranscription
+
+    file: UploadFile
+    """
+    The audio file object (not file name) to transcribe, in one of these
+    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+    """
+
+    model: Optional[str] = None
+    """ID of the model to use.
+    """
+
+    language: Optional[str] = None
+    """The language of the input audio.
+
+    Supplying the input language in
+    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
+    will improve accuracy and latency.
+    """
+
+    prompt: str = Field(default="")
+    """An optional text to guide the model's style or continue a previous audio
+    segment.
+
+    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+    should match the audio language.
+    """
+
+    response_format: AudioResponseFormat = Field(default="json")
+    """
+    The format of the output, in one of these options: `json`, `text`, `srt`,
+    `verbose_json`, or `vtt`.
+    """
+
+    ## TODO (varun) : Support if set to 0, certain thresholds are met !!
+
+    timestamp_granularities: list[Literal["word", "segment"]] = Field(
+        alias="timestamp_granularities[]", default=[])
+    """The timestamp granularities to populate for this transcription.
+
+    `response_format` must be set to `verbose_json` to use timestamp
+    granularities. Either or both of these options are supported: `word`, or
+    `segment`. Note: There is no additional latency for segment timestamps,
+    but generating word timestamps incurs additional latency.
+    """
+
+    # doc: begin-transcription-extra-params
+    stream: Optional[bool] = False
+    """Custom field not present in the original OpenAI definition. When set,
+    it will enable output to be streamed in a similar fashion as the Chat
+    Completion endpoint.
+    """
+    # Flattened stream option to simplify form data.
+    stream_include_usage: Optional[bool] = False
+    stream_continuous_usage_stats: Optional[bool] = False
+    # doc: end-transcription-extra-params
+
+    # doc: begin-transcription-sampling-params
+    temperature: float = Field(default=0.0)
+    """The sampling temperature, between 0 and 1.
+
+    Higher values like 0.8 will make the output more random, while lower values
+    like 0.2 will make it more focused / deterministic. If set to 0, the model
+    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
+    to automatically increase the temperature until certain thresholds are hit.
+    """
+
+    top_p: Optional[float] = None
+    """Enables nucleus (top-p) sampling, where tokens are selected from the
+    smallest possible set whose cumulative probability exceeds `p`.
+    """
+
+    top_k: Optional[int] = None
+    """Limits sampling to the `k` most probable tokens at each step."""
+
+    min_p: Optional[float] = None
+    """Filters out tokens with a probability lower than `min_p`, ensuring a
+    minimum likelihood threshold during sampling.
+    """
+
+    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    """The seed to use for sampling."""
+
+    frequency_penalty: Optional[float] = 0.0
+    """The frequency penalty to use for sampling."""
+
+    repetition_penalty: Optional[float] = None
+    """The repetition penalty to use for sampling."""
+
+    presence_penalty: Optional[float] = 0.0
+    """The presence penalty to use for sampling."""
+    # doc: end-transcription-sampling-params
+
+    # Default sampling parameters for transcription requests.
+    _DEFAULT_SAMPLING_PARAMS: dict = {
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": -1,
+        "min_p": 0.0,
+    }
+
+    def to_sampling_params(
+            self,
+            default_max_tokens: int,
+            default_sampling_params: Optional[dict] = None) -> SamplingParams:
+        # TODO(#9845): remove max_tokens when field is removed from OpenAI API
+        max_tokens = default_max_tokens
+
+        if default_sampling_params is None:
+            default_sampling_params = {}
+
+        # Default parameters
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])
+        if (top_p := self.top_p) is None:
+            top_p = default_sampling_params.get(
+                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+        if (top_k := self.top_k) is None:
+            top_k = default_sampling_params.get(
+                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"])
+        if (min_p := self.min_p) is None:
+            min_p = default_sampling_params.get(
+                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"])
+
+        if (repetition_penalty := self.repetition_penalty) is None:
+            repetition_penalty = default_sampling_params.get(
+                "repetition_penalty",
+                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"])
+
+        return SamplingParams.from_optional(
+            temperature=temperature,
+            max_tokens=max_tokens,
+            seed=self.seed,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            frequency_penalty=self.frequency_penalty,
+            repetition_penalty=repetition_penalty,
+            presence_penalty=self.presence_penalty,
+            output_kind=RequestOutputKind.DELTA if self.stream \
+            else RequestOutputKind.FINAL_ONLY)
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_stream_options(cls, data):
+        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
+        stream = data.get("stream", False)
+        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
+            raise ValueError(
+                "Stream options can only be defined when `stream=True`.")
+
+        return data
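
A rough sketch of the defaulting above (editor's illustration, not part of
the diff; `audio_upload` stands in for a FastAPI/Starlette UploadFile):

    req = TranscriptionRequest(file=audio_upload)
    params = req.to_sampling_params(default_max_tokens=448)
    # temperature keeps its explicit 0.0 default (greedy decoding), while
    # top_p/top_k/min_p fall back to _DEFAULT_SAMPLING_PARAMS.
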
+
+
+# Transcription response objects
+class TranscriptionResponse(OpenAIBaseModel):
+    text: str
+    """The transcribed text."""
+
+
+class TranscriptionWord(OpenAIBaseModel):
+    end: float
+    """End time of the word in seconds."""
+
+    start: float
+    """Start time of the word in seconds."""
+
+    word: str
+    """The text content of the word."""
+
+
+class TranscriptionSegment(OpenAIBaseModel):
+    id: int
+    """Unique identifier of the segment."""
+
+    avg_logprob: float
+    """Average logprob of the segment.
+
+    If the value is lower than -1, consider the logprobs failed.
+    """
+
+    compression_ratio: float
+    """Compression ratio of the segment.
+
+    If the value is greater than 2.4, consider the compression failed.
+    """
+
+    end: float
+    """End time of the segment in seconds."""
+
+    no_speech_prob: float
+    """Probability of no speech in the segment.
+
+    If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
+    this segment silent.
+    """
+
+    seek: int
+    """Seek offset of the segment."""
+
+    start: float
+    """Start time of the segment in seconds."""
+
+    temperature: float
+    """Temperature parameter used for generating the segment."""
+
+    text: str
+    """Text content of the segment."""
+
+    tokens: list[int]
+    """Array of token IDs for the text content."""
+
+
+class TranscriptionResponseVerbose(OpenAIBaseModel):
+    duration: str
+    """The duration of the input audio."""
+
+    language: str
+    """The language of the input audio."""
+
+    text: str
+    """The transcribed text."""
+
+    segments: Optional[list[TranscriptionSegment]] = None
+    """Segments of the transcribed text and their corresponding details."""
+
+    words: Optional[list[TranscriptionWord]] = None
+    """Extracted words and their corresponding timestamps."""