vllm-cpu 0.11.0.post2 (cp312-cp312-manylinux_2_17_x86_64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1398)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +220 -0
  3. vllm/_bc_linter.py +59 -0
  4. vllm/_custom_ops.py +2044 -0
  5. vllm/_ipex_ops.py +393 -0
  6. vllm/_version.py +34 -0
  7. vllm/assets/__init__.py +0 -0
  8. vllm/assets/audio.py +45 -0
  9. vllm/assets/base.py +41 -0
  10. vllm/assets/image.py +50 -0
  11. vllm/assets/video.py +145 -0
  12. vllm/attention/__init__.py +15 -0
  13. vllm/attention/backends/__init__.py +0 -0
  14. vllm/attention/backends/abstract.py +204 -0
  15. vllm/attention/backends/utils.py +33 -0
  16. vllm/attention/layer.py +645 -0
  17. vllm/attention/layers/__init__.py +0 -0
  18. vllm/attention/layers/chunked_local_attention.py +93 -0
  19. vllm/attention/layers/cross_attention.py +162 -0
  20. vllm/attention/layers/encoder_only_attention.py +86 -0
  21. vllm/attention/ops/__init__.py +0 -0
  22. vllm/attention/ops/chunked_prefill_paged_decode.py +405 -0
  23. vllm/attention/ops/common.py +345 -0
  24. vllm/attention/ops/flashmla.py +192 -0
  25. vllm/attention/ops/merge_attn_states.py +43 -0
  26. vllm/attention/ops/paged_attn.py +262 -0
  27. vllm/attention/ops/pallas_kv_cache_update.py +124 -0
  28. vllm/attention/ops/prefix_prefill.py +928 -0
  29. vllm/attention/ops/rocm_aiter_mla.py +104 -0
  30. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  31. vllm/attention/ops/triton_decode_attention.py +691 -0
  32. vllm/attention/ops/triton_flash_attention.py +984 -0
  33. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  34. vllm/attention/ops/triton_reshape_and_cache_flash.py +175 -0
  35. vllm/attention/ops/triton_unified_attention.py +894 -0
  36. vllm/attention/selector.py +245 -0
  37. vllm/attention/utils/__init__.py +0 -0
  38. vllm/attention/utils/fa_utils.py +85 -0
  39. vllm/attention/utils/kv_sharing_utils.py +33 -0
  40. vllm/beam_search.py +87 -0
  41. vllm/benchmarks/__init__.py +0 -0
  42. vllm/benchmarks/datasets.py +2723 -0
  43. vllm/benchmarks/latency.py +170 -0
  44. vllm/benchmarks/lib/__init__.py +3 -0
  45. vllm/benchmarks/lib/endpoint_request_func.py +533 -0
  46. vllm/benchmarks/lib/ready_checker.py +73 -0
  47. vllm/benchmarks/lib/utils.py +80 -0
  48. vllm/benchmarks/serve.py +1358 -0
  49. vllm/benchmarks/throughput.py +696 -0
  50. vllm/collect_env.py +823 -0
  51. vllm/compilation/__init__.py +0 -0
  52. vllm/compilation/activation_quant_fusion.py +189 -0
  53. vllm/compilation/backends.py +650 -0
  54. vllm/compilation/base_static_graph.py +56 -0
  55. vllm/compilation/collective_fusion.py +1188 -0
  56. vllm/compilation/compiler_interface.py +573 -0
  57. vllm/compilation/counter.py +47 -0
  58. vllm/compilation/cuda_graph.py +199 -0
  59. vllm/compilation/cuda_piecewise_backend.py +117 -0
  60. vllm/compilation/decorators.py +400 -0
  61. vllm/compilation/fix_functionalization.py +205 -0
  62. vllm/compilation/fusion.py +383 -0
  63. vllm/compilation/fusion_attn.py +295 -0
  64. vllm/compilation/fx_utils.py +84 -0
  65. vllm/compilation/inductor_pass.py +136 -0
  66. vllm/compilation/monitor.py +57 -0
  67. vllm/compilation/noop_elimination.py +158 -0
  68. vllm/compilation/pass_manager.py +125 -0
  69. vllm/compilation/post_cleanup.py +20 -0
  70. vllm/compilation/sequence_parallelism.py +478 -0
  71. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  72. vllm/compilation/vllm_inductor_pass.py +156 -0
  73. vllm/compilation/wrapper.py +136 -0
  74. vllm/config/__init__.py +814 -0
  75. vllm/config/cache.py +220 -0
  76. vllm/config/compilation.py +673 -0
  77. vllm/config/device.py +74 -0
  78. vllm/config/kv_events.py +50 -0
  79. vllm/config/kv_transfer.py +111 -0
  80. vllm/config/load.py +113 -0
  81. vllm/config/lora.py +132 -0
  82. vllm/config/model.py +1912 -0
  83. vllm/config/multimodal.py +129 -0
  84. vllm/config/observability.py +99 -0
  85. vllm/config/parallel.py +524 -0
  86. vllm/config/pooler.py +97 -0
  87. vllm/config/scheduler.py +287 -0
  88. vllm/config/speculative.py +568 -0
  89. vllm/config/speech_to_text.py +39 -0
  90. vllm/config/structured_outputs.py +64 -0
  91. vllm/config/utils.py +145 -0
  92. vllm/connections.py +186 -0
  93. vllm/device_allocator/__init__.py +0 -0
  94. vllm/device_allocator/cumem.py +311 -0
  95. vllm/distributed/__init__.py +6 -0
  96. vllm/distributed/communication_op.py +41 -0
  97. vllm/distributed/device_communicators/__init__.py +0 -0
  98. vllm/distributed/device_communicators/all2all.py +440 -0
  99. vllm/distributed/device_communicators/all_reduce_utils.py +317 -0
  100. vllm/distributed/device_communicators/base_device_communicator.py +295 -0
  101. vllm/distributed/device_communicators/cpu_communicator.py +201 -0
  102. vllm/distributed/device_communicators/cuda_communicator.py +323 -0
  103. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  104. vllm/distributed/device_communicators/custom_all_reduce.py +311 -0
  105. vllm/distributed/device_communicators/mnnvl_compat.py +28 -0
  106. vllm/distributed/device_communicators/pynccl.py +340 -0
  107. vllm/distributed/device_communicators/pynccl_allocator.py +186 -0
  108. vllm/distributed/device_communicators/pynccl_wrapper.py +416 -0
  109. vllm/distributed/device_communicators/quick_all_reduce.py +278 -0
  110. vllm/distributed/device_communicators/ray_communicator.py +258 -0
  111. vllm/distributed/device_communicators/shm_broadcast.py +589 -0
  112. vllm/distributed/device_communicators/shm_object_storage.py +635 -0
  113. vllm/distributed/device_communicators/symm_mem.py +136 -0
  114. vllm/distributed/device_communicators/tpu_communicator.py +102 -0
  115. vllm/distributed/device_communicators/xpu_communicator.py +94 -0
  116. vllm/distributed/eplb/__init__.py +8 -0
  117. vllm/distributed/eplb/eplb_state.py +620 -0
  118. vllm/distributed/eplb/rebalance_algo.py +239 -0
  119. vllm/distributed/eplb/rebalance_execute.py +424 -0
  120. vllm/distributed/kv_events.py +362 -0
  121. vllm/distributed/kv_transfer/README.md +29 -0
  122. vllm/distributed/kv_transfer/__init__.py +13 -0
  123. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  124. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  125. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  126. vllm/distributed/kv_transfer/kv_connector/factory.py +113 -0
  127. vllm/distributed/kv_transfer/kv_connector/utils.py +261 -0
  128. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  129. vllm/distributed/kv_transfer/kv_connector/v1/base.py +388 -0
  130. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +168 -0
  131. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +100 -0
  132. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +328 -0
  133. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1473 -0
  134. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +485 -0
  135. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +488 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +550 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +267 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +418 -0
  140. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  141. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  142. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  143. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  144. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  145. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  146. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +290 -0
  147. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  148. vllm/distributed/kv_transfer/kv_transfer_state.py +73 -0
  149. vllm/distributed/parallel_state.py +1532 -0
  150. vllm/distributed/tpu_distributed_utils.py +178 -0
  151. vllm/distributed/utils.py +536 -0
  152. vllm/engine/__init__.py +0 -0
  153. vllm/engine/arg_utils.py +1778 -0
  154. vllm/engine/async_llm_engine.py +6 -0
  155. vllm/engine/llm_engine.py +6 -0
  156. vllm/engine/metrics.py +577 -0
  157. vllm/engine/metrics_types.py +84 -0
  158. vllm/engine/protocol.py +333 -0
  159. vllm/entrypoints/__init__.py +0 -0
  160. vllm/entrypoints/api_server.py +178 -0
  161. vllm/entrypoints/chat_utils.py +1705 -0
  162. vllm/entrypoints/cli/__init__.py +12 -0
  163. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  164. vllm/entrypoints/cli/benchmark/base.py +25 -0
  165. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  166. vllm/entrypoints/cli/benchmark/main.py +55 -0
  167. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  168. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  169. vllm/entrypoints/cli/collect_env.py +36 -0
  170. vllm/entrypoints/cli/main.py +60 -0
  171. vllm/entrypoints/cli/openai.py +233 -0
  172. vllm/entrypoints/cli/run_batch.py +67 -0
  173. vllm/entrypoints/cli/serve.py +232 -0
  174. vllm/entrypoints/cli/types.py +29 -0
  175. vllm/entrypoints/constants.py +10 -0
  176. vllm/entrypoints/context.py +481 -0
  177. vllm/entrypoints/harmony_utils.py +436 -0
  178. vllm/entrypoints/launcher.py +164 -0
  179. vllm/entrypoints/llm.py +1629 -0
  180. vllm/entrypoints/logger.py +79 -0
  181. vllm/entrypoints/openai/__init__.py +0 -0
  182. vllm/entrypoints/openai/api_server.py +1953 -0
  183. vllm/entrypoints/openai/cli_args.py +288 -0
  184. vllm/entrypoints/openai/logits_processors.py +90 -0
  185. vllm/entrypoints/openai/protocol.py +2757 -0
  186. vllm/entrypoints/openai/run_batch.py +491 -0
  187. vllm/entrypoints/openai/serving_chat.py +1597 -0
  188. vllm/entrypoints/openai/serving_classification.py +173 -0
  189. vllm/entrypoints/openai/serving_completion.py +692 -0
  190. vllm/entrypoints/openai/serving_embedding.py +631 -0
  191. vllm/entrypoints/openai/serving_engine.py +992 -0
  192. vllm/entrypoints/openai/serving_models.py +288 -0
  193. vllm/entrypoints/openai/serving_pooling.py +276 -0
  194. vllm/entrypoints/openai/serving_responses.py +1709 -0
  195. vllm/entrypoints/openai/serving_score.py +479 -0
  196. vllm/entrypoints/openai/serving_tokenization.py +196 -0
  197. vllm/entrypoints/openai/serving_transcription.py +136 -0
  198. vllm/entrypoints/openai/speech_to_text.py +388 -0
  199. vllm/entrypoints/openai/tool_parsers/__init__.py +55 -0
  200. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  201. vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py +367 -0
  202. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  203. vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +185 -0
  204. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  205. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  206. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +455 -0
  207. vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py +372 -0
  208. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  209. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  210. vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +377 -0
  211. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  212. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +269 -0
  213. vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +39 -0
  214. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +816 -0
  215. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  216. vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +93 -0
  217. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  218. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  219. vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +707 -0
  220. vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py +1137 -0
  221. vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +679 -0
  222. vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py +296 -0
  223. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  224. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +524 -0
  225. vllm/entrypoints/renderer.py +395 -0
  226. vllm/entrypoints/score_utils.py +232 -0
  227. vllm/entrypoints/ssl.py +75 -0
  228. vllm/entrypoints/tool.py +139 -0
  229. vllm/entrypoints/tool_server.py +206 -0
  230. vllm/entrypoints/utils.py +233 -0
  231. vllm/env_override.py +23 -0
  232. vllm/envs.py +1590 -0
  233. vllm/executor/__init__.py +0 -0
  234. vllm/executor/executor_base.py +381 -0
  235. vllm/executor/msgspec_utils.py +35 -0
  236. vllm/executor/ray_distributed_executor.py +699 -0
  237. vllm/executor/ray_utils.py +410 -0
  238. vllm/executor/uniproc_executor.py +176 -0
  239. vllm/forward_context.py +402 -0
  240. vllm/inputs/__init__.py +30 -0
  241. vllm/inputs/data.py +356 -0
  242. vllm/inputs/parse.py +151 -0
  243. vllm/inputs/preprocess.py +664 -0
  244. vllm/logger.py +229 -0
  245. vllm/logging_utils/__init__.py +10 -0
  246. vllm/logging_utils/dump_input.py +81 -0
  247. vllm/logging_utils/formatter.py +79 -0
  248. vllm/logging_utils/log_time.py +32 -0
  249. vllm/logits_process.py +119 -0
  250. vllm/logprobs.py +28 -0
  251. vllm/lora/__init__.py +0 -0
  252. vllm/lora/layers/__init__.py +34 -0
  253. vllm/lora/layers/base.py +69 -0
  254. vllm/lora/layers/base_linear.py +185 -0
  255. vllm/lora/layers/column_parallel_linear.py +609 -0
  256. vllm/lora/layers/logits_processor.py +247 -0
  257. vllm/lora/layers/qkv_x_parallel_linear.py +8 -0
  258. vllm/lora/layers/replicated_linear.py +60 -0
  259. vllm/lora/layers/row_parallel_linear.py +196 -0
  260. vllm/lora/layers/utils.py +65 -0
  261. vllm/lora/layers/vocal_parallel_embedding.py +174 -0
  262. vllm/lora/lora_weights.py +199 -0
  263. vllm/lora/models.py +816 -0
  264. vllm/lora/ops/__init__.py +0 -0
  265. vllm/lora/ops/ipex_ops/__init__.py +7 -0
  266. vllm/lora/ops/ipex_ops/lora_ops.py +44 -0
  267. vllm/lora/ops/torch_ops/__init__.py +16 -0
  268. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  269. vllm/lora/ops/triton_ops/__init__.py +12 -0
  270. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  271. vllm/lora/ops/triton_ops/lora_expand_op.py +289 -0
  272. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  273. vllm/lora/ops/triton_ops/lora_shrink_op.py +243 -0
  274. vllm/lora/ops/triton_ops/utils.py +126 -0
  275. vllm/lora/ops/xla_ops/__init__.py +7 -0
  276. vllm/lora/ops/xla_ops/lora_ops.py +144 -0
  277. vllm/lora/peft_helper.py +127 -0
  278. vllm/lora/punica_wrapper/__init__.py +10 -0
  279. vllm/lora/punica_wrapper/punica_base.py +458 -0
  280. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  281. vllm/lora/punica_wrapper/punica_gpu.py +272 -0
  282. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  283. vllm/lora/punica_wrapper/punica_tpu.py +391 -0
  284. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  285. vllm/lora/punica_wrapper/utils.py +136 -0
  286. vllm/lora/request.py +97 -0
  287. vllm/lora/resolver.py +85 -0
  288. vllm/lora/utils.py +246 -0
  289. vllm/lora/worker_manager.py +267 -0
  290. vllm/model_executor/__init__.py +12 -0
  291. vllm/model_executor/custom_op.py +194 -0
  292. vllm/model_executor/layers/__init__.py +0 -0
  293. vllm/model_executor/layers/activation.py +575 -0
  294. vllm/model_executor/layers/attention_layer_base.py +23 -0
  295. vllm/model_executor/layers/fla/__init__.py +8 -0
  296. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  297. vllm/model_executor/layers/fla/ops/chunk.py +225 -0
  298. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +290 -0
  299. vllm/model_executor/layers/fla/ops/chunk_o.py +177 -0
  300. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +140 -0
  301. vllm/model_executor/layers/fla/ops/cumsum.py +226 -0
  302. vllm/model_executor/layers/fla/ops/fused_recurrent.py +366 -0
  303. vllm/model_executor/layers/fla/ops/index.py +39 -0
  304. vllm/model_executor/layers/fla/ops/l2norm.py +143 -0
  305. vllm/model_executor/layers/fla/ops/layernorm_guard.py +337 -0
  306. vllm/model_executor/layers/fla/ops/op.py +39 -0
  307. vllm/model_executor/layers/fla/ops/solve_tril.py +365 -0
  308. vllm/model_executor/layers/fla/ops/utils.py +180 -0
  309. vllm/model_executor/layers/fla/ops/wy_fast.py +114 -0
  310. vllm/model_executor/layers/fused_moe/__init__.py +89 -0
  311. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +322 -0
  312. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +141 -0
  313. vllm/model_executor/layers/fused_moe/config.py +804 -0
  314. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  315. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  316. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  317. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  318. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  319. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  320. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  321. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  322. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  323. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  324. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  325. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  326. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  327. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  328. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  329. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  330. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  331. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  332. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  333. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  334. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  335. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  336. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  337. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  338. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  339. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  545. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +300 -0
  546. vllm/model_executor/layers/fused_moe/cutlass_moe.py +957 -0
  547. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +362 -0
  548. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +413 -0
  549. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +361 -0
  550. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +274 -0
  551. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +268 -0
  552. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +300 -0
  553. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +184 -0
  554. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +993 -0
  555. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +239 -0
  556. vllm/model_executor/layers/fused_moe/fused_moe.py +1890 -0
  557. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +307 -0
  558. vllm/model_executor/layers/fused_moe/layer.py +2195 -0
  559. vllm/model_executor/layers/fused_moe/modular_kernel.py +1038 -0
  560. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +87 -0
  561. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  562. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +205 -0
  563. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  564. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +341 -0
  565. vllm/model_executor/layers/fused_moe/prepare_finalize.py +70 -0
  566. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +424 -0
  567. vllm/model_executor/layers/fused_moe/routing_simulator.py +291 -0
  568. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +146 -0
  569. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +143 -0
  570. vllm/model_executor/layers/fused_moe/trtllm_moe.py +191 -0
  571. vllm/model_executor/layers/fused_moe/utils.py +274 -0
  572. vllm/model_executor/layers/layernorm.py +395 -0
  573. vllm/model_executor/layers/lightning_attn.py +661 -0
  574. vllm/model_executor/layers/linear.py +1603 -0
  575. vllm/model_executor/layers/logits_processor.py +106 -0
  576. vllm/model_executor/layers/mamba/__init__.py +0 -0
  577. vllm/model_executor/layers/mamba/abstract.py +42 -0
  578. vllm/model_executor/layers/mamba/linear_attn.py +403 -0
  579. vllm/model_executor/layers/mamba/mamba_mixer.py +466 -0
  580. vllm/model_executor/layers/mamba/mamba_mixer2.py +764 -0
  581. vllm/model_executor/layers/mamba/mamba_utils.py +186 -0
  582. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  583. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1092 -0
  584. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +168 -0
  585. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  586. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +242 -0
  587. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +527 -0
  588. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +724 -0
  589. vllm/model_executor/layers/mamba/ops/ssd_combined.py +238 -0
  590. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +200 -0
  591. vllm/model_executor/layers/mamba/short_conv.py +253 -0
  592. vllm/model_executor/layers/mla.py +173 -0
  593. vllm/model_executor/layers/pooler.py +719 -0
  594. vllm/model_executor/layers/quantization/__init__.py +157 -0
  595. vllm/model_executor/layers/quantization/auto_round.py +388 -0
  596. vllm/model_executor/layers/quantization/awq.py +228 -0
  597. vllm/model_executor/layers/quantization/awq_marlin.py +554 -0
  598. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  599. vllm/model_executor/layers/quantization/base_config.py +170 -0
  600. vllm/model_executor/layers/quantization/bitblas.py +464 -0
  601. vllm/model_executor/layers/quantization/bitsandbytes.py +627 -0
  602. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  603. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +797 -0
  604. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2074 -0
  605. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +27 -0
  606. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +366 -0
  607. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  608. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  609. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +105 -0
  610. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +185 -0
  611. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +169 -0
  612. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +135 -0
  613. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  614. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +157 -0
  615. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  616. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  617. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  618. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +238 -0
  619. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +153 -0
  620. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  621. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +46 -0
  622. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  623. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  624. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  625. vllm/model_executor/layers/quantization/deepspeedfp.py +196 -0
  626. vllm/model_executor/layers/quantization/experts_int8.py +223 -0
  627. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  628. vllm/model_executor/layers/quantization/fp8.py +1098 -0
  629. vllm/model_executor/layers/quantization/gguf.py +599 -0
  630. vllm/model_executor/layers/quantization/gptq.py +340 -0
  631. vllm/model_executor/layers/quantization/gptq_bitblas.py +448 -0
  632. vllm/model_executor/layers/quantization/gptq_marlin.py +751 -0
  633. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  634. vllm/model_executor/layers/quantization/hqq_marlin.py +333 -0
  635. vllm/model_executor/layers/quantization/inc.py +61 -0
  636. vllm/model_executor/layers/quantization/input_quant_fp8.py +156 -0
  637. vllm/model_executor/layers/quantization/ipex_quant.py +415 -0
  638. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  639. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +91 -0
  640. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +93 -0
  641. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  642. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +302 -0
  643. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +92 -0
  644. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +117 -0
  645. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +92 -0
  646. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  647. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +144 -0
  648. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +139 -0
  649. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  650. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +89 -0
  651. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +161 -0
  652. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +206 -0
  653. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  654. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  655. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
  656. vllm/model_executor/layers/quantization/kv_cache.py +143 -0
  657. vllm/model_executor/layers/quantization/modelopt.py +1596 -0
  658. vllm/model_executor/layers/quantization/moe_wna16.py +484 -0
  659. vllm/model_executor/layers/quantization/mxfp4.py +988 -0
  660. vllm/model_executor/layers/quantization/petit.py +306 -0
  661. vllm/model_executor/layers/quantization/ptpc_fp8.py +129 -0
  662. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  663. vllm/model_executor/layers/quantization/quark/quark.py +432 -0
  664. vllm/model_executor/layers/quantization/quark/quark_moe.py +561 -0
  665. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  666. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  667. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +239 -0
  668. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +163 -0
  669. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  670. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  671. vllm/model_executor/layers/quantization/rtn.py +466 -0
  672. vllm/model_executor/layers/quantization/schema.py +86 -0
  673. vllm/model_executor/layers/quantization/torchao.py +214 -0
  674. vllm/model_executor/layers/quantization/tpu_int8.py +125 -0
  675. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  676. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  677. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +210 -0
  678. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  679. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  680. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  681. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  682. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  683. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  684. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  685. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  686. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  687. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  688. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  689. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  690. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  691. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  763. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  764. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  765. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  766. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  767. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  768. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  769. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  770. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  771. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  772. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  773. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  774. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  775. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  776. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  777. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  778. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  779. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  780. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  781. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  786. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  888. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  889. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +79 -0
  890. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +248 -0
  891. vllm/model_executor/layers/quantization/utils/fp8_utils.py +949 -0
  892. vllm/model_executor/layers/quantization/utils/gptq_utils.py +146 -0
  893. vllm/model_executor/layers/quantization/utils/int8_utils.py +492 -0
  894. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  895. vllm/model_executor/layers/quantization/utils/machete_utils.py +50 -0
  896. vllm/model_executor/layers/quantization/utils/marlin_utils.py +479 -0
  897. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +396 -0
  898. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +345 -0
  899. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  900. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  901. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +141 -0
  902. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +20 -0
  903. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +137 -0
  904. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +59 -0
  905. vllm/model_executor/layers/quantization/utils/petit_utils.py +122 -0
  906. vllm/model_executor/layers/quantization/utils/quant_utils.py +641 -0
  907. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +458 -0
  908. vllm/model_executor/layers/resampler.py +270 -0
  909. vllm/model_executor/layers/rotary_embedding/__init__.py +204 -0
  910. vllm/model_executor/layers/rotary_embedding/base.py +177 -0
  911. vllm/model_executor/layers/rotary_embedding/common.py +150 -0
  912. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +138 -0
  913. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +197 -0
  914. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +41 -0
  915. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +67 -0
  916. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +80 -0
  917. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  918. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  919. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +81 -0
  920. vllm/model_executor/layers/rotary_embedding/mrope.py +1321 -0
  921. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +42 -0
  922. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +129 -0
  923. vllm/model_executor/layers/rotary_embedding/rocm_aiter_rope_ops.py +86 -0
  924. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +68 -0
  925. vllm/model_executor/layers/shared_fused_moe/__init__.py +6 -0
  926. vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py +56 -0
  927. vllm/model_executor/layers/utils.py +195 -0
  928. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  929. vllm/model_executor/model_loader/__init__.py +138 -0
  930. vllm/model_executor/model_loader/base_loader.py +52 -0
  931. vllm/model_executor/model_loader/bitsandbytes_loader.py +788 -0
  932. vllm/model_executor/model_loader/default_loader.py +277 -0
  933. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  934. vllm/model_executor/model_loader/gguf_loader.py +155 -0
  935. vllm/model_executor/model_loader/runai_streamer_loader.py +104 -0
  936. vllm/model_executor/model_loader/sharded_state_loader.py +199 -0
  937. vllm/model_executor/model_loader/tensorizer.py +738 -0
  938. vllm/model_executor/model_loader/tensorizer_loader.py +143 -0
  939. vllm/model_executor/model_loader/tpu.py +114 -0
  940. vllm/model_executor/model_loader/utils.py +292 -0
  941. vllm/model_executor/model_loader/weight_utils.py +990 -0
  942. vllm/model_executor/models/__init__.py +33 -0
  943. vllm/model_executor/models/adapters.py +542 -0
  944. vllm/model_executor/models/aimv2.py +246 -0
  945. vllm/model_executor/models/apertus.py +579 -0
  946. vllm/model_executor/models/arcee.py +422 -0
  947. vllm/model_executor/models/arctic.py +558 -0
  948. vllm/model_executor/models/aria.py +650 -0
  949. vllm/model_executor/models/aya_vision.py +468 -0
  950. vllm/model_executor/models/baichuan.py +474 -0
  951. vllm/model_executor/models/bailing_moe.py +642 -0
  952. vllm/model_executor/models/bamba.py +514 -0
  953. vllm/model_executor/models/bert.py +665 -0
  954. vllm/model_executor/models/bert_with_rope.py +687 -0
  955. vllm/model_executor/models/blip.py +339 -0
  956. vllm/model_executor/models/blip2.py +712 -0
  957. vllm/model_executor/models/bloom.py +374 -0
  958. vllm/model_executor/models/chameleon.py +1139 -0
  959. vllm/model_executor/models/chatglm.py +476 -0
  960. vllm/model_executor/models/clip.py +407 -0
  961. vllm/model_executor/models/cohere2_vision.py +481 -0
  962. vllm/model_executor/models/commandr.py +465 -0
  963. vllm/model_executor/models/config.py +445 -0
  964. vllm/model_executor/models/dbrx.py +471 -0
  965. vllm/model_executor/models/deepseek.py +497 -0
  966. vllm/model_executor/models/deepseek_eagle.py +240 -0
  967. vllm/model_executor/models/deepseek_mtp.py +289 -0
  968. vllm/model_executor/models/deepseek_v2.py +1444 -0
  969. vllm/model_executor/models/deepseek_vl2.py +658 -0
  970. vllm/model_executor/models/dots1.py +546 -0
  971. vllm/model_executor/models/dots_ocr.py +873 -0
  972. vllm/model_executor/models/ernie45.py +43 -0
  973. vllm/model_executor/models/ernie45_moe.py +607 -0
  974. vllm/model_executor/models/ernie45_vl.py +1527 -0
  975. vllm/model_executor/models/ernie45_vl_moe.py +727 -0
  976. vllm/model_executor/models/ernie_mtp.py +268 -0
  977. vllm/model_executor/models/exaone.py +550 -0
  978. vllm/model_executor/models/exaone4.py +533 -0
  979. vllm/model_executor/models/fairseq2_llama.py +154 -0
  980. vllm/model_executor/models/falcon.py +509 -0
  981. vllm/model_executor/models/falcon_h1.py +674 -0
  982. vllm/model_executor/models/fuyu.py +399 -0
  983. vllm/model_executor/models/gemma.py +425 -0
  984. vllm/model_executor/models/gemma2.py +422 -0
  985. vllm/model_executor/models/gemma3.py +555 -0
  986. vllm/model_executor/models/gemma3_mm.py +721 -0
  987. vllm/model_executor/models/gemma3n.py +1113 -0
  988. vllm/model_executor/models/gemma3n_mm.py +761 -0
  989. vllm/model_executor/models/glm.py +23 -0
  990. vllm/model_executor/models/glm4.py +304 -0
  991. vllm/model_executor/models/glm4_1v.py +1690 -0
  992. vllm/model_executor/models/glm4_moe.py +727 -0
  993. vllm/model_executor/models/glm4_moe_mtp.py +301 -0
  994. vllm/model_executor/models/glm4v.py +654 -0
  995. vllm/model_executor/models/gpt2.py +380 -0
  996. vllm/model_executor/models/gpt_bigcode.py +344 -0
  997. vllm/model_executor/models/gpt_j.py +339 -0
  998. vllm/model_executor/models/gpt_neox.py +330 -0
  999. vllm/model_executor/models/gpt_oss.py +712 -0
  1000. vllm/model_executor/models/granite.py +489 -0
  1001. vllm/model_executor/models/granite_speech.py +794 -0
  1002. vllm/model_executor/models/granitemoe.py +550 -0
  1003. vllm/model_executor/models/granitemoehybrid.py +614 -0
  1004. vllm/model_executor/models/granitemoeshared.py +332 -0
  1005. vllm/model_executor/models/gritlm.py +262 -0
  1006. vllm/model_executor/models/grok1.py +547 -0
  1007. vllm/model_executor/models/h2ovl.py +536 -0
  1008. vllm/model_executor/models/hunyuan_v1.py +1042 -0
  1009. vllm/model_executor/models/hyperclovax_vision.py +1192 -0
  1010. vllm/model_executor/models/idefics2_vision_model.py +417 -0
  1011. vllm/model_executor/models/idefics3.py +756 -0
  1012. vllm/model_executor/models/interfaces.py +959 -0
  1013. vllm/model_executor/models/interfaces_base.py +192 -0
  1014. vllm/model_executor/models/intern_vit.py +441 -0
  1015. vllm/model_executor/models/internlm2.py +450 -0
  1016. vllm/model_executor/models/internlm2_ve.py +148 -0
  1017. vllm/model_executor/models/interns1.py +838 -0
  1018. vllm/model_executor/models/interns1_vit.py +418 -0
  1019. vllm/model_executor/models/internvl.py +1423 -0
  1020. vllm/model_executor/models/jais.py +373 -0
  1021. vllm/model_executor/models/jamba.py +591 -0
  1022. vllm/model_executor/models/jina_vl.py +144 -0
  1023. vllm/model_executor/models/keye.py +1680 -0
  1024. vllm/model_executor/models/keye_vl1_5.py +602 -0
  1025. vllm/model_executor/models/kimi_vl.py +618 -0
  1026. vllm/model_executor/models/lfm2.py +548 -0
  1027. vllm/model_executor/models/llama.py +669 -0
  1028. vllm/model_executor/models/llama4.py +746 -0
  1029. vllm/model_executor/models/llama4_eagle.py +239 -0
  1030. vllm/model_executor/models/llama_eagle.py +179 -0
  1031. vllm/model_executor/models/llama_eagle3.py +296 -0
  1032. vllm/model_executor/models/llava.py +870 -0
  1033. vllm/model_executor/models/llava_next.py +571 -0
  1034. vllm/model_executor/models/llava_next_video.py +476 -0
  1035. vllm/model_executor/models/llava_onevision.py +942 -0
  1036. vllm/model_executor/models/longcat_flash.py +715 -0
  1037. vllm/model_executor/models/longcat_flash_mtp.py +352 -0
  1038. vllm/model_executor/models/mamba.py +275 -0
  1039. vllm/model_executor/models/mamba2.py +291 -0
  1040. vllm/model_executor/models/medusa.py +169 -0
  1041. vllm/model_executor/models/midashenglm.py +792 -0
  1042. vllm/model_executor/models/mimo.py +188 -0
  1043. vllm/model_executor/models/mimo_mtp.py +280 -0
  1044. vllm/model_executor/models/minicpm.py +631 -0
  1045. vllm/model_executor/models/minicpm3.py +230 -0
  1046. vllm/model_executor/models/minicpm_eagle.py +389 -0
  1047. vllm/model_executor/models/minicpmo.py +770 -0
  1048. vllm/model_executor/models/minicpmv.py +1784 -0
  1049. vllm/model_executor/models/minimax_text_01.py +986 -0
  1050. vllm/model_executor/models/minimax_vl_01.py +426 -0
  1051. vllm/model_executor/models/mistral3.py +628 -0
  1052. vllm/model_executor/models/mixtral.py +606 -0
  1053. vllm/model_executor/models/mllama4.py +1076 -0
  1054. vllm/model_executor/models/mlp_speculator.py +206 -0
  1055. vllm/model_executor/models/modernbert.py +374 -0
  1056. vllm/model_executor/models/module_mapping.py +72 -0
  1057. vllm/model_executor/models/molmo.py +1567 -0
  1058. vllm/model_executor/models/moonvit.py +673 -0
  1059. vllm/model_executor/models/motif.py +345 -0
  1060. vllm/model_executor/models/mpt.py +329 -0
  1061. vllm/model_executor/models/nano_nemotron_vl.py +1394 -0
  1062. vllm/model_executor/models/nemotron.py +507 -0
  1063. vllm/model_executor/models/nemotron_h.py +565 -0
  1064. vllm/model_executor/models/nemotron_nas.py +481 -0
  1065. vllm/model_executor/models/nemotron_vl.py +652 -0
  1066. vllm/model_executor/models/nvlm_d.py +203 -0
  1067. vllm/model_executor/models/olmo.py +404 -0
  1068. vllm/model_executor/models/olmo2.py +439 -0
  1069. vllm/model_executor/models/olmoe.py +483 -0
  1070. vllm/model_executor/models/opt.py +412 -0
  1071. vllm/model_executor/models/orion.py +348 -0
  1072. vllm/model_executor/models/ovis.py +559 -0
  1073. vllm/model_executor/models/ovis2_5.py +642 -0
  1074. vllm/model_executor/models/paligemma.py +411 -0
  1075. vllm/model_executor/models/persimmon.py +343 -0
  1076. vllm/model_executor/models/phi.py +356 -0
  1077. vllm/model_executor/models/phi3.py +19 -0
  1078. vllm/model_executor/models/phi3v.py +698 -0
  1079. vllm/model_executor/models/phi4_multimodal.py +1475 -0
  1080. vllm/model_executor/models/phi4mm.py +1279 -0
  1081. vllm/model_executor/models/phi4mm_audio.py +1254 -0
  1082. vllm/model_executor/models/phi4mm_utils.py +1875 -0
  1083. vllm/model_executor/models/phimoe.py +679 -0
  1084. vllm/model_executor/models/pixtral.py +1345 -0
  1085. vllm/model_executor/models/plamo2.py +978 -0
  1086. vllm/model_executor/models/qwen.py +361 -0
  1087. vllm/model_executor/models/qwen2.py +523 -0
  1088. vllm/model_executor/models/qwen2_5_omni_thinker.py +984 -0
  1089. vllm/model_executor/models/qwen2_5_vl.py +1481 -0
  1090. vllm/model_executor/models/qwen2_audio.py +489 -0
  1091. vllm/model_executor/models/qwen2_moe.py +558 -0
  1092. vllm/model_executor/models/qwen2_rm.py +122 -0
  1093. vllm/model_executor/models/qwen2_vl.py +1670 -0
  1094. vllm/model_executor/models/qwen3.py +341 -0
  1095. vllm/model_executor/models/qwen3_moe.py +692 -0
  1096. vllm/model_executor/models/qwen3_next.py +1266 -0
  1097. vllm/model_executor/models/qwen3_next_mtp.py +281 -0
  1098. vllm/model_executor/models/qwen3_vl.py +1613 -0
  1099. vllm/model_executor/models/qwen3_vl_moe.py +358 -0
  1100. vllm/model_executor/models/qwen_vl.py +795 -0
  1101. vllm/model_executor/models/radio.py +576 -0
  1102. vllm/model_executor/models/registry.py +990 -0
  1103. vllm/model_executor/models/roberta.py +252 -0
  1104. vllm/model_executor/models/rvl.py +103 -0
  1105. vllm/model_executor/models/seed_oss.py +485 -0
  1106. vllm/model_executor/models/siglip.py +540 -0
  1107. vllm/model_executor/models/siglip2navit.py +689 -0
  1108. vllm/model_executor/models/skyworkr1v.py +911 -0
  1109. vllm/model_executor/models/smolvlm.py +44 -0
  1110. vllm/model_executor/models/solar.py +504 -0
  1111. vllm/model_executor/models/stablelm.py +341 -0
  1112. vllm/model_executor/models/starcoder2.py +354 -0
  1113. vllm/model_executor/models/step3_text.py +510 -0
  1114. vllm/model_executor/models/step3_vl.py +1072 -0
  1115. vllm/model_executor/models/swin.py +475 -0
  1116. vllm/model_executor/models/tarsier.py +639 -0
  1117. vllm/model_executor/models/telechat2.py +151 -0
  1118. vllm/model_executor/models/teleflm.py +79 -0
  1119. vllm/model_executor/models/terratorch.py +294 -0
  1120. vllm/model_executor/models/transformers.py +948 -0
  1121. vllm/model_executor/models/ultravox.py +654 -0
  1122. vllm/model_executor/models/utils.py +808 -0
  1123. vllm/model_executor/models/vision.py +404 -0
  1124. vllm/model_executor/models/voxtral.py +786 -0
  1125. vllm/model_executor/models/whisper.py +963 -0
  1126. vllm/model_executor/models/zamba2.py +960 -0
  1127. vllm/model_executor/parameter.py +620 -0
  1128. vllm/model_executor/utils.py +86 -0
  1129. vllm/model_executor/warmup/__init__.py +0 -0
  1130. vllm/model_executor/warmup/deep_gemm_warmup.py +230 -0
  1131. vllm/model_executor/warmup/kernel_warmup.py +83 -0
  1132. vllm/multimodal/__init__.py +33 -0
  1133. vllm/multimodal/audio.py +116 -0
  1134. vllm/multimodal/base.py +27 -0
  1135. vllm/multimodal/cache.py +697 -0
  1136. vllm/multimodal/evs.py +273 -0
  1137. vllm/multimodal/hasher.py +102 -0
  1138. vllm/multimodal/image.py +130 -0
  1139. vllm/multimodal/inputs.py +987 -0
  1140. vllm/multimodal/parse.py +511 -0
  1141. vllm/multimodal/processing.py +2148 -0
  1142. vllm/multimodal/profiling.py +284 -0
  1143. vllm/multimodal/registry.py +345 -0
  1144. vllm/multimodal/utils.py +503 -0
  1145. vllm/multimodal/video.py +319 -0
  1146. vllm/outputs.py +324 -0
  1147. vllm/platforms/__init__.py +263 -0
  1148. vllm/platforms/cpu.py +340 -0
  1149. vllm/platforms/cuda.py +668 -0
  1150. vllm/platforms/interface.py +620 -0
  1151. vllm/platforms/rocm.py +497 -0
  1152. vllm/platforms/tpu.py +233 -0
  1153. vllm/platforms/xpu.py +243 -0
  1154. vllm/plugins/__init__.py +72 -0
  1155. vllm/plugins/io_processors/__init__.py +68 -0
  1156. vllm/plugins/io_processors/interface.py +67 -0
  1157. vllm/plugins/lora_resolvers/README.md +16 -0
  1158. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1159. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  1160. vllm/pooling_params.py +191 -0
  1161. vllm/profiler/__init__.py +0 -0
  1162. vllm/profiler/layerwise_profile.py +375 -0
  1163. vllm/profiler/utils.py +148 -0
  1164. vllm/py.typed +2 -0
  1165. vllm/ray/__init__.py +0 -0
  1166. vllm/ray/lazy_utils.py +22 -0
  1167. vllm/ray/ray_env.py +72 -0
  1168. vllm/reasoning/__init__.py +29 -0
  1169. vllm/reasoning/abs_reasoning_parsers.py +202 -0
  1170. vllm/reasoning/basic_parsers.py +156 -0
  1171. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1172. vllm/reasoning/glm4_moe_reasoning_parser.py +151 -0
  1173. vllm/reasoning/gptoss_reasoning_parser.py +87 -0
  1174. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1175. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +245 -0
  1176. vllm/reasoning/mistral_reasoning_parser.py +56 -0
  1177. vllm/reasoning/qwen3_reasoning_parser.py +72 -0
  1178. vllm/reasoning/seedoss_reasoning_parser.py +28 -0
  1179. vllm/reasoning/step3_reasoning_parser.py +109 -0
  1180. vllm/sampling_params.py +593 -0
  1181. vllm/scalar_type.py +349 -0
  1182. vllm/scripts.py +15 -0
  1183. vllm/sequence.py +103 -0
  1184. vllm/tasks.py +11 -0
  1185. vllm/test_utils.py +129 -0
  1186. vllm/third_party/__init__.py +0 -0
  1187. vllm/third_party/pynvml.py +6140 -0
  1188. vllm/tracing.py +136 -0
  1189. vllm/transformers_utils/__init__.py +24 -0
  1190. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1191. vllm/transformers_utils/chat_templates/registry.py +70 -0
  1192. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1193. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1194. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1195. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1196. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1197. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1198. vllm/transformers_utils/config.py +1102 -0
  1199. vllm/transformers_utils/config_parser_base.py +20 -0
  1200. vllm/transformers_utils/configs/__init__.py +63 -0
  1201. vllm/transformers_utils/configs/arctic.py +207 -0
  1202. vllm/transformers_utils/configs/chatglm.py +72 -0
  1203. vllm/transformers_utils/configs/deepseek_v3.py +101 -0
  1204. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1205. vllm/transformers_utils/configs/dotsocr.py +69 -0
  1206. vllm/transformers_utils/configs/eagle.py +84 -0
  1207. vllm/transformers_utils/configs/falcon.py +90 -0
  1208. vllm/transformers_utils/configs/jais.py +237 -0
  1209. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1210. vllm/transformers_utils/configs/medusa.py +63 -0
  1211. vllm/transformers_utils/configs/midashenglm.py +101 -0
  1212. vllm/transformers_utils/configs/mistral.py +165 -0
  1213. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1214. vllm/transformers_utils/configs/moonvit.py +33 -0
  1215. vllm/transformers_utils/configs/nemotron.py +205 -0
  1216. vllm/transformers_utils/configs/nemotron_h.py +259 -0
  1217. vllm/transformers_utils/configs/nemotron_vl.py +56 -0
  1218. vllm/transformers_utils/configs/olmo3.py +80 -0
  1219. vllm/transformers_utils/configs/ovis.py +176 -0
  1220. vllm/transformers_utils/configs/qwen3_next.py +275 -0
  1221. vllm/transformers_utils/configs/radio.py +91 -0
  1222. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1223. vllm/transformers_utils/configs/speculators/algos.py +32 -0
  1224. vllm/transformers_utils/configs/speculators/base.py +111 -0
  1225. vllm/transformers_utils/configs/step3_vl.py +123 -0
  1226. vllm/transformers_utils/configs/ultravox.py +116 -0
  1227. vllm/transformers_utils/detokenizer_utils.py +199 -0
  1228. vllm/transformers_utils/dynamic_module.py +60 -0
  1229. vllm/transformers_utils/processor.py +299 -0
  1230. vllm/transformers_utils/processors/__init__.py +16 -0
  1231. vllm/transformers_utils/processors/deepseek_vl2.py +362 -0
  1232. vllm/transformers_utils/processors/ovis.py +420 -0
  1233. vllm/transformers_utils/processors/ovis2_5.py +458 -0
  1234. vllm/transformers_utils/runai_utils.py +104 -0
  1235. vllm/transformers_utils/s3_utils.py +93 -0
  1236. vllm/transformers_utils/tokenizer.py +292 -0
  1237. vllm/transformers_utils/tokenizer_base.py +154 -0
  1238. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1239. vllm/transformers_utils/tokenizers/mistral.py +521 -0
  1240. vllm/transformers_utils/utils.py +108 -0
  1241. vllm/triton_utils/__init__.py +16 -0
  1242. vllm/triton_utils/importing.py +96 -0
  1243. vllm/usage/__init__.py +0 -0
  1244. vllm/usage/usage_lib.py +259 -0
  1245. vllm/utils/__init__.py +3566 -0
  1246. vllm/utils/deep_gemm.py +319 -0
  1247. vllm/utils/flashinfer.py +443 -0
  1248. vllm/utils/jsontree.py +178 -0
  1249. vllm/utils/tensor_schema.py +235 -0
  1250. vllm/v1/__init__.py +0 -0
  1251. vllm/v1/attention/__init__.py +0 -0
  1252. vllm/v1/attention/backends/__init__.py +0 -0
  1253. vllm/v1/attention/backends/cpu_attn.py +919 -0
  1254. vllm/v1/attention/backends/flash_attn.py +795 -0
  1255. vllm/v1/attention/backends/flashinfer.py +1181 -0
  1256. vllm/v1/attention/backends/flex_attention.py +861 -0
  1257. vllm/v1/attention/backends/gdn_attn.py +332 -0
  1258. vllm/v1/attention/backends/linear_attn.py +67 -0
  1259. vllm/v1/attention/backends/mamba1_attn.py +81 -0
  1260. vllm/v1/attention/backends/mamba2_attn.py +232 -0
  1261. vllm/v1/attention/backends/mamba_attn.py +52 -0
  1262. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1263. vllm/v1/attention/backends/mla/common.py +1783 -0
  1264. vllm/v1/attention/backends/mla/cutlass_mla.py +248 -0
  1265. vllm/v1/attention/backends/mla/flashattn_mla.py +271 -0
  1266. vllm/v1/attention/backends/mla/flashinfer_mla.py +114 -0
  1267. vllm/v1/attention/backends/mla/flashmla.py +203 -0
  1268. vllm/v1/attention/backends/mla/flashmla_sparse.py +544 -0
  1269. vllm/v1/attention/backends/mla/indexer.py +342 -0
  1270. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +255 -0
  1271. vllm/v1/attention/backends/mla/triton_mla.py +177 -0
  1272. vllm/v1/attention/backends/pallas.py +409 -0
  1273. vllm/v1/attention/backends/rocm_aiter_fa.py +549 -0
  1274. vllm/v1/attention/backends/rocm_attn.py +426 -0
  1275. vllm/v1/attention/backends/short_conv_attn.py +94 -0
  1276. vllm/v1/attention/backends/tree_attn.py +451 -0
  1277. vllm/v1/attention/backends/triton_attn.py +361 -0
  1278. vllm/v1/attention/backends/utils.py +990 -0
  1279. vllm/v1/attention/backends/xformers.py +438 -0
  1280. vllm/v1/core/__init__.py +0 -0
  1281. vllm/v1/core/block_pool.py +416 -0
  1282. vllm/v1/core/encoder_cache_manager.py +333 -0
  1283. vllm/v1/core/kv_cache_coordinator.py +440 -0
  1284. vllm/v1/core/kv_cache_manager.py +399 -0
  1285. vllm/v1/core/kv_cache_utils.py +1291 -0
  1286. vllm/v1/core/sched/__init__.py +0 -0
  1287. vllm/v1/core/sched/async_scheduler.py +47 -0
  1288. vllm/v1/core/sched/interface.py +158 -0
  1289. vllm/v1/core/sched/output.py +166 -0
  1290. vllm/v1/core/sched/request_queue.py +224 -0
  1291. vllm/v1/core/sched/scheduler.py +1296 -0
  1292. vllm/v1/core/sched/utils.py +69 -0
  1293. vllm/v1/core/single_type_kv_cache_manager.py +671 -0
  1294. vllm/v1/cudagraph_dispatcher.py +125 -0
  1295. vllm/v1/engine/__init__.py +203 -0
  1296. vllm/v1/engine/async_llm.py +742 -0
  1297. vllm/v1/engine/coordinator.py +357 -0
  1298. vllm/v1/engine/core.py +1235 -0
  1299. vllm/v1/engine/core_client.py +1334 -0
  1300. vllm/v1/engine/detokenizer.py +349 -0
  1301. vllm/v1/engine/exceptions.py +17 -0
  1302. vllm/v1/engine/llm_engine.py +370 -0
  1303. vllm/v1/engine/logprobs.py +201 -0
  1304. vllm/v1/engine/output_processor.py +576 -0
  1305. vllm/v1/engine/parallel_sampling.py +133 -0
  1306. vllm/v1/engine/processor.py +545 -0
  1307. vllm/v1/engine/utils.py +860 -0
  1308. vllm/v1/executor/__init__.py +0 -0
  1309. vllm/v1/executor/abstract.py +137 -0
  1310. vllm/v1/executor/multiproc_executor.py +726 -0
  1311. vllm/v1/executor/ray_distributed_executor.py +108 -0
  1312. vllm/v1/executor/utils.py +23 -0
  1313. vllm/v1/kv_cache_interface.py +375 -0
  1314. vllm/v1/kv_offload/__init__.py +0 -0
  1315. vllm/v1/kv_offload/abstract.py +165 -0
  1316. vllm/v1/kv_offload/backend.py +96 -0
  1317. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1318. vllm/v1/kv_offload/backends/cpu.py +61 -0
  1319. vllm/v1/kv_offload/cpu.py +75 -0
  1320. vllm/v1/kv_offload/factory.py +56 -0
  1321. vllm/v1/kv_offload/lru_manager.py +132 -0
  1322. vllm/v1/kv_offload/mediums.py +39 -0
  1323. vllm/v1/kv_offload/spec.py +61 -0
  1324. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1325. vllm/v1/kv_offload/worker/cpu_gpu.py +171 -0
  1326. vllm/v1/kv_offload/worker/worker.py +142 -0
  1327. vllm/v1/metrics/__init__.py +0 -0
  1328. vllm/v1/metrics/loggers.py +741 -0
  1329. vllm/v1/metrics/prometheus.py +82 -0
  1330. vllm/v1/metrics/ray_wrappers.py +152 -0
  1331. vllm/v1/metrics/reader.py +246 -0
  1332. vllm/v1/metrics/stats.py +257 -0
  1333. vllm/v1/outputs.py +161 -0
  1334. vllm/v1/pool/__init__.py +0 -0
  1335. vllm/v1/pool/metadata.py +77 -0
  1336. vllm/v1/request.py +241 -0
  1337. vllm/v1/sample/__init__.py +0 -0
  1338. vllm/v1/sample/logits_processor/__init__.py +294 -0
  1339. vllm/v1/sample/logits_processor/builtin.py +275 -0
  1340. vllm/v1/sample/logits_processor/interface.py +97 -0
  1341. vllm/v1/sample/logits_processor/state.py +161 -0
  1342. vllm/v1/sample/metadata.py +43 -0
  1343. vllm/v1/sample/ops/__init__.py +0 -0
  1344. vllm/v1/sample/ops/bad_words.py +39 -0
  1345. vllm/v1/sample/ops/logprobs.py +26 -0
  1346. vllm/v1/sample/ops/penalties.py +43 -0
  1347. vllm/v1/sample/ops/topk_topp_sampler.py +292 -0
  1348. vllm/v1/sample/rejection_sampler.py +623 -0
  1349. vllm/v1/sample/sampler.py +285 -0
  1350. vllm/v1/sample/tpu/__init__.py +0 -0
  1351. vllm/v1/sample/tpu/metadata.py +124 -0
  1352. vllm/v1/sample/tpu/sampler.py +213 -0
  1353. vllm/v1/serial_utils.py +423 -0
  1354. vllm/v1/spec_decode/__init__.py +0 -0
  1355. vllm/v1/spec_decode/eagle.py +1011 -0
  1356. vllm/v1/spec_decode/medusa.py +66 -0
  1357. vllm/v1/spec_decode/metadata.py +62 -0
  1358. vllm/v1/spec_decode/metrics.py +211 -0
  1359. vllm/v1/spec_decode/ngram_proposer.py +276 -0
  1360. vllm/v1/spec_decode/utils.py +14 -0
  1361. vllm/v1/structured_output/__init__.py +295 -0
  1362. vllm/v1/structured_output/backend_guidance.py +245 -0
  1363. vllm/v1/structured_output/backend_lm_format_enforcer.py +167 -0
  1364. vllm/v1/structured_output/backend_outlines.py +320 -0
  1365. vllm/v1/structured_output/backend_types.py +134 -0
  1366. vllm/v1/structured_output/backend_xgrammar.py +327 -0
  1367. vllm/v1/structured_output/request.py +86 -0
  1368. vllm/v1/structured_output/utils.py +454 -0
  1369. vllm/v1/utils.py +396 -0
  1370. vllm/v1/worker/__init__.py +0 -0
  1371. vllm/v1/worker/block_table.py +210 -0
  1372. vllm/v1/worker/cpu_model_runner.py +175 -0
  1373. vllm/v1/worker/cpu_worker.py +156 -0
  1374. vllm/v1/worker/gpu_input_batch.py +863 -0
  1375. vllm/v1/worker/gpu_model_runner.py +4160 -0
  1376. vllm/v1/worker/gpu_ubatch_wrapper.py +399 -0
  1377. vllm/v1/worker/gpu_worker.py +710 -0
  1378. vllm/v1/worker/kv_connector_model_runner_mixin.py +132 -0
  1379. vllm/v1/worker/lora_model_runner_mixin.py +183 -0
  1380. vllm/v1/worker/tpu_input_batch.py +587 -0
  1381. vllm/v1/worker/tpu_model_runner.py +1946 -0
  1382. vllm/v1/worker/tpu_worker.py +346 -0
  1383. vllm/v1/worker/ubatch_splitting.py +192 -0
  1384. vllm/v1/worker/ubatch_utils.py +27 -0
  1385. vllm/v1/worker/ubatching.py +224 -0
  1386. vllm/v1/worker/utils.py +344 -0
  1387. vllm/v1/worker/worker_base.py +65 -0
  1388. vllm/v1/worker/xpu_model_runner.py +57 -0
  1389. vllm/v1/worker/xpu_worker.py +179 -0
  1390. vllm/version.py +41 -0
  1391. vllm/vllm_flash_attn/.gitkeep +0 -0
  1392. vllm/worker/__init__.py +0 -0
  1393. vllm/worker/worker_base.py +279 -0
  1394. vllm_cpu-0.11.0.post2.dist-info/METADATA +348 -0
  1395. vllm_cpu-0.11.0.post2.dist-info/RECORD +1398 -0
  1396. vllm_cpu-0.11.0.post2.dist-info/WHEEL +5 -0
  1397. vllm_cpu-0.11.0.post2.dist-info/entry_points.txt +5 -0
  1398. vllm_cpu-0.11.0.post2.dist-info/top_level.txt +1 -0
vllm/benchmarks/datasets.py
@@ -0,0 +1,2723 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+ """
4
+ This module defines a framework for sampling benchmark requests from various
5
+ datasets. Each dataset subclass of BenchmarkDataset must implement sample
6
+ generation. Supported dataset types include:
7
+ - ShareGPT
8
+ - Random (synthetic)
9
+ - Sonnet
10
+ - BurstGPT
11
+ - HuggingFace
12
+ - VisionArena
13
+ """
14
+ import argparse
15
+ import ast
16
+ import base64
17
+ import io
18
+ import json
19
+ import logging
20
+ import math
21
+ import random
22
+ from abc import ABC, abstractmethod
23
+ from collections.abc import Iterator, Mapping
24
+ from contextlib import suppress
25
+ from copy import deepcopy
26
+ from dataclasses import dataclass
27
+ from functools import cache
28
+ from io import BytesIO
29
+ from typing import Any, Callable, Optional, Union, cast
30
+
31
+ import numpy as np
32
+ from PIL import Image
33
+ from transformers import PreTrainedTokenizerBase
34
+ from typing_extensions import deprecated
35
+
36
+ from vllm.lora.request import LoRARequest
37
+ from vllm.lora.utils import get_adapter_absolute_path
38
+ from vllm.multimodal import MultiModalDataDict
39
+ from vllm.multimodal.image import convert_image_mode
40
+ from vllm.transformers_utils.tokenizer import AnyTokenizer
41
+ from vllm.utils import PlaceholderModule
42
+
43
+ try:
44
+ from datasets import load_dataset
45
+ except ImportError:
46
+ datasets = PlaceholderModule("datasets")
47
+ load_dataset = datasets.placeholder_attr("load_dataset")
48
+
49
+ try:
50
+ import pandas as pd
51
+ except ImportError:
52
+ pd = PlaceholderModule("pandas")
53
+
54
+ try:
55
+ import librosa
56
+ except ImportError:
57
+ librosa = PlaceholderModule("librosa")
58
+
59
+ try:
60
+ from vllm.utils import FlexibleArgumentParser
61
+ except ImportError:
62
+ from argparse import ArgumentParser as FlexibleArgumentParser
63
+
64
+ logger = logging.getLogger(__name__)
65
+
66
+ # -----------------------------------------------------------------------------
67
+ # Data Classes
68
+ # -----------------------------------------------------------------------------
69
+
70
+
71
+ @dataclass
72
+ class SampleRequest:
73
+ """
74
+ Represents a single inference request for benchmarking.
75
+ """
76
+
77
+ prompt: Union[str, list[str]]
78
+ prompt_len: int
79
+ expected_output_len: int
80
+ multi_modal_data: Optional[
81
+ Union[MultiModalDataDict, dict, list[dict]]
82
+ ] = None
83
+ lora_request: Optional[LoRARequest] = None
84
+ request_id: Optional[str] = None
85
+
86
+
87
+ # -----------------------------------------------------------------------------
88
+ # Benchmark Dataset Base Class
89
+ # -----------------------------------------------------------------------------
90
+
91
+
92
+ class BenchmarkDataset(ABC):
93
+ DEFAULT_SEED = 0
94
+ IS_MULTIMODAL = False
95
+
96
+ def __init__(
97
+ self,
98
+ dataset_path: Optional[str] = None,
99
+ random_seed: int = DEFAULT_SEED,
100
+ ) -> None:
101
+ """
102
+ Initialize the BenchmarkDataset with an optional dataset path and random
103
+ seed.
104
+
105
+ Args:
106
+ dataset_path (Optional[str]): Path to the dataset. If None, it
107
+ indicates that a default or random dataset might be used.
108
+ random_seed (int): Seed value for reproducible shuffling or
109
+ sampling. Defaults to DEFAULT_SEED.
110
+ """
111
+ self.dataset_path = dataset_path
112
+ # Set the random seed, ensuring that a None value is replaced with the
113
+ # default seed.
114
+ self.random_seed = (random_seed
115
+ if random_seed is not None else self.DEFAULT_SEED)
116
+ self.data = None
117
+
118
+ def apply_multimodal_chat_transformation(
119
+ self,
120
+ prompt: str,
121
+ mm_content: Optional[
122
+ Union[MultiModalDataDict, dict, list[dict]]
123
+ ] = None) -> list[dict]:
124
+ """
125
+ Transform a prompt and optional multimodal content into a chat format.
126
+ This method is used for chat models that expect a specific conversation
127
+ format.
128
+ """
129
+ content = [{"text": prompt, "type": "text"}]
130
+ if mm_content is not None:
131
+ if isinstance(mm_content, list):
132
+ content.extend(cast(list[dict[str, Any]], mm_content))
133
+ elif isinstance(mm_content, dict):
134
+ content.append(mm_content)
135
+ else:
136
+ raise TypeError(
137
+ "Could not process multimodal content of type: " +
138
+ f"{type(mm_content)}"
139
+ )
140
+ return [{"role": "user", "content": content}]
141
+
142
+ def load_data(self) -> None:
143
+ """
144
+ Load data from the dataset path into self.data.
145
+
146
+ This method must be overridden by subclasses since the method to load
147
+ data will vary depending on the dataset format and source.
148
+
149
+ Raises:
150
+ NotImplementedError: If a subclass does not implement this method.
151
+ """
152
+ # TODO (jenniferzhao): add support for downloading data
153
+ raise NotImplementedError(
154
+ "load_data must be implemented in subclasses.")
155
+
156
+ def get_random_lora_request(
157
+ self,
158
+ max_loras: Optional[int] = None,
159
+ lora_path: Optional[str] = None,
160
+ ) -> Optional[LoRARequest]:
161
+ """
162
+ Optionally select a random LoRA request.
163
+
164
+ This method is used when LoRA parameters are provided. It randomly
165
+ selects a LoRA based on max_loras.
166
+
167
+ Args:
168
+ max_loras (Optional[int]): The maximum number of LoRAs available.
169
+ If `None`, LoRA is not used.
170
+ lora_path (Optional[str]): Path to the LoRA parameters on disk.
171
+ If `None`, LoRA is not used.
172
+
173
+ Returns:
174
+ A new [`LoRARequest`][vllm.lora.request.LoRARequest]
175
+ (or `None` if not applicable).
176
+ """
177
+ if max_loras is None or lora_path is None:
178
+ return None
179
+
180
+ # Generate a random LoRA ID in the range [1, max_loras].
181
+ lora_id = random.randint(1, max_loras)
182
+ lora_request = LoRARequest(
183
+ lora_name=str(lora_id),
184
+ lora_int_id=lora_id,
185
+ lora_path=lora_path_on_disk(lora_path),
186
+ )
187
+ return lora_request
188
+
189
+ @abstractmethod
190
+ def sample(self, tokenizer: PreTrainedTokenizerBase,
191
+ num_requests: int,
192
+ request_id_prefix: str = "",
193
+ no_oversample: bool = False) -> list[SampleRequest]:
194
+ """
195
+ Abstract method to generate sample requests from the dataset.
196
+
197
+ Subclasses must override this method to implement dataset-specific logic
198
+ for generating a list of SampleRequest objects.
199
+
200
+ Args:
201
+ tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
202
+ for processing the dataset's text.
203
+ num_requests (int): The number of sample requests to generate.
204
+ request_id_prefix (str): The prefix of request_id.
205
+
206
+ Returns:
207
+ list[SampleRequest]: A list of sample requests generated from the
208
+ dataset.
209
+ """
210
+ raise NotImplementedError("sample must be implemented in subclasses.")
211
+
212
+ def maybe_oversample_requests(
213
+ self,
214
+ requests: list[SampleRequest],
215
+ num_requests: int,
216
+ request_id_prefix: str = "",
217
+ no_oversample: bool = False,
218
+ ) -> None:
219
+ """
220
+ Oversamples the list of requests if its size is less than the desired
221
+ number.
222
+
223
+ Args:
224
+ requests (List[SampleRequest]): The current list of sampled
225
+ requests.
226
+ num_requests (int): The target number of requests.
227
+ request_id_prefix (str): The prefix applied to generated request
228
+ identifiers.
229
+
230
+ """
231
+ if no_oversample:
232
+ logger.info("Skipping oversampling. " \
233
+ "Total samples: %d.", len(requests))
234
+ return
235
+
236
+ if len(requests) < num_requests:
237
+ random.seed(self.random_seed)
238
+ additional = deepcopy(
239
+ random.choices(requests, k=num_requests - len(requests))
240
+ )
241
+ for i in range(len(additional)):
242
+ req = additional[i]
243
+ req.request_id = request_id_prefix + str(len(requests) + i)
244
+ requests.extend(additional)
245
+ logger.info("Oversampled requests to reach %d total samples.",
246
+ num_requests)
247
+
248
+
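For orientation, the framework above reduces to: a subclass loads its raw records into `self.data` and implements `sample()` to turn them into `SampleRequest` objects, optionally padding with `maybe_oversample_requests`. Below is a minimal editorial sketch (not part of this diff); the class name and in-memory prompts are hypothetical, and the import path assumes this file ships as `vllm/benchmarks/datasets.py`:

```python
# Hypothetical minimal subclass of BenchmarkDataset (editorial sketch).
from vllm.benchmarks.datasets import BenchmarkDataset, SampleRequest


class InMemoryDataset(BenchmarkDataset):
    """Serves prompts from a plain Python list (illustrative only)."""

    def __init__(self, prompts: list[str], **kwargs) -> None:
        super().__init__(**kwargs)
        self.data = prompts

    def sample(self, tokenizer, num_requests, request_id_prefix="",
               no_oversample=False, **kwargs) -> list[SampleRequest]:
        requests = []
        for i, prompt in enumerate(self.data[:num_requests]):
            prompt_len = len(tokenizer(prompt).input_ids)
            requests.append(
                SampleRequest(prompt=prompt,
                              prompt_len=prompt_len,
                              expected_output_len=128,
                              request_id=request_id_prefix + str(i)))
        # Reuse the base-class helper to pad up to num_requests if needed.
        self.maybe_oversample_requests(requests, num_requests,
                                       request_id_prefix, no_oversample)
        return requests
```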
249
+ # -----------------------------------------------------------------------------
250
+ # Utility Functions and Global Caches
251
+ # -----------------------------------------------------------------------------
252
+
253
+
254
+ def is_valid_sequence(
255
+ prompt_len: int,
256
+ output_len: int,
257
+ min_len: int = 4,
258
+ max_prompt_len: int = 1024,
259
+ max_total_len: int = 2048,
260
+ skip_min_output_len_check: bool = False,
261
+ ) -> bool:
262
+ """
263
+ Validate a sequence based on prompt and output lengths.
264
+
265
+ Default pruning criteria are copied from the original `sample_hf_requests`
266
+ and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
267
+ from `sample_requests` in benchmark_throughput.py.
268
+ """
269
+ # Check for invalid conditions
270
+ prompt_too_short = prompt_len < min_len
271
+ output_too_short = (not skip_min_output_len_check) and (output_len
272
+ < min_len)
273
+ prompt_too_long = prompt_len > max_prompt_len
274
+ combined_too_long = (prompt_len + output_len) > max_total_len
275
+
276
+ # Return True if none of the invalid conditions are met
277
+ return not (prompt_too_short or output_too_short or prompt_too_long
278
+ or combined_too_long)
279
+
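As a quick illustration of the default pruning criteria above (editorial sketch; the lengths are made up and `is_valid_sequence` is assumed to be imported from this module):

```python
# Defaults: min_len=4, max_prompt_len=1024, max_total_len=2048.
assert is_valid_sequence(prompt_len=100, output_len=200)        # within limits
assert not is_valid_sequence(prompt_len=2, output_len=200)      # prompt too short
assert not is_valid_sequence(prompt_len=2000, output_len=200)   # prompt too long
assert not is_valid_sequence(prompt_len=1000, output_len=1500)  # total too long
```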
280
+
281
+ @cache
282
+ def lora_path_on_disk(lora_path: str) -> str:
283
+ return get_adapter_absolute_path(lora_path)
284
+
285
+
286
+ # Global cache for LoRA tokenizers.
287
+ lora_tokenizer_cache: dict[int, AnyTokenizer] = {}
288
+
289
+
290
+ def process_image(image: Any) -> Mapping[str, Any]:
291
+ """
292
+ Process a single image input and return a multimedia content dictionary.
293
+
294
+ Supports the following input types:
295
+
296
+ 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
297
+ containing raw image data. - Loads the bytes as a PIL.Image.Image.
298
+
299
+ 2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
300
+ a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
301
+ a dictionary with the image as a base64 data URL.
302
+
303
+ 3. String input: - Treats the string as a URL or local file path. -
304
+ Prepends "file://" if the string doesn't start with "http://",
305
+ "https://", or "file://". - Returns a dictionary with the image URL.
306
+
307
+ Raises:
308
+ ValueError: If the input is not a supported type.
309
+ """
310
+ if isinstance(image, dict) and 'bytes' in image:
311
+ image = Image.open(BytesIO(image['bytes']))
312
+ if isinstance(image, Image.Image):
313
+ image = convert_image_mode(image, "RGB")
314
+ with io.BytesIO() as image_data:
315
+ image.save(image_data, format="JPEG")
316
+ image_base64 = base64.b64encode(
317
+ image_data.getvalue()).decode("utf-8")
318
+ return {
319
+ "type": "image_url",
320
+ "image_url": {
321
+ "url": f"data:image/jpeg;base64,{image_base64}"
322
+ },
323
+ }
324
+
325
+ if isinstance(image, str):
326
+ image_url = (image if image.startswith(
327
+ ("http://", "https://", "file://")) else f"file://{image}")
328
+ return {"type": "image_url", "image_url": {"url": image_url}}
329
+
330
+ raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
331
+ " or str or dictionary with raw image bytes.")
332
+
333
+
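A brief sketch of what `process_image` produces for the two most common input types (editorial, not part of the diff; the file name and image are placeholders, and `process_image` plus its vllm dependencies are assumed to be importable from this module):

```python
from PIL import Image

# A plain path or URL is passed through as an image_url entry.
print(process_image("cat.jpg"))
# -> {'type': 'image_url', 'image_url': {'url': 'file://cat.jpg'}}

# A PIL image is converted to RGB, JPEG-encoded, and embedded as a
# base64 data URL.
img = Image.new("RGB", (64, 64), color=(255, 0, 0))
content = process_image(img)
assert content["image_url"]["url"].startswith("data:image/jpeg;base64,")
```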
334
+ def process_video(video: Any) -> Mapping[str, Any]:
335
+ """
336
+ Process a single video input and return a multimedia content dictionary.
337
+
338
+ Supports the following input types:
339
+
340
+ 1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key
341
+ containing raw video data.
342
+
343
+ 2. String input: - Treats the string as a URL or local file path. -
344
+ Prepends "file://" if the string doesn't start with "http://",
345
+ "https://", or "file://". - Returns a dictionary with the video URL.
346
+
347
+ Raises:
348
+ ValueError: If the input is not a supported type.
349
+ """
350
+ if isinstance(video, dict) and 'bytes' in video:
351
+ video_bytes = video['bytes']
352
+ video_base64 = base64.b64encode(video_bytes).decode("utf-8")
353
+ return {
354
+ "type": "video_url",
355
+ "video_url": {
356
+ "url": f"data:video/mp4;base64,{video_base64}"
357
+ },
358
+ }
359
+
360
+ if isinstance(video, str):
361
+ video_url = (video if video.startswith(
362
+ ("http://", "https://", "file://")) else f"file://{video}")
363
+ return {"type": "video_url", "video_url": {"url": video_url}}
364
+
365
+ raise ValueError(
366
+ f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`." # noqa: E501
367
+ )
368
+
369
+ # -----------------------------------------------------------------------------
370
+ # Random Dataset Implementation (Synthetic Data)
371
+ # -----------------------------------------------------------------------------
372
+
373
+
374
+ class RandomDataset(BenchmarkDataset):
375
+ """
376
+ Synthetic text-only dataset for serving/throughput benchmarks.
377
+
378
+ Strategy:
379
+ - Sample input/output token lengths per request from integer-uniform ranges
380
+ around configured means (controlled by range_ratio).
381
+ - Prepend a fixed random prefix of length prefix_len.
382
+ - Generate the remaining tokens as a reproducible sequence:
383
+ (offset + index + arange(input_len)) % vocab_size.
384
+ - Decode then re-encode/truncate to ensure prompt token counts match.
385
+ - Uses numpy.default_rng seeded with random_seed for reproducible sampling.
386
+ """
387
+ # Default values copied from benchmark_serving.py for the random dataset.
388
+ DEFAULT_PREFIX_LEN = 0
389
+ DEFAULT_RANGE_RATIO = 0.0
390
+ DEFAULT_INPUT_LEN = 1024
391
+ DEFAULT_OUTPUT_LEN = 128
392
+
393
+ def __init__(self, **kwargs) -> None:
394
+ super().__init__(**kwargs)
395
+ # Use numpy's default_rng for deterministic sampling
396
+ # Do not use random.seed() or np.random.seed() elsewhere in this class.
397
+ # This ensures that the RNG is isolated from global RNG state.
398
+ self._rng = np.random.default_rng(self.random_seed)
399
+
400
+ def sample(
401
+ self,
402
+ tokenizer: PreTrainedTokenizerBase,
403
+ num_requests: int,
404
+ request_id_prefix: str = "",
405
+ no_oversample: bool = False,
406
+ prefix_len: int = DEFAULT_PREFIX_LEN,
407
+ range_ratio: float = DEFAULT_RANGE_RATIO,
408
+ input_len: int = DEFAULT_INPUT_LEN,
409
+ output_len: int = DEFAULT_OUTPUT_LEN,
410
+ batchsize: int = 1,
411
+ **kwargs,
412
+ ) -> list[SampleRequest]:
413
+
414
+ input_lens, output_lens, offsets = self.get_sampling_params(
415
+ num_requests, range_ratio, input_len, output_len, tokenizer
416
+ )
417
+
418
+ # Generate prefix once
419
+ prefix_token_ids = self.get_prefix(tokenizer, prefix_len)
420
+ vocab_size = tokenizer.vocab_size
421
+
422
+ requests = []
423
+ for i in range(num_requests):
424
+ prompt, total_input_len = self.generate_token_sequence(
425
+ tokenizer=tokenizer,
426
+ prefix_token_ids=prefix_token_ids,
427
+ prefix_len=prefix_len,
428
+ vocab_size=vocab_size,
429
+ input_len=int(input_lens[i]),
430
+ offset=int(offsets[i]),
431
+ index=i,
432
+ )
433
+ requests.append(
434
+ SampleRequest(
435
+ prompt=prompt,
436
+ prompt_len=total_input_len,
437
+ expected_output_len=int(output_lens[i]),
438
+ request_id=request_id_prefix + str(i),
439
+ )
440
+ )
441
+ # only used for embeddings benchmark.
442
+ if batchsize > 1:
443
+ batch_requests = []
444
+ # Create batched requests
445
+ for i in range(0, num_requests, batchsize):
446
+ batch = requests[i : i + batchsize]
447
+ batch_requests.append(
448
+ SampleRequest(
449
+ prompt=[req.prompt for req in batch],
450
+ prompt_len=sum(req.prompt_len for req in batch),
451
+ expected_output_len=0,
452
+ request_id=request_id_prefix + str(i // batchsize),
453
+ )
454
+ )
455
+ requests = batch_requests
456
+ return requests
457
+
458
+ def get_prefix(
459
+ self, tokenizer: PreTrainedTokenizerBase, prefix_len: int
460
+ ) -> list[int]:
461
+ """
462
+ Get the prefix for the dataset.
463
+ """
464
+ return (
465
+ self._rng.integers(
466
+ 0, tokenizer.vocab_size, size=prefix_len).tolist()
467
+ if prefix_len > 0
468
+ else []
469
+ )
470
+
471
+ def get_sampling_params(
472
+ self,
473
+ num_requests: int,
474
+ range_ratio: float,
475
+ input_len: int,
476
+ output_len: int,
477
+ tokenizer: PreTrainedTokenizerBase,
478
+ ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
479
+ """
480
+ Get the sampling parameters for the dataset.
481
+ """
482
+ # Enforce range_ratio < 1
483
+ if not (0.0 <= range_ratio < 1.0):
484
+ raise ValueError("range_ratio must be in [0, 1).")
485
+ num_special_tokens = int(tokenizer.num_special_tokens_to_add())
486
+ real_input_len = max(0, int(input_len) - num_special_tokens)
487
+ # Bounds use floor for low and ceil for high
488
+ input_low = math.floor(real_input_len * (1 - range_ratio))
489
+ input_high = math.ceil(real_input_len * (1 + range_ratio))
490
+ output_low = math.floor(output_len * (1 - range_ratio))
491
+ output_high = math.ceil(output_len * (1 + range_ratio))
492
+ # Ensure the lower bound for output length is at least 1 to
493
+ # prevent sampling 0 tokens.
494
+ output_low = max(output_low, 1)
495
+
496
+ if input_low > input_high:
497
+ raise ValueError(
498
+ "Invalid input sampling interval: "
499
+ f"low={input_low} > high={input_high}"
500
+ )
501
+ if output_low > output_high:
502
+ raise ValueError(
503
+ "Invalid output sampling interval: "
504
+ f"low={output_low} > high={output_high}"
505
+ )
506
+
507
+ logger.info(
508
+ "Sampling input_len from [%s, %s] and output_len from [%s, %s]",
509
+ input_low,
510
+ input_high,
511
+ output_low,
512
+ output_high,
513
+ )
514
+
515
+ input_lens = self._rng.integers(input_low, input_high + 1,
516
+ size=num_requests)
517
+ output_lens = self._rng.integers(output_low, output_high + 1,
518
+ size=num_requests)
519
+ offsets = self._rng.integers(0, tokenizer.vocab_size,
520
+ size=num_requests)
521
+ return input_lens, output_lens, offsets
522
+
523
+ def generate_token_sequence(
524
+ self,
525
+ *,
526
+ tokenizer: PreTrainedTokenizerBase,
527
+ prefix_token_ids: list[int],
528
+ prefix_len: int,
529
+ vocab_size: int,
530
+ input_len: int,
531
+ offset: int,
532
+ index: int,
533
+ ) -> tuple[str, int]:
534
+ """
535
+ Returns (prompt, total_input_len).
536
+
537
+ NOTE: After decoding the prompt we have to encode and decode it again.
538
+ This is done because in some cases N consecutive tokens
539
+ give a string tokenized into != N number of tokens.
540
+ For example for GPT2Tokenizer:
541
+ [6880, 6881] -> ['Ġcalls', 'here'] ->
542
+ [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
543
+ To avoid uncontrolled change of the prompt length,
544
+ the encoded sequence is truncated before being decoded again.
545
+ """
546
+ # Build the inner sequence by sampling sequentially from the vocab
547
+ inner_seq = ((offset + index + np.arange(input_len))
548
+ % vocab_size).tolist()
549
+ token_sequence = prefix_token_ids + inner_seq
550
+
551
+ # Decode, then re-encode and truncate to preserve token count invariants
552
+ prompt = tokenizer.decode(token_sequence)
553
+ total_input_len = prefix_len + int(input_len)
554
+
555
+ re_encoded_sequence = tokenizer.encode(
556
+ prompt, add_special_tokens=False)[:total_input_len]
557
+ prompt = tokenizer.decode(re_encoded_sequence)
558
+ total_input_len = len(re_encoded_sequence)
559
+
560
+ return prompt, total_input_len
561
+
562
+
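To make the reproducible-sequence note above concrete, here is the inner token-id construction in isolation (editorial sketch using plain numpy; the tokenizer decode/encode round trip is omitted and the numbers are arbitrary):

```python
import numpy as np

vocab_size = 50_000
offset, index, input_len = 123, 7, 10

# Same arithmetic as generate_token_sequence: a deterministic run of ids.
inner_seq = ((offset + index + np.arange(input_len)) % vocab_size).tolist()
print(inner_seq)  # [130, 131, 132, 133, 134, 135, 136, 137, 138, 139]
```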
563
+ # -----------------------------------------------------------------------------
564
+ # MultiModalDataset Implementation
565
+ # -----------------------------------------------------------------------------
566
+
567
+ class RandomMultiModalDataset(RandomDataset):
568
+ """
569
+ Synthetic multimodal dataset (text + images) that extends RandomDataset.
570
+
571
+ Status:
572
+ - Images: supported via synthetic RGB data.
573
+ - Video: not yet supported (TODO: implement video generation method).
574
+ - Audio: not yet supported.
575
+
576
+ Sampling overview:
577
+ 1) Number of items per request is sampled uniformly from the integer range
578
+ [floor(n·(1−r)), ceil(n·(1+r))], where n is the base count and r is
579
+ `num_mm_items_range_ratio` in [0, 1]. r=0 keeps it fixed; r=1 allows 0.
580
+ The maximum is further clamped to the sum of per-modality limits.
581
+ 2) Each item’s modality and shape is sampled from `bucket_config`, a dict
582
+ mapping (height, width, num_frames) → probability. We treat
583
+ `num_frames`=1 as image and `num_frames` > 1 as video.
584
+ Entries with zero probability are removed and the rest are renormalized
585
+ to sum to 1.
586
+ 3) Per-modality hard caps are enforced via `limit_mm_per_prompt`.
587
+ When a modality reaches its cap, all of its buckets are excluded and the
588
+ remaining probabilities are renormalized.
589
+
590
+ Example bucket configuration:
591
+ {(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.1}
592
+ - Two image buckets (`num_frames`=1) and one video bucket
593
+ (`num_frames`=16).
594
+ OBS.: Only image sampling is supported for now.
595
+ """
596
+
597
+ IS_MULTIMODAL = True
598
+ # NOTE: video sampling is WIP. Setting it to 0.
599
+ DEFAULT_LIMIT_MM_PER_PROMPT = {"image": 255, "video": 0}
600
+
601
+ DEFAULT_BASE_ITEMS_PER_REQUEST = 1
602
+ DEFAULT_NUM_MM_ITEMS_RANGE_RATIO = 0.0
603
+ DEFAULT_MM_ITEM_BUCKET_CONFIG = {
604
+ (256, 256, 1): 0.5,
605
+ (720, 1280, 1): 0.5,
606
+ (720, 1280, 16): 0.0,
607
+ }
608
+ DEFAULT_ENABLE_MULTIMODAL_CHAT = False
609
+
610
+ def __init__(self, **kwargs) -> None:
611
+ super().__init__(**kwargs)
612
+
613
+
614
+ def generate_synthetic_image(self, width: int, height: int) -> Image.Image:
615
+ """Generate synthetic PIL image with random RGB values.
616
+
617
+ NOTE: iid pixel sampling results in worst-case compression
618
+ (good for stressing I/O), but very unlike real photos.
619
+ We could consider a “low-freq” mode (e.g., noise blur)
620
+ to emulate network realism instead of max stress.
621
+ """
622
+ random_pixels = self._rng.integers(
623
+ 0,
624
+ 256,
625
+ (height, width, 3),
626
+ dtype=np.uint8,
627
+ )
628
+ return Image.fromarray(random_pixels)
629
+
630
+ def generate_synthetic_video(self, width: int,
631
+ height: int,
632
+ num_frames: int) -> Any:
633
+ """Generate synthetic video with random values.
634
+
635
+ TODO: Finish this method.
636
+ """
637
+ raise NotImplementedError("Video sampling is WIP.")
638
+
639
+ def map_config_to_modality(self, config: tuple[int, int, int]) -> str:
640
+ """Map the configuration to the modality."""
641
+ if config[-1] == 1:
642
+ return "image"
643
+ elif config[-1] > 1:
644
+ return "video"
645
+ else:
646
+ raise ValueError(f"Invalid multimodal item configuration: {config}")
647
+
648
+ def normalize_bucket_config(self, bucket_config: dict[tuple[int, int, int],
649
+ float]) -> dict[tuple[int, int, int], float]:
650
+ """
651
+ Remove zero probability entries
652
+ and normalize the bucket config to sum to 1.
653
+ """
654
+ # Raise error if value is negative
655
+ if any(v < 0 for v in bucket_config.values()):
656
+ raise ValueError("Bucket config values must be non-negative.")
657
+ # Remove zero probability entries
658
+ bucket_config = {k: v for k, v in bucket_config.items() if v > 0}
659
+ # if bucket config is empty, raise error
660
+ if not bucket_config:
661
+ raise ValueError("Got invalid bucket config. "
662
+ "Bucket config values must be non-zero.")
663
+ # Normalize the remaining bucket config to sum to 1
664
+ total = sum(bucket_config.values())
665
+ return {k: v / total for k, v in bucket_config.items()}
666
+
667
+
668
+ def generate_mm_item(self,
669
+ mm_item_config: tuple[int, int, int],
670
+ ) -> Mapping[str, Any]:
671
+ """
672
+ Create synthetic images and videos and
673
+ apply process_image/process_video respectively.
674
+ This follows the OpenAI API chat completions
675
+ https://github.com/openai/openai-python
676
+ """
677
+
678
+ if self.map_config_to_modality(mm_item_config) == "image":
679
+ return process_image(self.generate_synthetic_image(
680
+ mm_item_config[1],
681
+ mm_item_config[0]))
682
+ elif self.map_config_to_modality(mm_item_config) == "video":
683
+ return process_video(self.generate_synthetic_video(
684
+ mm_item_config[1],
685
+ mm_item_config[0],
686
+ mm_item_config[2]))
687
+ else:
688
+ raise ValueError(f"Invalid multimodal item configuration: "
689
+ f"{mm_item_config}")
690
+
691
+
692
+ def get_mm_item_sampling_params(
693
+ self,
694
+ base_items_per_request: int,
695
+ num_mm_items_range_ratio: float,
696
+ limit_mm_per_prompt: dict[str, int],
697
+ bucket_config: dict[tuple[int, int, int], float],
698
+ ) -> tuple[int, int, dict[str, int], dict[tuple[int, int, int], float]]:
699
+ """
700
+ Get the sampling parameters for the multimodal items.
701
+ """
702
+ # Enforce num_mm_items_range_ratio <= 1
703
+ if not (0.0 <= num_mm_items_range_ratio <= 1.0):
704
+ raise ValueError("num_mm_items_range_ratio must be in [0, 1].")
705
+
706
+ # Ensure modalities to sample are in limit_mm_per_prompt
707
+ for k, v in bucket_config.items():
708
+ # get modality from bucket config
709
+ modality = self.map_config_to_modality(k)
710
+ if modality not in limit_mm_per_prompt:
711
+ raise ValueError(f"Modality {modality} is not in "
712
+ f"limit_mm_per_prompt: "
713
+ f"{limit_mm_per_prompt.keys()}")
714
+
715
+ # Remove zero probability entries
716
+ # and normalize bucket config to sum to 1
717
+ bucket_config = self.normalize_bucket_config(bucket_config)
718
+ logger.info(
719
+ "Normalized bucket config: %s", bucket_config,
720
+ )
721
+ # Only consider limit per prompt for modalities in bucket config
722
+ allowed_modalities = {self.map_config_to_modality(cfg)
723
+ for cfg in bucket_config}
724
+ limit_mm_per_prompt = {
725
+ k: v for k, v in limit_mm_per_prompt.items()
726
+ if k in allowed_modalities}
727
+ if not limit_mm_per_prompt:
728
+ raise ValueError("No valid limits for modalities present in "
729
+ "bucket_config.")
730
+
731
+ logger.info(
732
+ "Updated mm-limit-per-prompt: %s", limit_mm_per_prompt,
733
+ )
734
+
735
+ # Get max and min num mm items and ensure
736
+ # it is at most the sum of limit_mm_per_prompt for all modalities
737
+ max_num_mm_items = min(
738
+ sum(limit_mm_per_prompt.values()),
739
+ math.ceil(base_items_per_request * (1 + num_mm_items_range_ratio))
740
+ )
741
+ # Ensure min num mm items is at least 0
742
+ min_num_mm_items = max(
743
+ 0,
744
+ math.floor(base_items_per_request * (1 - num_mm_items_range_ratio))
745
+ )
746
+ # Raise error if min num mm items is greater than max num mm items
747
+ if min_num_mm_items > max_num_mm_items:
748
+ raise ValueError(f"Min num mm items is greater than max mm items: "
749
+ f"{min_num_mm_items} > {max_num_mm_items}")
750
+
751
+ logger.info(
752
+ "Sampling number of multimodal items from [%s, %s]",
753
+ min_num_mm_items, max_num_mm_items,
754
+ )
755
+
756
+ return (
757
+ min_num_mm_items,
758
+ max_num_mm_items,
759
+ limit_mm_per_prompt,
760
+ bucket_config,
761
+ )
762
+
763
+ def get_mm_item_iterator(
764
+ self,
765
+ min_num_mm_items: int,
766
+ max_num_mm_items: int,
767
+ bucket_config: dict[tuple[int, int, int], float],
768
+ limit_mm_per_prompt: dict[str, int],
769
+ ) -> Iterator[tuple[int,int, int]]:
770
+ """
771
+ Iterator over the multimodal items for each request
772
+ whose size is between min_num_mm_items and max_num_mm_items.
773
+
774
+ Loop over the bucket config and sample a multimodal item.
775
+ Loop until the number of multimodal items sampled is equal to
776
+ request_num_mm_items or limit of multimodal items per prompt
777
+ for all modalities is reached.
778
+
779
+ Note:
780
+ - This function operates on a per-request shallow copy of
781
+ `bucket_config` (tuple->float). The original dict passed to
782
+ `sample` is not mutated. A test asserts this behavior and
784
+ will fail if it ever changes.
784
+ """
785
+ # Get the number of multimodal items to sample
786
+ request_num_mm_items = int(
787
+ self._rng.integers(min_num_mm_items, max_num_mm_items + 1)
788
+ )
789
+ # If request_num_mm_items is 0, yield an empty iterator
790
+ if request_num_mm_items == 0:
791
+ return
792
+ # Initialize modality counters
793
+ modality_counter = {self.map_config_to_modality(k): 0
794
+ for k in bucket_config}
795
+ # Copy the bucket config to avoid modifying the original
796
+ bucket_config_copy = bucket_config.copy()
797
+ # Loop over the number of multimodal items to sample
798
+ while sum(modality_counter.values()) < request_num_mm_items:
799
+ # Sample a multimodal item config
800
+ mm_item_config = self._rng.choice(list(bucket_config_copy.keys()),
801
+ p=list(bucket_config_copy.values()))
802
+ modality = self.map_config_to_modality(mm_item_config)
803
+ # Check that modality count is less than limit per prompt
804
+ if modality_counter[modality] < limit_mm_per_prompt[modality]:
805
+ modality_counter[modality] += 1
806
+ yield (
807
+ mm_item_config
808
+ )
809
+ else:
810
+ # If the counter is greater than the limit per prompt
811
+ # set all multimodal items of this modality to 0
812
+ for k, v in bucket_config_copy.items():
813
+ if self.map_config_to_modality(k) == modality:
814
+ bucket_config_copy[k] = 0
815
+ # If all configs are 0, break the loop
816
+ # This should not happen as request_num_mm_items is at most
817
+ # the sum of limit_mm_per_prompt for all modalities
818
+ if all(v == 0 for v in bucket_config_copy.values()):
819
+ logger.warning("Exhausted all multimodal items "
820
+ "of modality %s",
821
+ modality)
822
+ break
823
+ # Renormalize the bucket config
824
+ bucket_config_copy = self.normalize_bucket_config(
825
+ bucket_config_copy)
826
+
827
+
828
+ def sample(
829
+ self,
830
+ tokenizer: PreTrainedTokenizerBase,
831
+ num_requests: int,
832
+ request_id_prefix: str = "",
833
+ no_oversample: bool = False,
834
+ prefix_len: int = RandomDataset.DEFAULT_PREFIX_LEN,
835
+ range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO,
836
+ input_len: int = RandomDataset.DEFAULT_INPUT_LEN,
837
+ output_len: int = RandomDataset.DEFAULT_OUTPUT_LEN,
838
+ limit_mm_per_prompt: dict[str, int] = DEFAULT_LIMIT_MM_PER_PROMPT,
839
+ base_items_per_request: int = DEFAULT_BASE_ITEMS_PER_REQUEST,
840
+ num_mm_items_range_ratio: float = DEFAULT_NUM_MM_ITEMS_RANGE_RATIO,
841
+ bucket_config: dict[tuple[int, int, int], float] =
842
+ DEFAULT_MM_ITEM_BUCKET_CONFIG,
843
+ enable_multimodal_chat: bool = DEFAULT_ENABLE_MULTIMODAL_CHAT,
844
+ **kwargs,
845
+ ) -> list[SampleRequest]:
846
+
847
+ # NOTE: Video sampling is WIP. Raise error if video is in bucket config
848
+ # and probability is non-zero.
849
+ if any(self.map_config_to_modality(cfg) == "video" and p > 0
850
+ for cfg, p in bucket_config.items()):
851
+ raise NotImplementedError("Video sampling not implemented; "
852
+ "set its probability to 0.")
853
+
854
+ # Get the sampling parameters for the dataset
855
+ input_lens, output_lens, offsets = self.get_sampling_params(
856
+ num_requests, range_ratio, input_len, output_len, tokenizer
857
+ )
858
+
859
+ (
860
+ min_num_mm_items,
861
+ max_num_mm_items,
862
+ limit_mm_per_prompt,
863
+ bucket_config,
864
+ ) = self.get_mm_item_sampling_params(
865
+ base_items_per_request,
866
+ num_mm_items_range_ratio,
867
+ limit_mm_per_prompt,
868
+ bucket_config,
869
+ )
870
+
871
+ # Generate prefix once
872
+ prefix_token_ids = self.get_prefix(tokenizer, prefix_len)
873
+ vocab_size = tokenizer.vocab_size
874
+ # Add synthetic multimodal items to each request
875
+ mm_requests = []
876
+ for i in range(num_requests):
877
+ prompt, total_input_len = self.generate_token_sequence(
878
+ tokenizer=tokenizer,
879
+ prefix_token_ids=prefix_token_ids,
880
+ prefix_len=prefix_len,
881
+ vocab_size=vocab_size,
882
+ input_len=int(input_lens[i]),
883
+ offset=int(offsets[i]),
884
+ index=i,
885
+ )
886
+ # Get multimodal item iterator for a given request
887
+ mm_item_iterator = self.get_mm_item_iterator(
888
+ min_num_mm_items,
889
+ max_num_mm_items,
890
+ bucket_config,
891
+ limit_mm_per_prompt,
892
+ )
893
+
894
+ mm_content = cast(list[dict[str, Any]], [
895
+ self.generate_mm_item(mm_item_config)
896
+ for mm_item_config in mm_item_iterator
897
+ ])
898
+
899
+ if enable_multimodal_chat:
900
+ # NOTE: For now this option is only provided for completeness
901
+ # given that the serve.py benchmark currently does not use it.
902
+ mm_chat_prompt: Any = prompt
903
+ mm_chat_prompt = self.apply_multimodal_chat_transformation(
904
+ prompt, mm_content)
905
+ sample_request = SampleRequest(
906
+ prompt=mm_chat_prompt,
907
+ prompt_len=total_input_len,
908
+ expected_output_len=int(output_lens[i]),
909
+ multi_modal_data=None,
910
+ request_id=request_id_prefix + str(i),
911
+ )
912
+ else:
913
+ sample_request = SampleRequest(
914
+ prompt=prompt,
915
+ prompt_len=total_input_len,
916
+ expected_output_len=int(output_lens[i]),
917
+ multi_modal_data=mm_content,
918
+ request_id=request_id_prefix + str(i),
919
+ )
920
+ mm_requests.append(sample_request)
921
+ return mm_requests
922
+
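The bucket-config handling above can be summarized with a small standalone sketch (editorial; the probabilities are illustrative) showing how zero-probability entries are dropped and the remainder renormalized, mirroring `normalize_bucket_config`:

```python
bucket_config = {(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.0}

# Drop zero-probability buckets, then renormalize to sum to 1.
nonzero = {k: v for k, v in bucket_config.items() if v > 0}
total = sum(nonzero.values())
normalized = {k: v / total for k, v in nonzero.items()}
print(normalized)
# {(256, 256, 1): 0.555..., (720, 1280, 1): 0.444...}
```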
923
+ # -----------------------------------------------------------------------------
924
+ # ShareGPT Dataset Implementation
925
+ # -----------------------------------------------------------------------------
926
+
927
+
928
+ class ShareGPTDataset(BenchmarkDataset):
929
+ """
930
+ Implements the ShareGPT dataset. Loads data from a JSON file and generates
931
+ sample requests based on conversation turns.
932
+ """
933
+
934
+ def __init__(self, **kwargs) -> None:
935
+ super().__init__(**kwargs)
936
+ self.load_data()
937
+
938
+ def load_data(self) -> None:
939
+ if self.dataset_path is None:
940
+ raise ValueError("dataset_path must be provided for loading data.")
941
+
942
+ with open(self.dataset_path, encoding="utf-8") as f:
943
+ self.data = json.load(f)
944
+ # Filter entries with at least two conversation turns.
945
+ self.data = [
946
+ entry for entry in self.data
947
+ if "conversations" in entry and len(entry["conversations"]) >= 2
948
+ ]
949
+ random.seed(self.random_seed)
950
+ random.shuffle(self.data)
951
+
952
+ def sample(
953
+ self,
954
+ tokenizer: PreTrainedTokenizerBase,
955
+ num_requests: int,
956
+ lora_path: Optional[str] = None,
957
+ max_loras: Optional[int] = None,
958
+ output_len: Optional[int] = None,
959
+ enable_multimodal_chat: bool = False,
960
+ request_id_prefix: str = "",
961
+ no_oversample: bool = False,
962
+ **kwargs,
963
+ ) -> list:
964
+ samples: list = []
965
+ ind = 0
966
+ for entry in self.data:
967
+ if len(samples) >= num_requests:
968
+ break
969
+ prompt, completion = (
970
+ entry["conversations"][0]["value"],
971
+ entry["conversations"][1]["value"],
972
+ )
973
+
974
+ lora_request = self.get_random_lora_request(
975
+ max_loras=max_loras, lora_path=lora_path)
976
+ prompt_ids = tokenizer(prompt).input_ids
977
+ completion_ids = tokenizer(completion).input_ids
978
+ prompt_len = len(prompt_ids)
979
+ new_output_len = (len(completion_ids)
980
+ if output_len is None else output_len)
981
+ if not is_valid_sequence(prompt_len,
982
+ new_output_len,
983
+ skip_min_output_len_check=output_len
984
+ is not None):
985
+ continue
986
+ if image_path := entry.get("image"):
987
+ mm_content = process_image(image_path)
988
+ elif video_path := entry.get("video"):
989
+ mm_content = process_video(video_path)
990
+ else:
991
+ mm_content = None
992
+ if enable_multimodal_chat:
993
+ prompt = self.apply_multimodal_chat_transformation(
994
+ prompt, mm_content)
995
+ samples.append(
996
+ SampleRequest(
997
+ prompt=prompt,
998
+ prompt_len=prompt_len,
999
+ expected_output_len=new_output_len,
1000
+ lora_request=lora_request,
1001
+ multi_modal_data=mm_content,
1002
+ request_id=request_id_prefix + str(ind),
1003
+ ))
1004
+ ind += 1
1005
+ self.maybe_oversample_requests(samples,
1006
+ num_requests,
1007
+ request_id_prefix,
1008
+ no_oversample)
1009
+ return samples
1010
+
1011
+
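For reference, a minimal example of the ShareGPT-style JSON this loader expects (editorial sketch; only the `conversations` list and each turn's `value` field are actually read by the code above, while the `from` keys follow the usual ShareGPT convention rather than being required, and the text is made up):

```python
example_sharegpt = [
    {
        "conversations": [
            {"from": "human", "value": "What is the capital of France?"},
            {"from": "gpt", "value": "The capital of France is Paris."},
        ]
    }
]
```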
1012
+ class _ValidateDatasetArgs(argparse.Action):
1013
+ """Argparse action to validate dataset name and path compatibility."""
1014
+ def __call__(self, parser, namespace, values, option_string=None):
1015
+ setattr(namespace, self.dest, values)
1016
+
1017
+ # Get current values of both dataset_name and dataset_path
1018
+ dataset_name = getattr(namespace, 'dataset_name', 'random')
1019
+ dataset_path = getattr(namespace, 'dataset_path', None)
1020
+
1021
+ # Validate the combination
1022
+ if dataset_name == "random" and dataset_path is not None:
1023
+ parser.error(
1024
+ "Cannot use 'random' dataset with --dataset-path. "
1025
+ "Please specify the appropriate --dataset-name (e.g., "
1026
+ "'sharegpt', 'custom', 'sonnet') for your dataset file: "
1027
+ f"{dataset_path}"
1028
+ )
1029
+
1030
+
1031
+ def add_dataset_parser(parser: FlexibleArgumentParser):
1032
+ parser.add_argument("--seed", type=int, default=0)
1033
+ parser.add_argument(
1034
+ "--num-prompts",
1035
+ type=int,
1036
+ default=1000,
1037
+ help="Number of prompts to process.",
1038
+ )
1039
+ parser.add_argument(
1040
+ "--dataset-name",
1041
+ type=str,
1042
+ default="random",
1043
+ action=_ValidateDatasetArgs,
1044
+ choices=[
1045
+ "sharegpt", "burstgpt", "sonnet", "random", "random-mm", "hf",
1046
+ "custom", "prefix_repetition", "spec_bench"
1047
+ ],
1048
+ help="Name of the dataset to benchmark on.",
1049
+ )
1050
+ parser.add_argument(
1051
+ "--no-stream",
1052
+ action="store_true",
1053
+ help="Do not load the dataset in streaming mode.",
1054
+ )
1055
+ parser.add_argument(
1056
+ "--dataset-path",
1057
+ type=str,
1058
+ default=None,
1059
+ action=_ValidateDatasetArgs,
1060
+ help="Path to the sharegpt/sonnet dataset. "
1061
+ "Or the huggingface dataset ID if using HF dataset.",
1062
+ )
1063
+ parser.add_argument(
1064
+ "--no-oversample",
1065
+ action="store_true",
1066
+ help="Do not oversample if the dataset has " \
1067
+ "fewer samples than num-prompts.",
1068
+ )
1069
+
1070
+ # group for dataset specific arguments
1071
+ custom_group = parser.add_argument_group("custom dataset options")
1072
+ custom_group.add_argument(
1073
+ "--custom-output-len",
1074
+ type=int,
1075
+ default=256,
1076
+ help=
1077
+ "Number of output tokens per request, used only for custom dataset.",
1078
+ )
1079
+ custom_group.add_argument(
1080
+ "--custom-skip-chat-template",
1081
+ action="store_true",
1082
+ help=
1083
+ "Skip applying chat template to prompt, used only for custom dataset.",
1084
+ )
1085
+
1086
+ spec_bench_group = parser.add_argument_group("spec bench dataset options")
1087
+ spec_bench_group.add_argument(
1088
+ "--spec-bench-output-len",
1089
+ type=int,
1090
+ default=256,
1091
+ help=
1092
+ "Num of output tokens per request, used only for spec bench dataset.",
1093
+ )
1094
+ spec_bench_group.add_argument(
1095
+ "--spec-bench-category",
1096
+ type=str,
1097
+ default=None,
1098
+ help=
1099
+ "Category for spec bench dataset. If None, use all categories.",
1100
+ )
1101
+
1102
+ sonnet_group = parser.add_argument_group("sonnet dataset options")
1103
+ sonnet_group.add_argument(
1104
+ "--sonnet-input-len",
1105
+ type=int,
1106
+ default=550,
1107
+ help=
1108
+ "Number of input tokens per request, used only for sonnet dataset.",
1109
+ )
1110
+ sonnet_group.add_argument(
1111
+ "--sonnet-output-len",
1112
+ type=int,
1113
+ default=150,
1114
+ help=
1115
+ "Number of output tokens per request, used only for sonnet dataset.",
1116
+ )
1117
+ sonnet_group.add_argument(
1118
+ "--sonnet-prefix-len",
1119
+ type=int,
1120
+ default=200,
1121
+ help=
1122
+ "Number of prefix tokens per request, used only for sonnet dataset.",
1123
+ )
1124
+
1125
+ sharegpt_group = parser.add_argument_group("sharegpt dataset options")
1126
+ sharegpt_group.add_argument(
1127
+ "--sharegpt-output-len",
1128
+ type=int,
1129
+ default=None,
1130
+ help="Output length for each request. Overrides the output length "
1131
+ "from the ShareGPT dataset.",
1132
+ )
1133
+
1134
+ blazedit_group = parser.add_argument_group("blazedit dataset options")
1135
+ blazedit_group.add_argument(
1136
+ "--blazedit-min-distance",
1137
+ type=float,
1138
+ default=0.0,
1139
+ help=
1140
+ "Minimum distance for blazedit dataset. Min: 0, Max: 1.0",
1141
+ )
1142
+ blazedit_group.add_argument(
1143
+ "--blazedit-max-distance",
1144
+ type=float,
1145
+ default=1.0,
1146
+ help=
1147
+ "Maximum distance for blazedit dataset. Min: 0, Max: 1.0",
1148
+ )
1149
+
1150
+ random_group = parser.add_argument_group("random dataset options")
1151
+ random_group.add_argument(
1152
+ "--random-input-len",
1153
+ type=int,
1154
+ default=1024,
1155
+ help=
1156
+ "Number of input tokens per request, used only for random sampling.",
1157
+ )
1158
+ random_group.add_argument(
1159
+ "--random-output-len",
1160
+ type=int,
1161
+ default=128,
1162
+ help=
1163
+ "Number of output tokens per request, used only for random sampling.",
1164
+ )
1165
+ random_group.add_argument(
1166
+ "--random-range-ratio",
1167
+ type=float,
1168
+ default=0.0,
1169
+ help="Range ratio for sampling input/output length, "
1170
+ "used only for random sampling. Must be in the range [0, 1) to define "
1171
+ "a symmetric sampling range"
1172
+ "[length * (1 - range_ratio), length * (1 + range_ratio)].",
1173
+ )
1174
+ random_group.add_argument(
1175
+ "--random-prefix-len",
1176
+ type=int,
1177
+ default=0,
1178
+ help=("Number of fixed prefix tokens before the random context "
1179
+ "in a request. "
1180
+ "The total input length is the sum of `random-prefix-len` and "
1181
+ "a random "
1182
+ "context length sampled from [input_len * (1 - range_ratio), "
1183
+ "input_len * (1 + range_ratio)]."),
1184
+ )
1185
+ random_group.add_argument(
1186
+ "--random-batch-size",
1187
+ type=int,
1188
+ default=1,
1189
+ help=("Batch size for random sampling. "
1190
+ "Only used for embeddings benchmark."),
1191
+ )
1192
+
1193
+ # random multimodal dataset options
1194
+ random_mm_group = parser.add_argument_group(
1195
+ "random multimodal dataset options extended from random dataset")
1196
+ random_mm_group.add_argument(
1197
+ "--random-mm-base-items-per-request",
1198
+ type=int,
1199
+ default=RandomMultiModalDataset.DEFAULT_BASE_ITEMS_PER_REQUEST,
1200
+ help=(
1201
+ "Base number of multimodal items per request for random-mm. "
1202
+ "Actual per-request count is sampled around this base using "
1203
+ "--random-mm-num-mm-items-range-ratio."
1204
+ ),
1205
+ )
1206
+ random_mm_group.add_argument(
1207
+ "--random-mm-num-mm-items-range-ratio",
1208
+ type=float,
1209
+ default=RandomMultiModalDataset.DEFAULT_NUM_MM_ITEMS_RANGE_RATIO,
1210
+ help=(
1211
+ "Range ratio r in [0, 1] for sampling items per request. "
1212
+ "We sample uniformly from the closed integer range "
1213
+ "[floor(n*(1-r)), ceil(n*(1+r))] "
1214
+ "where n is the base items per request. "
1215
+ "r=0 keeps it fixed; r=1 allows 0 items. The maximum is clamped "
1216
+ "to the sum of per-modality limits from "
1217
+ "--random-mm-limit-mm-per-prompt. "
1218
+ "An error is raised if the computed min exceeds the max."
1219
+ ),
1220
+ )
1221
+ random_mm_group.add_argument(
1222
+ "--random-mm-limit-mm-per-prompt",
1223
+ type=json.loads,
1224
+ default=RandomMultiModalDataset.DEFAULT_LIMIT_MM_PER_PROMPT,
1225
+ help=(
1226
+ "Per-modality hard caps for items attached per request, e.g. "
1227
+ "'{\"image\": 3, \"video\": 0}'. The sampled per-request item "
1228
+ "count is clamped to the sum of these limits. When a modality "
1229
+ "reaches its cap, its buckets are excluded and probabilities are "
1230
+ "renormalized. "
1231
+ "OBS.: Only image sampling is supported for now."
1232
+ ),
1233
+ )
1234
+
1235
+ def _parse_mm_bucket_config(v: object) -> dict[tuple[int, int, int], float]:
1236
+ # If already a dict (e.g., programmatic call), normalize keys
1237
+ def normalize(d: dict) -> dict[tuple[int, int, int], float]:
1238
+ out: dict[tuple[int, int, int], float] = {}
1239
+ for k, val in d.items():
1240
+ key = k
1241
+ if isinstance(key, str):
1242
+ with suppress(Exception):
1243
+ key = ast.literal_eval(key)
1244
+ if not (isinstance(key, tuple) and len(key) == 3
1245
+ and all(isinstance(x, int) for x in key)):
1246
+ raise ValueError(
1247
+ f"Invalid bucket key {k!r}. Expected tuple (H, W, T)."
1248
+ )
1249
+ out[(int(key[0]), int(key[1]), int(key[2]))] = float(val)
1250
+ return out
1251
+
1252
+ if isinstance(v, dict):
1253
+ return normalize(v)
1254
+ if isinstance(v, str):
1255
+ # Python literal (supports tuple keys)
1256
+ parsed = ast.literal_eval(v)
1257
+ if not isinstance(parsed, dict):
1258
+ raise ValueError("Bucket config must parse to a dict.")
1259
+ return normalize(parsed)
1260
+ raise ValueError("Unsupported value for --random-mm-bucket-config.")
1261
+
1262
+ random_mm_group.add_argument(
1263
+ "--random-mm-bucket-config",
1264
+ type=_parse_mm_bucket_config,
1265
+ default=RandomMultiModalDataset.DEFAULT_MM_ITEM_BUCKET_CONFIG,
1266
+ help=(
1267
+ "The bucket config is a dictionary mapping a multimodal item "
1268
+ "sampling configuration to a probability. "
1269
+ "Currently allows for 2 modalities: images and videos. "
1270
+ "A bucket key is a tuple of (height, width, num_frames). "
1271
+ "The value is the probability of sampling that specific item. "
1272
+ "Example: "
1273
+ "--random-mm-bucket-config "
1274
+ "{(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.10} "
1275
+ "First item: images with resolution 256x256 w.p. 0.5. "
1276
+ "Second item: images with resolution 720x1280 w.p. 0.4. "
1277
+ "Third item: videos with resolution 720x1280 and 16 frames w.p. 0.1. "
1278
+ "OBS.: If the probabilities do not sum to 1, they are normalized. "
1279
+ "OBS bis.: Only image sampling is supported for now."
1280
+ ),
1281
+ )
1282
+
1283
+ hf_group = parser.add_argument_group("hf dataset options")
1284
+ hf_group.add_argument("--hf-subset",
1285
+ type=str,
1286
+ default=None,
1287
+ help="Subset of the HF dataset.")
1288
+ hf_group.add_argument("--hf-split",
1289
+ type=str,
1290
+ default=None,
1291
+ help="Split of the HF dataset.")
1292
+ hf_group.add_argument(
1293
+ "--hf-name",
1294
+ type=str,
1295
+ default=None,
1296
+ help=(
1297
+ "Name of the dataset on HuggingFace "
1298
+ "(e.g., 'lmarena-ai/VisionArena-Chat'). "
1299
+ "Specify this if your dataset-path is a local path."
1300
+ ),
1301
+ )
1302
+ hf_group.add_argument(
1303
+ "--hf-output-len",
1304
+ type=int,
1305
+ default=None,
1306
+ help="Output length for each request. Overrides the output lengths "
1307
+ "from the sampled HF dataset.",
1308
+ )
1309
+
1310
+ prefix_repetition_group = parser.add_argument_group(
1311
+ "prefix repetition dataset options")
1312
+ prefix_repetition_group.add_argument(
1313
+ "--prefix-repetition-prefix-len",
1314
+ type=int,
1315
+ default=256,
1316
+ help="Number of prefix tokens per request, used only for prefix "
1317
+ "repetition dataset.",
1318
+ )
1319
+ prefix_repetition_group.add_argument(
1320
+ "--prefix-repetition-suffix-len",
1321
+ type=int,
1322
+ default=256,
1323
+ help="Number of suffix tokens per request, used only for prefix "
1324
+ "repetition dataset. Total input length is prefix_len + suffix_len.",
1325
+ )
1326
+ prefix_repetition_group.add_argument(
1327
+ "--prefix-repetition-num-prefixes",
1328
+ type=int,
1329
+ default=10,
1330
+ help="Number of prefixes to generate, used only for prefix repetition "
1331
+ "dataset. Prompts per prefix is num_requests // num_prefixes.",
1332
+ )
1333
+ prefix_repetition_group.add_argument(
1334
+ "--prefix-repetition-output-len",
1335
+ type=int,
1336
+ default=128,
1337
+ help="Number of output tokens per request, used only for prefix "
1338
+ "repetition dataset.",
1339
+ )
1340
+
1341
+
1342
+ def get_samples(args, tokenizer) -> list[SampleRequest]:
1343
+
1344
+ if not hasattr(args, "request_id_prefix"):
1345
+ args.request_id_prefix = ""
1346
+
1347
+ if args.dataset_name == "custom":
1348
+ dataset = CustomDataset(dataset_path=args.dataset_path)
1349
+ input_requests = dataset.sample(
1350
+ num_requests=args.num_prompts,
1351
+ tokenizer=tokenizer,
1352
+ output_len=args.custom_output_len,
1353
+ skip_chat_template=args.custom_skip_chat_template,
1354
+ request_id_prefix=args.request_id_prefix,
1355
+ no_oversample=args.no_oversample,
1356
+ )
1357
+
1358
+ elif args.dataset_name == "sonnet":
1359
+ dataset = SonnetDataset(dataset_path=args.dataset_path)
1360
+ # For the "sonnet" dataset, formatting depends on the backend.
1361
+ if args.backend == "openai-chat":
1362
+ input_requests = dataset.sample(
1363
+ num_requests=args.num_prompts,
1364
+ input_len=args.sonnet_input_len,
1365
+ output_len=args.sonnet_output_len,
1366
+ prefix_len=args.sonnet_prefix_len,
1367
+ tokenizer=tokenizer,
1368
+ return_prompt_formatted=False,
1369
+ request_id_prefix=args.request_id_prefix,
1370
+ no_oversample=args.no_oversample,
1371
+ )
1372
+ else:
1373
+ assert tokenizer.chat_template or tokenizer.default_chat_template, (
1374
+ "Tokenizer/model must have chat template for sonnet dataset.")
1375
+ input_requests = dataset.sample(
1376
+ num_requests=args.num_prompts,
1377
+ input_len=args.sonnet_input_len,
1378
+ output_len=args.sonnet_output_len,
1379
+ prefix_len=args.sonnet_prefix_len,
1380
+ tokenizer=tokenizer,
1381
+ return_prompt_formatted=True,
1382
+ request_id_prefix=args.request_id_prefix,
1383
+ no_oversample=args.no_oversample,
1384
+ )
1385
+
1386
+ elif args.dataset_name == "hf":
1387
+ # all following datasets are implemented from the
1388
+ # HuggingFaceDataset base class
1389
+ hf_kwargs = {}
1390
+ if (
1391
+ args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS
1392
+ or args.hf_name in VisionArenaDataset.SUPPORTED_DATASET_PATHS
1393
+ ):
1394
+ dataset_class = VisionArenaDataset
1395
+ args.hf_split = "train"
1396
+ args.hf_subset = None
1397
+ elif (
1398
+ args.dataset_path in MMVUDataset.SUPPORTED_DATASET_PATHS
1399
+ or args.hf_name in MMVUDataset.SUPPORTED_DATASET_PATHS
1400
+ ):
1401
+ dataset_class = MMVUDataset
1402
+ args.hf_split = "validation"
1403
+ args.hf_subset = None
1404
+ elif (
1405
+ args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS
1406
+ or args.hf_name in InstructCoderDataset.SUPPORTED_DATASET_PATHS
1407
+ ):
1408
+ dataset_class = InstructCoderDataset
1409
+ args.hf_split = "train"
1410
+ elif (
1411
+ args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS
1412
+ or args.hf_name in MTBenchDataset.SUPPORTED_DATASET_PATHS
1413
+ ):
1414
+ dataset_class = MTBenchDataset
1415
+ args.hf_split = "train"
1416
+ elif (
1417
+ args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS
1418
+ or args.hf_name in ConversationDataset.SUPPORTED_DATASET_PATHS
1419
+ ):
1420
+ dataset_class = ConversationDataset
1421
+ elif (
1422
+ args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS
1423
+ or args.hf_name in AIMODataset.SUPPORTED_DATASET_PATHS
1424
+ ):
1425
+ dataset_class = AIMODataset
1426
+ args.hf_split = "train"
1427
+ elif (
1428
+ args.dataset_path
1429
+ in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS # noqa: E501
1430
+ or args.hf_name in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS
1431
+ ):
1432
+ dataset_class = NextEditPredictionDataset
1433
+ args.hf_split = "train"
1434
+ elif (
1435
+ args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS
1436
+ or args.hf_name in ASRDataset.SUPPORTED_DATASET_PATHS
1437
+ ):
1438
+ dataset_class = ASRDataset
1439
+ args.hf_split = "train"
1440
+ elif args.dataset_path in BlazeditDataset.SUPPORTED_DATASET_PATHS:
1441
+ dataset_class = BlazeditDataset
1442
+ args.hf_split = "train"
1443
+ hf_kwargs = {
1444
+ "min_distance": args.blazedit_min_distance,
1445
+ "max_distance": args.blazedit_max_distance,
1446
+ }
1447
+ elif (
1448
+ args.dataset_path in MLPerfDataset.SUPPORTED_DATASET_PATHS
1449
+ or args.hf_name in MLPerfDataset.SUPPORTED_DATASET_PATHS
1450
+ ):
1451
+ dataset_class = MLPerfDataset
1452
+ args.hf_split = "train"
1453
+ else:
1454
+ supported_datasets = set([
1455
+ dataset_name for cls in HuggingFaceDataset.__subclasses__()
1456
+ for dataset_name in cls.SUPPORTED_DATASET_PATHS
1457
+ ])
1458
+ raise ValueError(
1459
+ f"Unsupported dataset path: {args.dataset_path}. "
1460
+ "Huggingface dataset only supports dataset_path"
1461
+ f" from one of following: {supported_datasets}. "
1462
+ "Please consider contributing if you would "
1463
+ "like to add support for additional dataset formats.")
1464
+
1465
+ if dataset_class.IS_MULTIMODAL and args.backend not in [
1466
+ "openai-chat",
1467
+ "openai-audio",
1468
+ ]:
1469
+ # multi-modal benchmark is only available on OpenAI Chat
1470
+ # endpoint-type.
1471
+ raise ValueError(
1472
+ "Multi-modal content is only supported on 'openai-chat' and "
1473
+ "'openai-audio' backends.")
1474
+ input_requests = dataset_class(
1475
+ dataset_path=args.dataset_path,
1476
+ dataset_subset=args.hf_subset,
1477
+ dataset_split=args.hf_split,
1478
+ random_seed=args.seed,
1479
+ no_stream=args.no_stream,
1480
+ hf_name=args.hf_name,
1481
+ ).sample(
1482
+ num_requests=args.num_prompts,
1483
+ tokenizer=tokenizer,
1484
+ output_len=args.hf_output_len,
1485
+ request_id_prefix=args.request_id_prefix,
1486
+ no_oversample=args.no_oversample,
1487
+ **hf_kwargs
1488
+ )
1489
+
1490
+ else:
1491
+ # For datasets that follow a similar structure, use a mapping.
1492
+ dataset_mapping = {
1493
+ "spec_bench":
1494
+ lambda: SpecBench(dataset_path=args.dataset_path,
1495
+ category=args.spec_bench_category).sample(
1496
+ num_requests=args.num_prompts,
1497
+ tokenizer=tokenizer,
1498
+ output_len=args.spec_bench_output_len,
1499
+ request_id_prefix=args.request_id_prefix,
1500
+ no_oversample=args.no_oversample,
1501
+ ),
1502
+ "sharegpt": lambda: ShareGPTDataset(
1503
+ random_seed=args.seed, dataset_path=args.dataset_path
1504
+ ).sample(
1505
+ tokenizer=tokenizer,
1506
+ num_requests=args.num_prompts,
1507
+ output_len=args.sharegpt_output_len,
1508
+ request_id_prefix=args.request_id_prefix,
1509
+ no_oversample=args.no_oversample,
1510
+ ),
1511
+ "burstgpt": lambda: BurstGPTDataset(
1512
+ random_seed=args.seed, dataset_path=args.dataset_path
1513
+ ).sample(
1514
+ tokenizer=tokenizer,
1515
+ num_requests=args.num_prompts,
1516
+ request_id_prefix=args.request_id_prefix,
1517
+ no_oversample=args.no_oversample,
1518
+ ),
1519
+ "random": lambda: RandomDataset(
1520
+ random_seed=args.seed, dataset_path=args.dataset_path
1521
+ ).sample(
1522
+ tokenizer=tokenizer,
1523
+ num_requests=args.num_prompts,
1524
+ prefix_len=args.random_prefix_len,
1525
+ input_len=args.random_input_len,
1526
+ output_len=args.random_output_len,
1527
+ range_ratio=args.random_range_ratio,
1528
+ request_id_prefix=args.request_id_prefix,
1529
+ batchsize=args.random_batch_size,
1530
+ no_oversample=args.no_oversample,
1531
+ ),
1532
+ "random-mm":
1533
+ lambda: RandomMultiModalDataset(
1534
+ random_seed=args.seed, dataset_path=args.dataset_path
1535
+ ).sample(
1536
+ tokenizer=tokenizer,
1537
+ num_requests=args.num_prompts,
1538
+ prefix_len=args.random_prefix_len,
1539
+ range_ratio=args.random_range_ratio,
1540
+ input_len=args.random_input_len,
1541
+ output_len=args.random_output_len,
1542
+ base_items_per_request=args.random_mm_base_items_per_request,
1543
+ limit_mm_per_prompt=args.random_mm_limit_mm_per_prompt,
1544
+ num_mm_items_range_ratio=args.random_mm_num_mm_items_range_ratio,
1545
+ bucket_config=args.random_mm_bucket_config,
1546
+ request_id_prefix=args.request_id_prefix,
1547
+ no_oversample=args.no_oversample,
1548
+ ),
1549
+ "prefix_repetition":
1550
+ lambda: PrefixRepetitionRandomDataset(
1551
+ random_seed=args.seed, dataset_path=args.dataset_path
1552
+ ).sample(
1553
+ tokenizer=tokenizer,
1554
+ num_requests=args.num_prompts,
1555
+ prefix_len=args.prefix_repetition_prefix_len,
1556
+ suffix_len=args.prefix_repetition_suffix_len,
1557
+ num_prefixes=args.prefix_repetition_num_prefixes,
1558
+ output_len=args.prefix_repetition_output_len,
1559
+ request_id_prefix=args.request_id_prefix,
1560
+ no_oversample=args.no_oversample,
1561
+ ),
1562
+ }
1563
+
1564
+ try:
1565
+ # Enforce endpoint compatibility for multimodal datasets.
1566
+ if args.dataset_name == "random-mm" and args.backend not in [
1567
+ "openai-chat"]:
1568
+ raise ValueError(
1569
+ "Multi-modal content (images) is only supported on "
1570
+ "'openai-chat' backend."
1571
+ )
1572
+ input_requests = dataset_mapping[args.dataset_name]()
1573
+ except KeyError as err:
1574
+ raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
1575
+
1576
+ return input_requests
1577
+
1578
+
1579
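For reference, the supported-path check at the top of this hunk simply walks `HuggingFaceDataset.__subclasses__()`. The short sketch below is not part of the diff; it assumes the module is importable as `vllm.benchmarks.datasets` and just lists the Hugging Face dataset paths the HF branch will accept.

```python
# Hedged sketch: enumerate the dataset paths accepted by the HF branch above.
from vllm.benchmarks.datasets import HuggingFaceDataset

supported = sorted(
    path
    for cls in HuggingFaceDataset.__subclasses__()
    for path in cls.SUPPORTED_DATASET_PATHS
)
print("\n".join(supported))
```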
+ # -----------------------------------------------------------------------------
1580
+ # Custom Dataset Implementation
1581
+ # -----------------------------------------------------------------------------
1582
+
1583
+
1584
+ class CustomDataset(BenchmarkDataset):
1585
+ """
1586
+ Implements the Custom dataset. Loads data from a JSONL file and generates
1587
+ sample requests based on conversation turns. E.g.,
1588
+ ```
1589
+ {"prompt": "What is the capital of India?"}
1590
+ {"prompt": "What is the capital of Iran?"}
1591
+ {"prompt": "What is the capital of China?"}
1592
+ ```
1593
+ """
1594
+
1595
+ def __init__(self, **kwargs) -> None:
1596
+ super().__init__(**kwargs)
1597
+ self.load_data()
1598
+
1599
+ def load_data(self) -> None:
1600
+ if self.dataset_path is None:
1601
+ raise ValueError("dataset_path must be provided for loading data.")
1602
+
1603
+ # self.data will be a list of dictionaries
1604
+ # e.g., [{"prompt": "What is the capital of India?"}, ...]
1605
+ # This will be the standardized format which load_data()
1606
+ # has to convert into depending on the filetype of dataset_path.
1607
+ # sample() will assume this standardized format of self.data
1608
+ self.data = []
1609
+
1610
+ # Load the JSONL file
1611
+ if self.dataset_path.endswith(".jsonl"):
1612
+ jsonl_data = pd.read_json(path_or_buf=self.dataset_path,
1613
+ lines=True)
1614
+
1615
+ # check if the JSONL file has a 'prompt' column
1616
+ if "prompt" not in jsonl_data.columns:
1617
+ raise ValueError("JSONL file must contain a 'prompt' column.")
1618
+
1619
+ # Convert each row to a dictionary and append to self.data
1620
+ # This will convert the DataFrame to a list of dictionaries
1621
+ # where each dictionary corresponds to a row in the DataFrame.
1622
+ # This is the standardized format we want for self.data
1623
+ for _, row in jsonl_data.iterrows():
1624
+ self.data.append(row.to_dict())
1625
+ else:
1626
+ raise NotImplementedError(
1627
+ "Only JSONL format is supported for CustomDataset.")
1628
+
1629
+ random.seed(self.random_seed)
1630
+ random.shuffle(self.data)
1631
+
1632
+ def sample(
1633
+ self,
1634
+ tokenizer: PreTrainedTokenizerBase,
1635
+ num_requests: int,
1636
+ lora_path: Optional[str] = None,
1637
+ max_loras: Optional[int] = None,
1638
+ output_len: Optional[int] = None,
1639
+ enable_multimodal_chat: bool = False,
1640
+ skip_chat_template: bool = False,
1641
+ request_id_prefix: str = "",
1642
+ no_oversample: bool = False,
1643
+ **kwargs,
1644
+ ) -> list:
1645
+ # load all data if needed
1646
+ self.num_available_samples = len(self.data)
1647
+ if num_requests <= 0:
1648
+ num_requests = self.num_available_samples
1649
+ logger.info("num_requests is set to 0 or negative, "
1650
+ "so using all available samples: %d",
1651
+ num_requests)
1652
+
1653
+ sampled_requests = []
1654
+ for i, item in enumerate(self.data):
1655
+ if len(sampled_requests) >= num_requests:
1656
+ break
1657
+ prompt = item["prompt"]
1658
+
1659
+ # apply template
1660
+ if not skip_chat_template:
1661
+ prompt = tokenizer.apply_chat_template(
1662
+ [{
1663
+ "role": "user",
1664
+ "content": prompt
1665
+ }],
1666
+ add_generation_prompt=True,
1667
+ tokenize=False,
1668
+ )
1669
+
1670
+ prompt_len = len(tokenizer(prompt).input_ids)
1671
+ sampled_requests.append(
1672
+ SampleRequest(
1673
+ prompt=prompt,
1674
+ prompt_len=prompt_len,
1675
+ expected_output_len=output_len,
1676
+ request_id=request_id_prefix + str(i),
1677
+ ))
1678
+ self.maybe_oversample_requests(sampled_requests, num_requests,
1679
+ request_id_prefix, no_oversample)
1680
+
1681
+ return sampled_requests
1682
+
1683
+
1684
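A minimal usage sketch for `CustomDataset` (illustrative only, not part of the package): it writes a two-line JSONL file in the format shown in the docstring above and samples from it. The `gpt2` tokenizer and the import path are assumptions; `skip_chat_template=True` avoids needing a chat template.

```python
import json

from transformers import AutoTokenizer

from vllm.benchmarks.datasets import CustomDataset  # assumed import path

# Build a tiny JSONL file matching the docstring's example format.
with open("prompts.jsonl", "w") as f:
    for question in ("What is the capital of India?",
                     "What is the capital of Iran?"):
        f.write(json.dumps({"prompt": question}) + "\n")

tokenizer = AutoTokenizer.from_pretrained("gpt2")
requests = CustomDataset(dataset_path="prompts.jsonl", random_seed=0).sample(
    tokenizer=tokenizer,
    num_requests=2,
    output_len=64,
    skip_chat_template=True,  # gpt2 ships without a chat template
)
print([(r.prompt_len, r.expected_output_len) for r in requests])
```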
+ # -----------------------------------------------------------------------------
1685
+ # Spec Bench Dataset Implementation
1686
+ # -----------------------------------------------------------------------------
1687
+
1688
+
1689
+ class SpecBench(CustomDataset):
1690
+ """
1691
+ Implements the SpecBench dataset: https://github.com/hemingkx/Spec-Bench
1692
+ Download the dataset using:
1693
+ wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
1694
+ """ # noqa: E501
1695
+
1696
+ def __init__(self, **kwargs) -> None:
1697
+ self.category = kwargs.pop("category", None)
1698
+ super().__init__(**kwargs)
1699
+ self.load_data()
1700
+
1701
+ def load_data(self) -> None:
1702
+ if self.dataset_path is None:
1703
+ raise ValueError("dataset_path must be provided for loading data.")
1704
+
1705
+ self.data = []
1706
+
1707
+ # Load the JSONL file
1708
+ jsonl_data = pd.read_json(path_or_buf=self.dataset_path,
1709
+ lines=True)
1710
+
1711
+ # check if the JSONL file has a 'turns' column
1712
+ if "turns" not in jsonl_data.columns:
1713
+ raise ValueError("JSONL file must contain a 'turns' column.")
1714
+
1715
+ for _, row in jsonl_data.iterrows():
1716
+ # sample only from a specific category if specified
1717
+ if (not self.category) or (self.category == row['category']):
1718
+ prompt = row["turns"][0]
1719
+ self.data.append({"prompt": prompt})
1720
+
1721
+ random.seed(self.random_seed)
1722
+ random.shuffle(self.data)
1723
+
1724
+ def sample(self, **kwargs) -> list:
1725
+ # leverage CustomDataset sample
1726
+ kwargs["skip_chat_template"] = False
1727
+ return super().sample(**kwargs)
1728
+
1729
+
1730
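An illustrative way to drive `SpecBench` directly (a sketch, not part of the diff): download `question.jsonl` as described in the docstring, then sample one category. The model name is an arbitrary chat-template-capable example, and the category value is an assumption about the file's `category` column.

```python
from transformers import AutoTokenizer

from vllm.benchmarks.datasets import SpecBench  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
requests = SpecBench(
    dataset_path="question.jsonl",  # file fetched with the wget above
    category="translation",         # assumed category name; None keeps all rows
).sample(
    tokenizer=tokenizer,
    num_requests=8,
    output_len=256,
)
print(len(requests), requests[0].prompt_len)
```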
+ # -----------------------------------------------------------------------------
1731
+ # Sonnet Dataset Implementation
1732
+ # -----------------------------------------------------------------------------
1733
+
1734
+ @deprecated(
1735
+ "SonnetDataset is deprecated and will be removed in a future version.",
1736
+ )
1737
+ class SonnetDataset(BenchmarkDataset):
1738
+ """
1739
+ Simplified implementation of the Sonnet dataset. Loads poem lines from a
1740
+ text file and generates sample requests. Default values here are copied from
1741
+ `benchmark_serving.py` for the sonnet dataset.
1742
+ """
1743
+
1744
+ DEFAULT_PREFIX_LEN = 200
1745
+ DEFAULT_INPUT_LEN = 550
1746
+ DEFAULT_OUTPUT_LEN = 150
1747
+
1748
+ def __init__(
1749
+ self,
1750
+ **kwargs,
1751
+ ) -> None:
1752
+ super().__init__(**kwargs)
1753
+ self.load_data()
1754
+
1755
+ def load_data(self) -> None:
1756
+ if not self.dataset_path:
1757
+ raise ValueError("dataset_path must be provided.")
1758
+ with open(self.dataset_path, encoding="utf-8") as f:
1759
+ self.data = f.readlines()
1760
+
1761
+ def sample(
1762
+ self,
1763
+ tokenizer,
1764
+ num_requests: int,
1765
+ prefix_len: int = DEFAULT_PREFIX_LEN,
1766
+ input_len: int = DEFAULT_INPUT_LEN,
1767
+ output_len: int = DEFAULT_OUTPUT_LEN,
1768
+ return_prompt_formatted: bool = False,
1769
+ request_id_prefix: str = "",
1770
+ no_oversample: bool = False,
1771
+ **kwargs,
1772
+ ) -> list:
1773
+ # Calculate average token length for a poem line.
1774
+ tokenized_lines = [tokenizer(line).input_ids for line in self.data]
1775
+ avg_len = sum(len(tokens)
1776
+ for tokens in tokenized_lines) / len(tokenized_lines)
1777
+
1778
+ # Build the base prompt.
1779
+ base_prompt = "Pick as many lines as you can from these poem lines:\n"
1780
+ base_msg = [{"role": "user", "content": base_prompt}]
1781
+ base_fmt = tokenizer.apply_chat_template(base_msg,
1782
+ add_generation_prompt=True,
1783
+ tokenize=False)
1784
+ base_offset = len(tokenizer(base_fmt).input_ids)
1785
+ if input_len <= base_offset:
1786
+ raise ValueError(
1787
+ f"'input_len' must be higher than the base prompt length "
1788
+ f"({base_offset}).")
1789
+
1790
+ # Determine how many poem lines to use.
1791
+ num_input_lines = round((input_len - base_offset) / avg_len)
1792
+ num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
1793
+ prefix_lines = self.data[:num_prefix_lines]
1794
+
1795
+ samples = []
1796
+ ind = 0
1797
+ while len(samples) < num_requests:
1798
+ extra_lines = random.choices(self.data,
1799
+ k=num_input_lines - num_prefix_lines)
1800
+ prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
1801
+ msg = [{"role": "user", "content": prompt}]
1802
+ prompt_formatted = tokenizer.apply_chat_template(
1803
+ msg, add_generation_prompt=True, tokenize=False)
1804
+ prompt_len = len(tokenizer(prompt_formatted).input_ids)
1805
+ if prompt_len <= input_len:
1806
+ samples.append(
1807
+ SampleRequest(
1808
+ prompt=prompt_formatted
1809
+ if return_prompt_formatted else prompt,
1810
+ prompt_len=prompt_len,
1811
+ expected_output_len=output_len,
1812
+ request_id=request_id_prefix + str(ind),
1813
+ ))
1814
+ ind += 1
1815
+ return samples
1816
+
1817
+
1818
+ # -----------------------------------------------------------------------------
1819
+ # BurstGPT Dataset Implementation
1820
+ # -----------------------------------------------------------------------------
1821
+
1822
+
1823
+ class BurstGPTDataset(BenchmarkDataset):
1824
+ """
1825
+ Implements the BurstGPT dataset. Loads data from a CSV file and generates
1826
+ sample requests based on synthetic prompt generation. Only rows with Model
1827
+ "GPT-4" and positive response tokens are used.
1828
+ """
1829
+
1830
+ def __init__(self, **kwargs) -> None:
1831
+ super().__init__(**kwargs)
1832
+ self.load_data()
1833
+
1834
+ def load_data(self, ):
1835
+ if self.dataset_path is None:
1836
+ raise ValueError("dataset_path must be provided for loading data.")
1837
+
1838
+ df = pd.read_csv(self.dataset_path)
1839
+ # Filter to keep only GPT-4 rows.
1840
+ gpt4_df = df[df["Model"] == "GPT-4"]
1841
+ # Remove failed requests (where Response tokens is 0 or less).
1842
+ gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
1843
+ # Keep the filtered rows; sampling happens later in _sample_loaded_data().
1844
+ self.data = gpt4_df
1845
+
1846
+ def _sample_loaded_data(self, num_requests: int) -> list:
1847
+ if num_requests <= len(self.data):
1848
+ data = self.data.sample(n=num_requests,
1849
+ random_state=self.random_seed)
1850
+ else:
1851
+ data = self.data.sample(
1852
+ n=num_requests,
1853
+ random_state=self.random_seed,
1854
+ replace=True,
1855
+ )
1856
+ # Convert the dataframe to a list of lists.
1857
+ return data.values.tolist()
1858
+
1859
+ def sample(
1860
+ self,
1861
+ tokenizer: PreTrainedTokenizerBase,
1862
+ num_requests: int,
1863
+ max_loras: Optional[int] = None,
1864
+ lora_path: Optional[str] = None,
1865
+ request_id_prefix: str = "",
1866
+ no_oversample: bool = False,
1867
+ **kwargs,
1868
+ ) -> list[SampleRequest]:
1869
+ samples = []
1870
+ data = self._sample_loaded_data(num_requests=num_requests)
1871
+ for i in range(num_requests):
1872
+ input_len = int(data[i][2])
1873
+ output_len = int(data[i][3])
1874
+ lora_req = self.get_random_lora_request(
1875
+ max_loras=max_loras, lora_path=lora_path)
1876
+ vocab_size = tokenizer.vocab_size
1877
+ # Generate a synthetic prompt: a list of token IDs computed as (i +
1878
+ # j) modulo vocab_size.
1879
+ token_ids = [(i + j) % vocab_size for j in range(input_len)]
1880
+ prompt = tokenizer.decode(token_ids)
1881
+ samples.append(
1882
+ SampleRequest(
1883
+ prompt=prompt,
1884
+ prompt_len=input_len,
1885
+ expected_output_len=output_len,
1886
+ lora_request=lora_req,
1887
+ request_id=request_id_prefix + str(i),
1888
+ ))
1889
+ return samples
1890
+
1891
+
1892
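A small aside on `_sample_loaded_data` above: pandas can only return more rows than the frame contains when `replace=True`, which is why the oversampling branch exists. A toy frame (illustrative values only) makes the behaviour easy to check.

```python
import pandas as pd

df = pd.DataFrame({"Request tokens": [10, 20, 30],
                   "Response tokens": [1, 2, 3]})

print(df.sample(n=2, random_state=0))                # plain subsample
print(df.sample(n=5, random_state=0, replace=True))  # oversample with replacement
```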
+ # -----------------------------------------------------------------------------
1893
+ # HuggingFace Dataset Base Implementation
1894
+ # -----------------------------------------------------------------------------
1895
+ class HuggingFaceDataset(BenchmarkDataset):
1896
+ """Base class for datasets hosted on HuggingFace."""
1897
+
1898
+ SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()
1899
+
1900
+ def __init__(
1901
+ self,
1902
+ dataset_path: str,
1903
+ dataset_split: str,
1904
+ no_stream: bool = False,
1905
+ dataset_subset: Optional[str] = None,
1906
+ hf_name: Optional[str] = None,
1907
+ **kwargs,
1908
+ ) -> None:
1909
+ super().__init__(dataset_path=dataset_path, **kwargs)
1910
+
1911
+ self.dataset_split = dataset_split
1912
+ self.dataset_subset = dataset_subset
1913
+ self.load_stream = not no_stream
1914
+ self.hf_name = hf_name or dataset_path
1915
+ self.load_data()
1916
+
1917
+ def load_data(self) -> None:
1918
+ """Load data from HuggingFace datasets."""
1919
+ self.data = load_dataset(
1920
+ self.dataset_path,
1921
+ name=self.dataset_subset,
1922
+ split=self.dataset_split,
1923
+ streaming=self.load_stream,
1924
+ )
1925
+ self.data = self.data.shuffle(seed=self.random_seed)
1926
+
1927
+
1928
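The `load_data()` above is a thin wrapper over `datasets.load_dataset`. The snippet below reproduces it standalone, as a sketch only, with `philschmid/mt-bench` and the `train` split as assumed examples, to show the streaming + shuffle pattern.

```python
from datasets import load_dataset

ds = load_dataset(
    "philschmid/mt-bench",  # any path from a subclass's SUPPORTED_DATASET_PATHS
    name=None,
    split="train",          # assumed split name
    streaming=True,
)
ds = ds.shuffle(seed=0)
print(next(iter(ds)).keys())
```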
+ # -----------------------------------------------------------------------------
1929
+ # Conversation Dataset Implementation
1930
+ # -----------------------------------------------------------------------------
1931
+
1932
+
1933
+ class ConversationDataset(HuggingFaceDataset):
1934
+ """Dataset for conversation data with multimodal support."""
1935
+ SUPPORTED_DATASET_PATHS = {
1936
+ 'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
1937
+ }
1938
+ IS_MULTIMODAL = True
1939
+
1940
+ def sample(self,
1941
+ tokenizer: PreTrainedTokenizerBase,
1942
+ num_requests: int,
1943
+ output_len: Optional[int] = None,
1944
+ enable_multimodal_chat: bool = False,
1945
+ request_id_prefix: str = "",
1946
+ no_oversample: bool = False,
1947
+ **kwargs) -> list:
1948
+ # Filter examples with at least 2 conversations
1949
+ filtered_data = self.data.filter(
1950
+ lambda x: len(x["conversations"]) >= 2)
1951
+ sampled_requests = []
1952
+ ind = 0
1953
+ dynamic_output = output_len is None
1954
+
1955
+ for item in filtered_data:
1956
+ if len(sampled_requests) >= num_requests:
1957
+ break
1958
+ conv = item["conversations"]
1959
+ prompt, completion = conv[0]["value"], conv[1]["value"]
1960
+
1961
+ prompt_ids = tokenizer(prompt).input_ids
1962
+ completion_ids = tokenizer(completion).input_ids
1963
+ prompt_len = len(prompt_ids)
1964
+ completion_len = len(completion_ids)
1965
+ output_len = completion_len if dynamic_output else output_len
1966
+ assert isinstance(output_len, int) and output_len > 0
1967
+ if dynamic_output and not is_valid_sequence(
1968
+ prompt_len, completion_len):
1969
+ continue
1970
+ mm_content = process_image(
1971
+ item["image"]) if "image" in item else None
1972
+ if enable_multimodal_chat:
1973
+ # Note: when chat is enabled the request prompt_len is no longer
1974
+ # accurate and we will be using request output to count the
1975
+ # actual prompt len and output len
1976
+ prompt = self.apply_multimodal_chat_transformation(
1977
+ prompt, mm_content)
1978
+ sampled_requests.append(
1979
+ SampleRequest(
1980
+ prompt=prompt,
1981
+ prompt_len=prompt_len,
1982
+ expected_output_len=output_len,
1983
+ multi_modal_data=mm_content,
1984
+ request_id=request_id_prefix + str(ind),
1985
+ ))
1986
+ ind += 1
1987
+ self.maybe_oversample_requests(sampled_requests, num_requests,
1988
+ request_id_prefix, no_oversample)
1989
+ return sampled_requests
1990
+
1991
+
1992
+ # -----------------------------------------------------------------------------
1993
+ # Vision Arena Dataset Implementation
1994
+ # -----------------------------------------------------------------------------
1995
+
1996
+
1997
+ class VisionArenaDataset(HuggingFaceDataset):
1998
+ """
1999
+ Vision Arena Dataset.
2000
+ """
2001
+
2002
+ DEFAULT_OUTPUT_LEN = 128
2003
+ SUPPORTED_DATASET_PATHS = {
2004
+ "lmarena-ai/VisionArena-Chat":
2005
+ lambda x: x["conversation"][0][0]["content"],
2006
+ "lmarena-ai/vision-arena-bench-v0.1":
2007
+ lambda x: x["turns"][0][0]["content"]
2008
+ }
2009
+ IS_MULTIMODAL = True
2010
+
2011
+ def sample(
2012
+ self,
2013
+ tokenizer: PreTrainedTokenizerBase,
2014
+ num_requests: int,
2015
+ output_len: Optional[int] = None,
2016
+ enable_multimodal_chat: bool = False,
2017
+ request_id_prefix: str = "",
2018
+ no_oversample: bool = False,
2019
+ **kwargs,
2020
+ ) -> list:
2021
+ output_len = (output_len
2022
+ if output_len is not None else self.DEFAULT_OUTPUT_LEN)
2023
+ sampled_requests = []
2024
+ for i, item in enumerate(self.data):
2025
+ if len(sampled_requests) >= num_requests:
2026
+ break
2027
+ parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name)
2028
+ if parser_fn is None:
2029
+ raise ValueError(f"Unsupported dataset path: {self.hf_name}")
2030
+ prompt = parser_fn(item)
2031
+ mm_content = process_image(item["images"][0])
2032
+ prompt_len = len(tokenizer(prompt).input_ids)
2033
+ if enable_multimodal_chat:
2034
+ # Note: when chat is enabled the request prompt_len is no longer
2035
+ # accurate and we will be using request output to count the
2036
+ # actual prompt len
2037
+ prompt = self.apply_multimodal_chat_transformation(
2038
+ prompt, mm_content)
2039
+ sampled_requests.append(
2040
+ SampleRequest(
2041
+ prompt=prompt,
2042
+ prompt_len=prompt_len,
2043
+ expected_output_len=output_len,
2044
+ multi_modal_data=mm_content,
2045
+ request_id=request_id_prefix + str(i),
2046
+ ))
2047
+ self.maybe_oversample_requests(sampled_requests, num_requests,
2048
+ request_id_prefix, no_oversample)
2049
+ return sampled_requests
2050
+
2051
+
2052
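The `SUPPORTED_DATASET_PATHS` values above double as prompt extractors; a toy record (shape inferred from the lambda, values made up) shows what they pull out.

```python
from vllm.benchmarks.datasets import VisionArenaDataset  # assumed import path

parser = VisionArenaDataset.SUPPORTED_DATASET_PATHS["lmarena-ai/VisionArena-Chat"]
record = {"conversation": [[{"content": "Describe this image."}]]}
print(parser(record))  # -> "Describe this image."
```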
+ class MMVUDataset(HuggingFaceDataset):
2053
+ """
2054
+ MMVU Dataset.
2055
+ https://huggingface.co/datasets/yale-nlp/MMVU
2056
+ """
2057
+
2058
+ DEFAULT_OUTPUT_LEN = 128
2059
+ SUPPORTED_DATASET_PATHS = {
2060
+ "yale-nlp/MMVU":
2061
+ lambda x: x["question"] + " " + (
2062
+ " ".join(f"{k}.{v}" for k, v in x["choices"].items())
2063
+ ),
2064
+ }
2065
+
2066
+ def sample(
2067
+ self,
2068
+ tokenizer: PreTrainedTokenizerBase,
2069
+ num_requests: int,
2070
+ output_len: Optional[int] = None,
2071
+ enable_multimodal_chat: bool = False,
2072
+ request_id_prefix: str = "",
2073
+ no_oversample: bool = False,
2074
+ **kwargs,
2075
+ ) -> list:
2076
+ output_len = (output_len
2077
+ if output_len is not None else self.DEFAULT_OUTPUT_LEN)
2078
+ sampled_requests = []
2079
+ for i, item in enumerate(self.data):
2080
+ if len(sampled_requests) >= num_requests:
2081
+ break
2082
+ parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name)
2083
+ if parser_fn is None:
2084
+ raise ValueError(f"Unsupported dataset path: {self.hf_name}")
2085
+ prompt = parser_fn(item)
2086
+ mm_content = process_video(item["video"])
2087
+ prompt_len = len(tokenizer(prompt).input_ids)
2088
+ if enable_multimodal_chat:
2089
+ # Note: when chat is enabled the request prompt_len is no longer
2090
+ # accurate and we will be using request output to count the
2091
+ # actual prompt len
2092
+ prompt = self.apply_multimodal_chat_transformation(
2093
+ prompt, mm_content)
2094
+ sampled_requests.append(
2095
+ SampleRequest(
2096
+ prompt=prompt,
2097
+ prompt_len=prompt_len,
2098
+ expected_output_len=output_len,
2099
+ multi_modal_data=mm_content,
2100
+ request_id=request_id_prefix + str(i),
2101
+ ))
2102
+ self.maybe_oversample_requests(sampled_requests, num_requests,
2103
+ request_id_prefix, no_oversample)
2104
+ return sampled_requests
2105
+
2106
+
2107
+ # -----------------------------------------------------------------------------
2108
+ # Instruct Coder Dataset Implementation
2109
+ # -----------------------------------------------------------------------------
2110
+
2111
+
2112
+ class InstructCoderDataset(HuggingFaceDataset):
2113
+ """
2114
+ InstructCoder Dataset.
2115
+ https://huggingface.co/datasets/likaixin/InstructCoder
2116
+
2117
+ InstructCoder is a dataset designed for general code editing. It consists
2118
+ of 114,239 instruction-input-output triplets, and covers multiple distinct
2119
+ code editing scenarios.
2120
+ """
2121
+
2122
+ DEFAULT_OUTPUT_LEN = 200 # this is the average default output length
2123
+ SUPPORTED_DATASET_PATHS = {
2124
+ "likaixin/InstructCoder",
2125
+ }
2126
+
2127
+ def sample(self,
2128
+ tokenizer: PreTrainedTokenizerBase,
2129
+ num_requests: int,
2130
+ output_len: Optional[int] = None,
2131
+ enable_multimodal_chat: bool = False,
2132
+ request_id_prefix: str = "",
2133
+ no_oversample: bool = False,
2134
+ **kwargs) -> list:
2135
+ output_len = (output_len
2136
+ if output_len is not None else self.DEFAULT_OUTPUT_LEN)
2137
+ sampled_requests = []
2138
+ for i, item in enumerate(self.data):
2139
+ if len(sampled_requests) >= num_requests:
2140
+ break
2141
+ prompt = (
2142
+ f"{item['input']}\n\n{item['instruction']} Just output "
2143
+ "the code, do not include any explanation."
2144
+ )
2145
+
2146
+ # apply template
2147
+ prompt = tokenizer.apply_chat_template(
2148
+ [{
2149
+ "role": "user",
2150
+ "content": prompt
2151
+ }],
2152
+ add_generation_prompt=True,
2153
+ tokenize=False,
2154
+ )
2155
+
2156
+ prompt_len = len(tokenizer(prompt).input_ids)
2157
+ sampled_requests.append(
2158
+ SampleRequest(
2159
+ prompt=prompt,
2160
+ prompt_len=prompt_len,
2161
+ expected_output_len=output_len,
2162
+ request_id=request_id_prefix + str(i),
2163
+ ))
2164
+ self.maybe_oversample_requests(sampled_requests, num_requests,
2165
+ request_id_prefix, no_oversample)
2166
+ return sampled_requests
2167
+
2168
+
2169
+ # -----------------------------------------------------------------------------
2170
+ # MT-Bench Dataset Implementation
2171
+ # -----------------------------------------------------------------------------
2172
+
2173
+
2174
+ class MTBenchDataset(HuggingFaceDataset):
2175
+ """
2176
+ MT-Bench Dataset.
2177
+ https://huggingface.co/datasets/philschmid/mt-bench
2178
+
2179
+ We create a single turn dataset for MT-Bench.
2180
+ This is similar to Spec decoding benchmark setup in vLLM
2181
+ https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
2182
+ """ # noqa: E501
2183
+
2184
+ DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM
2185
+ SUPPORTED_DATASET_PATHS = {
2186
+ "philschmid/mt-bench",
2187
+ }
2188
+
2189
+ def sample(
2190
+ self,
2191
+ tokenizer: PreTrainedTokenizerBase,
2192
+ num_requests: int,
2193
+ output_len: Optional[int] = None,
2194
+ enable_multimodal_chat: bool = False,
2195
+ request_id_prefix: str = "",
2196
+ no_oversample: bool = False,
2197
+ **kwargs,
2198
+ ) -> list:
2199
+ output_len = (output_len
2200
+ if output_len is not None else self.DEFAULT_OUTPUT_LEN)
2201
+ sampled_requests = []
2202
+
2203
+ for i, item in enumerate(self.data):
2204
+ if len(sampled_requests) >= num_requests:
2205
+ break
2206
+ prompt = item["turns"][0]
2207
+
2208
+ # apply template
2209
+ prompt = tokenizer.apply_chat_template(
2210
+ [{
2211
+ "role": "user",
2212
+ "content": prompt
2213
+ }],
2214
+ add_generation_prompt=True,
2215
+ tokenize=False,
2216
+ )
2217
+
2218
+ prompt_len = len(tokenizer(prompt).input_ids)
2219
+ sampled_requests.append(
2220
+ SampleRequest(
2221
+ prompt=prompt,
2222
+ prompt_len=prompt_len,
2223
+ expected_output_len=output_len,
2224
+ request_id=request_id_prefix + str(i),
2225
+ ))
2226
+ self.maybe_oversample_requests(sampled_requests, num_requests,
2227
+ request_id_prefix, no_oversample)
2228
+ return sampled_requests
2229
+
2230
+
2231
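A hedged end-to-end sketch for `MTBenchDataset` (not part of the diff): stream a few single-turn MT-Bench prompts through a chat template. The model name, import path, and the `train` split are assumptions.

```python
from transformers import AutoTokenizer

from vllm.benchmarks.datasets import MTBenchDataset  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
requests = MTBenchDataset(
    dataset_path="philschmid/mt-bench",
    dataset_split="train",  # assumed split name
).sample(tokenizer=tokenizer, num_requests=4)
print([r.expected_output_len for r in requests])  # DEFAULT_OUTPUT_LEN (256) each
```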
+ # -----------------------------------------------------------------------------
2232
+ # Blazedit Dataset Implementation
2233
+ # -----------------------------------------------------------------------------
2234
+
2235
+
2236
+ class BlazeditDataset(HuggingFaceDataset):
2237
+ """
2238
+ Blazedit Dataset.
2239
+ https://github.com/ise-uiuc/blazedit
2240
+
2241
+ 5k char version: vdaita/edit_5k_char
2242
+ 10k char version: vdaita/edit_10k_char
2243
+ """ # noqa: E501
2244
+
2245
+ # 5k char version will have output as ~5k chars
2246
+ # 10k char version will have output as ~10k chars
2247
+ # Assuming 3 char per token, 10k chars will be 3333 tokens
2248
+ # We set default to 4000 to be safe
2249
+ DEFAULT_OUTPUT_LEN = 4000
2250
+ SUPPORTED_DATASET_PATHS = {
2251
+ "vdaita/edit_5k_char",
2252
+ "vdaita/edit_10k_char",
2253
+ }
2254
+
2255
+ def sample(
2256
+ self,
2257
+ tokenizer: PreTrainedTokenizerBase,
2258
+ num_requests: int,
2259
+ output_len: Optional[int] = None,
2260
+ request_id_prefix: str = "",
2261
+ no_oversample: bool = False,
2262
+ min_distance: float = 0.0,
2263
+ max_distance: float = 1.0,
2264
+ **kwargs,
2265
+ ) -> list:
2266
+ output_len = (output_len
2267
+ if output_len is not None else self.DEFAULT_OUTPUT_LEN)
2268
+ sampled_requests = []
2269
+
2270
+ for i, item in enumerate(self.data):
2271
+ if len(sampled_requests) >= num_requests:
2272
+ break
2273
+ code = item["code"]
2274
+ change_request = item["change_request"]
2275
+ norm_distance = item["norm_distance"]
2276
+
2277
+ # compare the levenshtein distance normalized by code length
2278
+ if norm_distance < min_distance or norm_distance > max_distance:
2279
+ continue
2280
+
2281
+ # template copied from
2282
+ # https://github.com/ise-uiuc/blazedit/blob/7765137e656fd62de877422d2e4cf8de51228054/dataset/create_refined_dataset.py#L94-L105 # noqa: E501
2283
+ instruction = f"""Given a code file, please apply the change requests and generate the new file.
2284
+
2285
+ Original file:
2286
+ ```python
2287
+ {code}
2288
+ ```
2289
+
2290
+ Change request:
2291
+ {change_request}
2292
+
2293
+ Please generate the new code file in the "New file" section below.""" # noqa: E501
2294
+
2295
+ # apply template
2296
+ prompt = tokenizer.apply_chat_template(
2297
+ [{
2298
+ "role": "user",
2299
+ "content": instruction
2300
+ }],
2301
+ add_generation_prompt=True,
2302
+ tokenize=False,
2303
+ )
2304
+
2305
+ prompt_len = len(tokenizer(prompt).input_ids)
2306
+
2307
+ sampled_requests.append(
2308
+ SampleRequest(
2309
+ prompt=prompt,
2310
+ prompt_len=prompt_len,
2311
+ expected_output_len=output_len,
2312
+ request_id=request_id_prefix + str(i),
2313
+ ))
2314
+ self.maybe_oversample_requests(sampled_requests, num_requests,
2315
+ request_id_prefix, no_oversample)
2316
+
2317
+ return sampled_requests
2318
+
2319
+
2320
+ # -----------------------------------------------------------------------------
2321
+ # AIMO Dataset Implementation
2322
+ # -----------------------------------------------------------------------------
2323
+
2324
+
2325
+ class AIMODataset(HuggingFaceDataset):
2326
+ """
2327
+ Dataset class for processing an AIMO dataset with reasoning questions.
2328
+ """
2329
+ SUPPORTED_DATASET_PATHS = {
2330
+ "AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5",
2331
+ "AI-MO/NuminaMath-CoT"
2332
+ }
2333
+
2334
+ def sample(self,
2335
+ tokenizer: PreTrainedTokenizerBase,
2336
+ num_requests: int,
2337
+ output_len: Optional[int] = None,
2338
+ request_id_prefix: str = "",
2339
+ no_oversample: bool = False,
2340
+ **kwargs) -> list:
2341
+ sampled_requests = []
2342
+ ind = 0
2343
+ dynamic_output = output_len is None
2344
+
2345
+ for item in self.data:
2346
+ if len(sampled_requests) >= num_requests:
2347
+ break
2348
+ prompt, completion = item['problem'], item["solution"]
2349
+
2350
+ prompt_ids = tokenizer(prompt).input_ids
2351
+ completion_ids = tokenizer(completion).input_ids
2352
+ prompt_len = len(prompt_ids)
2353
+ completion_len = len(completion_ids)
2354
+ output_len = completion_len if dynamic_output else output_len
2355
+ assert isinstance(output_len, int) and output_len > 0
2356
+ if dynamic_output and not is_valid_sequence(prompt_len,
2357
+ completion_len,
2358
+ max_prompt_len=2048,
2359
+ max_total_len=32000):
2360
+ continue
2361
+ sampled_requests.append(
2362
+ SampleRequest(
2363
+ prompt=prompt,
2364
+ prompt_len=prompt_len,
2365
+ expected_output_len=output_len,
2366
+ multi_modal_data=None,
2367
+ request_id=request_id_prefix + str(ind),
2368
+ ))
2369
+ ind += 1
2370
+ self.maybe_oversample_requests(sampled_requests, num_requests,
2371
+ request_id_prefix, no_oversample)
2372
+ return sampled_requests
2373
+
2374
+
2375
+ # -----------------------------------------------------------------------------
2376
+ # Next Edit Prediction Dataset Implementation
2377
+ # -----------------------------------------------------------------------------
2378
+
2379
+
2380
+ zeta_prompt = """### Instruction:
2381
+ You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.
2382
+
2383
+ ### User Edits:
2384
+
2385
+ {}
2386
+
2387
+ ### User Excerpt:
2388
+
2389
+ {}
2390
+
2391
+ ### Response:
2392
+
2393
+ """ # noqa: E501
2394
+
2395
+
2396
+ def _format_zeta_prompt(
2397
+ sample: dict,
2398
+ original_start_marker: str = "<|editable_region_start|>") -> dict:
2399
+ """Format the zeta prompt for the Next Edit Prediction (NEP) dataset.
2400
+
2401
+ This function formats examples from the NEP dataset
2402
+ into prompts and expected outputs. It could be
2403
+ further extended to support more NEP datasets.
2404
+
2405
+ Args:
2406
+ sample: The dataset sample containing events,
2407
+ inputs, and outputs.
2408
+ original_start_marker: The marker indicating the
2409
+ start of the editable region. Defaults to
2410
+ "<|editable_region_start|>".
2411
+
2412
+ Returns:
2413
+ A dictionary with the formatted prompts and expected outputs.
2414
+ """
2415
+ events = sample["events"]
2416
+ input = sample["input"]
2417
+ output = sample["output"]
2418
+ prompt = zeta_prompt.format(events, input)
2419
+
2420
+ # following the original implementation, extract the focused region
2421
+ # from the raw output
2422
+ output_start_index = output.find(original_start_marker)
2423
+ output_focused_region = output[output_start_index:]
2424
+ expected_output = output_focused_region
2425
+
2426
+ return {"prompt": prompt, "expected_output": expected_output}
2427
+
2428
+
2429
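A toy invocation of `_format_zeta_prompt`, assuming the function is in scope (e.g. this module's namespace); the fields mirror the `zed-industries/zeta` schema the function expects and the values are made up.

```python
sample = {
    "events": "User renamed `foo` to `bar`.",
    "input": "<|editable_region_start|>\ndef foo():\n    pass\n",
    "output": "<|editable_region_start|>\ndef bar():\n    pass\n",
}
formatted = _format_zeta_prompt(sample)
print(formatted["prompt"])
print(formatted["expected_output"])
```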
+ class NextEditPredictionDataset(HuggingFaceDataset):
2430
+ """
2431
+ Dataset class for processing a Next Edit Prediction dataset.
2432
+ """
2433
+
2434
+ SUPPORTED_DATASET_PATHS = {
2435
+ "zed-industries/zeta",
2436
+ }
2437
+ MAPPING_PROMPT_FUNCS = {
2438
+ "zed-industries/zeta": _format_zeta_prompt,
2439
+ }
2440
+
2441
+ def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int,
2442
+ request_id_prefix: str = "",
2443
+ no_oversample: bool = False,
2444
+ **kwargs):
2445
+ formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.hf_name)
2446
+ if formatting_prompt_func is None:
2447
+ raise ValueError(f"Unsupported dataset path: {self.hf_name}")
2448
+ samples = []
2449
+ for i, sample in enumerate(self.data):
2450
+ sample = formatting_prompt_func(sample)
2451
+ samples.append(
2452
+ SampleRequest(
2453
+ prompt=sample["prompt"],
2454
+ prompt_len=len(tokenizer(sample["prompt"]).input_ids),
2455
+ expected_output_len=len(
2456
+ tokenizer(sample["expected_output"]).input_ids),
2457
+ request_id=request_id_prefix + str(i),
2458
+ ))
2459
+ if len(samples) >= num_requests:
2460
+ break
2461
+ self.maybe_oversample_requests(samples,
2462
+ num_requests,
2463
+ request_id_prefix,
2464
+ no_oversample)
2465
+ return samples
2466
+
2467
+
2468
+ # -----------------------------------------------------------------------------
2469
+ # ASR Dataset Implementation
2470
+ # -----------------------------------------------------------------------------
2471
+
2472
+
2473
+ class ASRDataset(HuggingFaceDataset):
2474
+ """
2475
+ Dataset class for processing an ASR dataset for transcription.
2476
+ Tested on the following set:
2477
+
2478
+ +----------------+----------------------------------------+--------------------------+-----------------------------+
2479
+ | Dataset | Domain | Speaking Style | hf-subset |
2480
+ +----------------+----------------------------------------+--------------------------+-----------------------------+
2481
+ | TED-LIUM | TED talks | Oratory | release1, release2, release3|
2482
+ | | | | release3-speaker-adaptation |
2483
+ | VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... |
2484
+ | LibriSpeech | Audiobook | Narrated | "clean", "other" |
2485
+ | GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test |
2486
+ | SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test |
2487
+ | AMI | Meetings | Spontaneous | ihm, sdm |
2488
+ +----------------+----------------------------------------+--------------------------+-----------------------------+
2489
+
2490
+ """ # noqa: E501
2491
+
2492
+ SUPPORTED_DATASET_PATHS = {
2493
+ "openslr/librispeech_asr",
2494
+ "facebook/voxpopuli",
2495
+ "LIUM/tedlium",
2496
+ "edinburghcstr/ami",
2497
+ "speechcolab/gigaspeech",
2498
+ "kensho/spgispeech",
2499
+ }
2500
+
2501
+ DEFAULT_OUTPUT_LEN = 128
2502
+ IS_MULTIMODAL = True
2503
+
2504
+ # TODO Whisper-specific. Abstract interface when more models are supported.
2505
+ TRANSCRIPTION_PREAMBLE = (
2506
+ "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>")
2507
+ skip_long_audios: bool = True
2508
+
2509
+ def sample(
2510
+ self,
2511
+ tokenizer: PreTrainedTokenizerBase,
2512
+ num_requests: int,
2513
+ output_len: Optional[int] = None,
2514
+ request_id_prefix: str = "",
2515
+ no_oversample: bool = False,
2516
+ **kwargs,
2517
+ ) -> list:
2518
+ output_len = (output_len
2519
+ if output_len is not None else self.DEFAULT_OUTPUT_LEN)
2520
+ prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
2521
+ prompt_len = len(tokenizer(prompt).input_ids)
2522
+ sampled_requests = []
2523
+ ind = 0
2524
+ skipped = 0
2525
+ for item in self.data:
2526
+ if len(sampled_requests) >= num_requests:
2527
+ break
2528
+ audio = item["audio"]
2529
+ y, sr = audio["array"], audio["sampling_rate"]
2530
+ duration_s = librosa.get_duration(y=y, sr=sr)
2531
+ # Whisper max supported duration
2532
+ if self.skip_long_audios and duration_s > 30:
2533
+ skipped += 1
2534
+ continue
2535
+
2536
+ mm_content = {"audio": (y, sr)}
2537
+ sampled_requests.append(
2538
+ SampleRequest(
2539
+ prompt=prompt,
2540
+ prompt_len=prompt_len,
2541
+ expected_output_len=output_len,
2542
+ multi_modal_data=mm_content,
2543
+ request_id=request_id_prefix + str(ind),
2544
+ ))
2545
+ ind += 1
2546
+ if skipped:
2547
+ logger.warning(
2548
+ "%d samples discarded from dataset due to"
2549
+ " their length being greater than"
2550
+ " what Whisper supports.",
2551
+ skipped,
2552
+ )
2553
+ self.maybe_oversample_requests(sampled_requests, num_requests,
2554
+ request_id_prefix, no_oversample)
2555
+ return sampled_requests
2556
+
2557
+
2558
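The 30-second Whisper cut-off above hinges on `librosa.get_duration`; a synthetic one-second sine wave (no dataset download needed) shows the call shape.

```python
import numpy as np
import librosa

sr = 16000
y = np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr).astype(np.float32)
print(librosa.get_duration(y=y, sr=sr))  # ~1.0 second
```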
+ # -----------------------------------------------------------------------------
2559
+ # MLPerf Dataset Implementation
2560
+ # -----------------------------------------------------------------------------
2561
+
2562
+
2563
+ class MLPerfDataset(HuggingFaceDataset):
2564
+ """
2565
+ MLPerf Inference Dataset.
2566
+
2567
+ Dataset on HF:
2568
+ https://huggingface.co/datasets/mgoin/mlperf-inference-llama2-data
2569
+ https://huggingface.co/datasets/mgoin/mlperf-inference-llama3.1-data
2570
+
2571
+ Each record contains:
2572
+ - "system_prompt": system role instruction.
2573
+ - "question": user question.
2574
+ - "output": reference answer.
2575
+
2576
+ We combine the system prompt and question into a chat-formatted prompt
2577
+ (using the tokenizer's chat template) and set the expected output length to
2578
+ the tokenized length of the provided reference answer.
2579
+ """
2580
+
2581
+ SUPPORTED_DATASET_PATHS = {
2582
+ "mgoin/mlperf-inference-llama2-data",
2583
+ "mgoin/mlperf-inference-llama3.1-data",
2584
+ }
2585
+
2586
+ def sample(
2587
+ self,
2588
+ tokenizer: PreTrainedTokenizerBase,
2589
+ num_requests: int,
2590
+ output_len: Optional[int] = None,
2591
+ request_id_prefix: str = "",
2592
+ no_oversample: bool = False,
2593
+ **kwargs,
2594
+ ) -> list[SampleRequest]:
2595
+ # Force dynamic output length based on reference completion.
2596
+ dynamic_output = output_len is None
2597
+ sampled_requests: list[SampleRequest] = []
2598
+ ind = 0
2599
+
2600
+ for item in self.data:
2601
+ if len(sampled_requests) >= num_requests:
2602
+ break
2603
+
2604
+ system_prompt = item["system_prompt"]
2605
+ question = item["question"]
2606
+ reference_answer = item["output"]
2607
+
2608
+ # Build chat-style prompt using tokenizer template, if available.
2609
+ messages = [
2610
+ {"role": "system", "content": system_prompt},
2611
+ {"role": "user", "content": question},
2612
+ ]
2613
+ prompt_formatted = tokenizer.apply_chat_template(
2614
+ messages, add_generation_prompt=True, tokenize=False
2615
+ )
2616
+ prompt_len = len(tokenizer(prompt_formatted).input_ids)
2617
+
2618
+ # Determine output length from reference answer tokens.
2619
+ ref_out_len = len(
2620
+ tokenizer(reference_answer, add_special_tokens=False).input_ids
2621
+ )
2622
+ expected_output_len = ref_out_len if dynamic_output else output_len
2623
+
2624
+ # Validate sequence lengths.
2625
+ if not is_valid_sequence(prompt_len, expected_output_len):
2626
+ continue
2627
+
2628
+ sampled_requests.append(
2629
+ SampleRequest(
2630
+ prompt=prompt_formatted,
2631
+ prompt_len=prompt_len,
2632
+ expected_output_len=expected_output_len,
2633
+ request_id=request_id_prefix + str(ind),
2634
+ )
2635
+ )
2636
+ ind += 1
2637
+
2638
+ self.maybe_oversample_requests(sampled_requests, num_requests,
2639
+ request_id_prefix, no_oversample)
2640
+ return sampled_requests
2641
+
2642
+
2643
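The prompt construction above is just a system + user pair pushed through the tokenizer's chat template; the standalone sketch below shows the same call (the model choice is an assumption).

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Summarize Hamlet in one sentence."},
]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False)
print(len(tokenizer(prompt).input_ids))  # the prompt_len used above
```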
+ # -----------------------------------------------------------------------------
2644
+ # Prefix Repetition Dataset Implementation
2645
+ # -----------------------------------------------------------------------------
2646
+
2647
+
2648
+ class PrefixRepetitionRandomDataset(BenchmarkDataset):
2649
+ # Default values copied from benchmark_serving.py for the repeated prefix
2650
+ # dataset.
2651
+ DEFAULT_PREFIX_LEN = 256
2652
+ DEFAULT_SUFFIX_LEN = 256
2653
+ DEFAULT_NUM_PREFIXES = 10
2654
+ DEFAULT_OUTPUT_LEN = 128
2655
+
2656
+ def __init__(
2657
+ self,
2658
+ **kwargs,
2659
+ ) -> None:
2660
+ super().__init__(**kwargs)
2661
+ random.seed(self.random_seed)
2662
+ np.random.seed(self.random_seed)
2663
+
2664
+ def sample(
2665
+ self,
2666
+ tokenizer: PreTrainedTokenizerBase,
2667
+ num_requests: int,
2668
+ prefix_len: int = DEFAULT_PREFIX_LEN,
2669
+ suffix_len: int = DEFAULT_SUFFIX_LEN,
2670
+ num_prefixes: int = DEFAULT_NUM_PREFIXES,
2671
+ output_len: int = DEFAULT_OUTPUT_LEN,
2672
+ request_id_prefix: str = "",
2673
+ no_oversample: bool = False,
2674
+ **kwargs,
2675
+ ) -> list[SampleRequest]:
2676
+ vocab_size = tokenizer.vocab_size
2677
+ prompts_per_prefix = num_requests // num_prefixes
2678
+ if prompts_per_prefix == 0:
2679
+ raise ValueError(
2680
+ f"num_requests ({num_requests}) must be greater than or equal "
2681
+ f"to num_prefixes ({num_prefixes})"
2682
+ )
2683
+
2684
+ def _generate_exact_length_tokens(target_length: int) -> list[int]:
2685
+ """Generate tokens that decode and re-encode to exactly
2686
+ target_length."""
2687
+ # Generate random tokens
2688
+ tokens = np.random.randint(
2689
+ 0, vocab_size, size=target_length).tolist()
2690
+ text = tokenizer.decode(tokens)
2691
+ re_encoded = tokenizer.encode(text, add_special_tokens=False)
2692
+
2693
+ if len(re_encoded) == target_length:
2694
+ return re_encoded
2695
+ elif len(re_encoded) < target_length:
2696
+ # Recursively generate additional consistent tokens
2697
+ needed = target_length - len(re_encoded)
2698
+ extra_tokens = _generate_exact_length_tokens(needed)
2699
+ return re_encoded + extra_tokens
2700
+ else:
2701
+ # Truncate to target length
2702
+ return re_encoded[:target_length]
2703
+
2704
+ requests = []
2705
+ for _ in range(num_prefixes):
2706
+ prefix_tokens = _generate_exact_length_tokens(prefix_len)
2707
+
2708
+ for _ in range(prompts_per_prefix):
2709
+ suffix_tokens = _generate_exact_length_tokens(suffix_len)
2710
+
2711
+ combined_tokens = prefix_tokens + suffix_tokens
2712
+ prompt = tokenizer.decode(combined_tokens)
2713
+ prompt_len = len(combined_tokens)
2714
+ requests.append(
2715
+ SampleRequest(
2716
+ prompt=prompt,
2717
+ prompt_len=prompt_len,
2718
+ expected_output_len=output_len,
2719
+ )
2720
+ )
2721
+
2722
+ random.shuffle(requests)
2723
+ return requests
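Finally, a usage sketch for `PrefixRepetitionRandomDataset` (illustrative only; `gpt2` and the import path are assumptions): 20 requests sharing 4 distinct 64-token prefixes, e.g. for prefix-caching benchmarks.

```python
from transformers import AutoTokenizer

from vllm.benchmarks.datasets import PrefixRepetitionRandomDataset  # assumed path

tokenizer = AutoTokenizer.from_pretrained("gpt2")
requests = PrefixRepetitionRandomDataset(random_seed=0, dataset_path=None).sample(
    tokenizer=tokenizer,
    num_requests=20,
    prefix_len=64,
    suffix_len=32,
    num_prefixes=4,
    output_len=16,
)
print(len(requests), requests[0].prompt_len)  # 20 requests, prompt_len 64 + 32 = 96
```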