vllm-cpu-avx512vnni 0.10.2.post2__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of vllm-cpu-avx512vnni might be problematic. Click here for more details.

Files changed (1395)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +220 -0
  3. vllm/_bc_linter.py +59 -0
  4. vllm/_custom_ops.py +2022 -0
  5. vllm/_ipex_ops.py +404 -0
  6. vllm/_version.py +34 -0
  7. vllm/adapter_commons/__init__.py +0 -0
  8. vllm/adapter_commons/layers.py +16 -0
  9. vllm/adapter_commons/models.py +106 -0
  10. vllm/adapter_commons/request.py +26 -0
  11. vllm/adapter_commons/utils.py +93 -0
  12. vllm/adapter_commons/worker_manager.py +39 -0
  13. vllm/assets/__init__.py +0 -0
  14. vllm/assets/audio.py +45 -0
  15. vllm/assets/base.py +41 -0
  16. vllm/assets/image.py +50 -0
  17. vllm/assets/video.py +138 -0
  18. vllm/attention/__init__.py +19 -0
  19. vllm/attention/backends/__init__.py +0 -0
  20. vllm/attention/backends/abstract.py +348 -0
  21. vllm/attention/backends/differential_flash_attn.py +935 -0
  22. vllm/attention/backends/dual_chunk_flash_attn.py +1499 -0
  23. vllm/attention/backends/flash_attn.py +933 -0
  24. vllm/attention/backends/flashmla.py +238 -0
  25. vllm/attention/backends/mla/__init__.py +0 -0
  26. vllm/attention/backends/mla/common.py +1310 -0
  27. vllm/attention/backends/placeholder_attn.py +340 -0
  28. vllm/attention/backends/rocm_aiter_mla.py +410 -0
  29. vllm/attention/backends/rocm_flash_attn.py +953 -0
  30. vllm/attention/backends/triton_mla.py +111 -0
  31. vllm/attention/backends/utils.py +610 -0
  32. vllm/attention/backends/xformers.py +805 -0
  33. vllm/attention/layer.py +552 -0
  34. vllm/attention/layers/__init__.py +0 -0
  35. vllm/attention/layers/chunked_local_attention.py +91 -0
  36. vllm/attention/layers/cross_attention.py +159 -0
  37. vllm/attention/layers/encoder_only_attention.py +86 -0
  38. vllm/attention/ops/__init__.py +0 -0
  39. vllm/attention/ops/chunked_prefill_paged_decode.py +405 -0
  40. vllm/attention/ops/common.py +139 -0
  41. vllm/attention/ops/flashmla.py +123 -0
  42. vllm/attention/ops/merge_attn_states.py +43 -0
  43. vllm/attention/ops/paged_attn.py +261 -0
  44. vllm/attention/ops/pallas_kv_cache_update.py +124 -0
  45. vllm/attention/ops/prefix_prefill.py +928 -0
  46. vllm/attention/ops/rocm_aiter_mla.py +104 -0
  47. vllm/attention/ops/rocm_aiter_paged_attn.py +102 -0
  48. vllm/attention/ops/triton_decode_attention.py +676 -0
  49. vllm/attention/ops/triton_flash_attention.py +984 -0
  50. vllm/attention/ops/triton_merge_attn_states.py +97 -0
  51. vllm/attention/ops/triton_unified_attention.py +854 -0
  52. vllm/attention/selector.py +243 -0
  53. vllm/attention/utils/__init__.py +0 -0
  54. vllm/attention/utils/fa_utils.py +85 -0
  55. vllm/attention/utils/kv_sharing_utils.py +33 -0
  56. vllm/beam_search.py +87 -0
  57. vllm/benchmarks/__init__.py +0 -0
  58. vllm/benchmarks/datasets.py +2651 -0
  59. vllm/benchmarks/latency.py +170 -0
  60. vllm/benchmarks/lib/__init__.py +3 -0
  61. vllm/benchmarks/lib/endpoint_request_func.py +510 -0
  62. vllm/benchmarks/lib/ready_checker.py +72 -0
  63. vllm/benchmarks/lib/utils.py +80 -0
  64. vllm/benchmarks/serve.py +1247 -0
  65. vllm/benchmarks/throughput.py +696 -0
  66. vllm/collect_env.py +823 -0
  67. vllm/compilation/__init__.py +0 -0
  68. vllm/compilation/activation_quant_fusion.py +193 -0
  69. vllm/compilation/backends.py +641 -0
  70. vllm/compilation/base_static_graph.py +51 -0
  71. vllm/compilation/collective_fusion.py +1190 -0
  72. vllm/compilation/compiler_interface.py +572 -0
  73. vllm/compilation/counter.py +47 -0
  74. vllm/compilation/cuda_graph.py +193 -0
  75. vllm/compilation/cuda_piecewise_backend.py +117 -0
  76. vllm/compilation/decorators.py +316 -0
  77. vllm/compilation/fix_functionalization.py +208 -0
  78. vllm/compilation/fusion.py +600 -0
  79. vllm/compilation/fusion_attn.py +303 -0
  80. vllm/compilation/fx_utils.py +84 -0
  81. vllm/compilation/inductor_pass.py +136 -0
  82. vllm/compilation/monitor.py +57 -0
  83. vllm/compilation/multi_output_match.py +109 -0
  84. vllm/compilation/noop_elimination.py +165 -0
  85. vllm/compilation/pass_manager.py +88 -0
  86. vllm/compilation/sequence_parallelism.py +484 -0
  87. vllm/compilation/torch25_custom_graph_pass.py +42 -0
  88. vllm/compilation/vllm_inductor_pass.py +50 -0
  89. vllm/compilation/wrapper.py +138 -0
  90. vllm/config/__init__.py +3921 -0
  91. vllm/config/cache.py +214 -0
  92. vllm/config/compilation.py +580 -0
  93. vllm/config/kv_events.py +50 -0
  94. vllm/config/kv_transfer.py +111 -0
  95. vllm/config/load.py +113 -0
  96. vllm/config/lora.py +132 -0
  97. vllm/config/parallel.py +446 -0
  98. vllm/config/scheduler.py +304 -0
  99. vllm/config/utils.py +29 -0
  100. vllm/connections.py +174 -0
  101. vllm/core/__init__.py +0 -0
  102. vllm/core/block/__init__.py +0 -0
  103. vllm/core/block/block_table.py +399 -0
  104. vllm/core/block/common.py +371 -0
  105. vllm/core/block/cpu_gpu_block_allocator.py +439 -0
  106. vllm/core/block/interfaces.py +319 -0
  107. vllm/core/block/naive_block.py +466 -0
  108. vllm/core/block/prefix_caching_block.py +1135 -0
  109. vllm/core/block/utils.py +28 -0
  110. vllm/core/block_manager.py +523 -0
  111. vllm/core/evictor.py +157 -0
  112. vllm/core/interfaces.py +139 -0
  113. vllm/core/placeholder_block_space_manager.py +103 -0
  114. vllm/core/scheduler.py +2028 -0
  115. vllm/device_allocator/__init__.py +0 -0
  116. vllm/device_allocator/cumem.py +286 -0
  117. vllm/distributed/__init__.py +6 -0
  118. vllm/distributed/communication_op.py +41 -0
  119. vllm/distributed/device_communicators/__init__.py +0 -0
  120. vllm/distributed/device_communicators/all2all.py +259 -0
  121. vllm/distributed/device_communicators/all_reduce_utils.py +292 -0
  122. vllm/distributed/device_communicators/base_device_communicator.py +277 -0
  123. vllm/distributed/device_communicators/cpu_communicator.py +201 -0
  124. vllm/distributed/device_communicators/cuda_communicator.py +294 -0
  125. vllm/distributed/device_communicators/cuda_wrapper.py +180 -0
  126. vllm/distributed/device_communicators/custom_all_reduce.py +311 -0
  127. vllm/distributed/device_communicators/pynccl.py +290 -0
  128. vllm/distributed/device_communicators/pynccl_wrapper.py +382 -0
  129. vllm/distributed/device_communicators/quick_all_reduce.py +278 -0
  130. vllm/distributed/device_communicators/ray_communicator.py +258 -0
  131. vllm/distributed/device_communicators/shm_broadcast.py +585 -0
  132. vllm/distributed/device_communicators/symm_mem.py +136 -0
  133. vllm/distributed/device_communicators/tpu_communicator.py +102 -0
  134. vllm/distributed/device_communicators/xpu_communicator.py +69 -0
  135. vllm/distributed/eplb/__init__.py +8 -0
  136. vllm/distributed/eplb/eplb_state.py +619 -0
  137. vllm/distributed/eplb/rebalance_algo.py +234 -0
  138. vllm/distributed/eplb/rebalance_execute.py +424 -0
  139. vllm/distributed/kv_events.py +362 -0
  140. vllm/distributed/kv_transfer/README.md +29 -0
  141. vllm/distributed/kv_transfer/__init__.py +13 -0
  142. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  143. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  144. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  145. vllm/distributed/kv_transfer/kv_connector/factory.py +108 -0
  146. vllm/distributed/kv_transfer/kv_connector/utils.py +246 -0
  147. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +6 -0
  148. vllm/distributed/kv_transfer/kv_connector/v1/base.py +356 -0
  149. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +167 -0
  150. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +266 -0
  151. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +1319 -0
  152. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  153. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +484 -0
  154. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +542 -0
  155. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +266 -0
  156. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +414 -0
  157. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  158. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +175 -0
  159. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +161 -0
  160. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +237 -0
  161. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  162. vllm/distributed/kv_transfer/kv_pipe/base.py +67 -0
  163. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +290 -0
  164. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +280 -0
  165. vllm/distributed/kv_transfer/kv_transfer_state.py +73 -0
  166. vllm/distributed/parallel_state.py +1489 -0
  167. vllm/distributed/tpu_distributed_utils.py +178 -0
  168. vllm/distributed/utils.py +536 -0
  169. vllm/engine/__init__.py +0 -0
  170. vllm/engine/arg_utils.py +1857 -0
  171. vllm/engine/async_llm_engine.py +1044 -0
  172. vllm/engine/async_timeout.py +173 -0
  173. vllm/engine/llm_engine.py +1849 -0
  174. vllm/engine/metrics.py +577 -0
  175. vllm/engine/metrics_types.py +84 -0
  176. vllm/engine/multiprocessing/__init__.py +145 -0
  177. vllm/engine/multiprocessing/client.py +643 -0
  178. vllm/engine/multiprocessing/engine.py +470 -0
  179. vllm/engine/output_processor/__init__.py +0 -0
  180. vllm/engine/output_processor/interfaces.py +61 -0
  181. vllm/engine/output_processor/single_step.py +145 -0
  182. vllm/engine/output_processor/stop_checker.py +131 -0
  183. vllm/engine/output_processor/util.py +28 -0
  184. vllm/engine/protocol.py +343 -0
  185. vllm/entrypoints/__init__.py +0 -0
  186. vllm/entrypoints/api_server.py +178 -0
  187. vllm/entrypoints/chat_utils.py +1535 -0
  188. vllm/entrypoints/cli/__init__.py +12 -0
  189. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  190. vllm/entrypoints/cli/benchmark/base.py +25 -0
  191. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  192. vllm/entrypoints/cli/benchmark/main.py +58 -0
  193. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  194. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  195. vllm/entrypoints/cli/collect_env.py +36 -0
  196. vllm/entrypoints/cli/main.py +60 -0
  197. vllm/entrypoints/cli/openai.py +214 -0
  198. vllm/entrypoints/cli/run_batch.py +69 -0
  199. vllm/entrypoints/cli/serve.py +232 -0
  200. vllm/entrypoints/cli/types.py +29 -0
  201. vllm/entrypoints/constants.py +10 -0
  202. vllm/entrypoints/context.py +444 -0
  203. vllm/entrypoints/harmony_utils.py +431 -0
  204. vllm/entrypoints/launcher.py +168 -0
  205. vllm/entrypoints/llm.py +1579 -0
  206. vllm/entrypoints/logger.py +79 -0
  207. vllm/entrypoints/openai/__init__.py +0 -0
  208. vllm/entrypoints/openai/api_server.py +2011 -0
  209. vllm/entrypoints/openai/cli_args.py +281 -0
  210. vllm/entrypoints/openai/logits_processors.py +90 -0
  211. vllm/entrypoints/openai/protocol.py +2590 -0
  212. vllm/entrypoints/openai/run_batch.py +497 -0
  213. vllm/entrypoints/openai/serving_chat.py +1591 -0
  214. vllm/entrypoints/openai/serving_classification.py +176 -0
  215. vllm/entrypoints/openai/serving_completion.py +688 -0
  216. vllm/entrypoints/openai/serving_embedding.py +632 -0
  217. vllm/entrypoints/openai/serving_engine.py +996 -0
  218. vllm/entrypoints/openai/serving_models.py +288 -0
  219. vllm/entrypoints/openai/serving_pooling.py +277 -0
  220. vllm/entrypoints/openai/serving_responses.py +1690 -0
  221. vllm/entrypoints/openai/serving_score.py +479 -0
  222. vllm/entrypoints/openai/serving_tokenization.py +196 -0
  223. vllm/entrypoints/openai/serving_transcription.py +136 -0
  224. vllm/entrypoints/openai/speech_to_text.py +388 -0
  225. vllm/entrypoints/openai/tool_parsers/__init__.py +51 -0
  226. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +164 -0
  227. vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py +367 -0
  228. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +370 -0
  229. vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +185 -0
  230. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +259 -0
  231. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +237 -0
  232. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +418 -0
  233. vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py +372 -0
  234. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +216 -0
  235. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +308 -0
  236. vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +377 -0
  237. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +316 -0
  238. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +269 -0
  239. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +816 -0
  240. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +369 -0
  241. vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +73 -0
  242. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +112 -0
  243. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +308 -0
  244. vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +707 -0
  245. vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +679 -0
  246. vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py +296 -0
  247. vllm/entrypoints/openai/tool_parsers/utils.py +124 -0
  248. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +524 -0
  249. vllm/entrypoints/renderer.py +395 -0
  250. vllm/entrypoints/score_utils.py +232 -0
  251. vllm/entrypoints/ssl.py +75 -0
  252. vllm/entrypoints/tool.py +139 -0
  253. vllm/entrypoints/tool_server.py +195 -0
  254. vllm/entrypoints/utils.py +328 -0
  255. vllm/env_override.py +23 -0
  256. vllm/envs.py +1354 -0
  257. vllm/executor/__init__.py +0 -0
  258. vllm/executor/executor_base.py +378 -0
  259. vllm/executor/mp_distributed_executor.py +244 -0
  260. vllm/executor/msgspec_utils.py +35 -0
  261. vllm/executor/multiproc_worker_utils.py +279 -0
  262. vllm/executor/ray_distributed_executor.py +699 -0
  263. vllm/executor/ray_utils.py +410 -0
  264. vllm/executor/uniproc_executor.py +152 -0
  265. vllm/forward_context.py +273 -0
  266. vllm/inputs/__init__.py +44 -0
  267. vllm/inputs/data.py +356 -0
  268. vllm/inputs/parse.py +151 -0
  269. vllm/inputs/preprocess.py +973 -0
  270. vllm/inputs/registry.py +251 -0
  271. vllm/logger.py +229 -0
  272. vllm/logging_utils/__init__.py +8 -0
  273. vllm/logging_utils/dump_input.py +81 -0
  274. vllm/logging_utils/formatter.py +79 -0
  275. vllm/logits_process.py +119 -0
  276. vllm/logprobs.py +28 -0
  277. vllm/lora/__init__.py +0 -0
  278. vllm/lora/layers/__init__.py +34 -0
  279. vllm/lora/layers/base.py +69 -0
  280. vllm/lora/layers/base_linear.py +184 -0
  281. vllm/lora/layers/column_parallel_linear.py +622 -0
  282. vllm/lora/layers/logits_processor.py +247 -0
  283. vllm/lora/layers/qkv_x_parallel_linear.py +8 -0
  284. vllm/lora/layers/replicated_linear.py +61 -0
  285. vllm/lora/layers/row_parallel_linear.py +201 -0
  286. vllm/lora/layers/utils.py +60 -0
  287. vllm/lora/layers/vocal_parallel_embedding.py +172 -0
  288. vllm/lora/lora.py +199 -0
  289. vllm/lora/models.py +792 -0
  290. vllm/lora/ops/__init__.py +0 -0
  291. vllm/lora/ops/ipex_ops/__init__.py +7 -0
  292. vllm/lora/ops/ipex_ops/lora_ops.py +44 -0
  293. vllm/lora/ops/torch_ops/__init__.py +16 -0
  294. vllm/lora/ops/torch_ops/lora_ops.py +119 -0
  295. vllm/lora/ops/triton_ops/__init__.py +12 -0
  296. vllm/lora/ops/triton_ops/kernel_utils.py +243 -0
  297. vllm/lora/ops/triton_ops/lora_expand_op.py +291 -0
  298. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +148 -0
  299. vllm/lora/ops/triton_ops/lora_shrink_op.py +245 -0
  300. vllm/lora/ops/triton_ops/utils.py +126 -0
  301. vllm/lora/ops/xla_ops/__init__.py +7 -0
  302. vllm/lora/ops/xla_ops/lora_ops.py +145 -0
  303. vllm/lora/peft_helper.py +127 -0
  304. vllm/lora/punica_wrapper/__init__.py +10 -0
  305. vllm/lora/punica_wrapper/punica_base.py +458 -0
  306. vllm/lora/punica_wrapper/punica_cpu.py +349 -0
  307. vllm/lora/punica_wrapper/punica_gpu.py +279 -0
  308. vllm/lora/punica_wrapper/punica_selector.py +20 -0
  309. vllm/lora/punica_wrapper/punica_tpu.py +391 -0
  310. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  311. vllm/lora/punica_wrapper/utils.py +136 -0
  312. vllm/lora/request.py +99 -0
  313. vllm/lora/resolver.py +85 -0
  314. vllm/lora/utils.py +246 -0
  315. vllm/lora/worker_manager.py +256 -0
  316. vllm/model_executor/__init__.py +16 -0
  317. vllm/model_executor/custom_op.py +194 -0
  318. vllm/model_executor/layers/__init__.py +0 -0
  319. vllm/model_executor/layers/activation.py +575 -0
  320. vllm/model_executor/layers/attention_layer_base.py +23 -0
  321. vllm/model_executor/layers/fla/__init__.py +8 -0
  322. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  323. vllm/model_executor/layers/fla/ops/chunk.py +225 -0
  324. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +290 -0
  325. vllm/model_executor/layers/fla/ops/chunk_o.py +177 -0
  326. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +140 -0
  327. vllm/model_executor/layers/fla/ops/cumsum.py +226 -0
  328. vllm/model_executor/layers/fla/ops/fused_recurrent.py +366 -0
  329. vllm/model_executor/layers/fla/ops/index.py +39 -0
  330. vllm/model_executor/layers/fla/ops/l2norm.py +143 -0
  331. vllm/model_executor/layers/fla/ops/layernorm_guard.py +337 -0
  332. vllm/model_executor/layers/fla/ops/op.py +39 -0
  333. vllm/model_executor/layers/fla/ops/solve_tril.py +365 -0
  334. vllm/model_executor/layers/fla/ops/utils.py +180 -0
  335. vllm/model_executor/layers/fla/ops/wy_fast.py +114 -0
  336. vllm/model_executor/layers/fused_moe/__init__.py +80 -0
  337. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +304 -0
  338. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +164 -0
  339. vllm/model_executor/layers/fused_moe/config.py +497 -0
  340. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  341. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  342. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  343. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  344. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  345. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
  346. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  347. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  348. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  349. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  350. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  351. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  352. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  559. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  560. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +297 -0
  561. vllm/model_executor/layers/fused_moe/cutlass_moe.py +996 -0
  562. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +370 -0
  563. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +413 -0
  564. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +280 -0
  565. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +229 -0
  566. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +243 -0
  567. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +97 -0
  568. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1042 -0
  569. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +240 -0
  570. vllm/model_executor/layers/fused_moe/fused_moe.py +2081 -0
  571. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +247 -0
  572. vllm/model_executor/layers/fused_moe/layer.py +1951 -0
  573. vllm/model_executor/layers/fused_moe/modular_kernel.py +892 -0
  574. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +87 -0
  575. vllm/model_executor/layers/fused_moe/moe_pallas.py +80 -0
  576. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +205 -0
  577. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  578. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +321 -0
  579. vllm/model_executor/layers/fused_moe/prepare_finalize.py +72 -0
  580. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +431 -0
  581. vllm/model_executor/layers/fused_moe/routing_simulator.py +291 -0
  582. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +146 -0
  583. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +171 -0
  584. vllm/model_executor/layers/fused_moe/trtllm_moe.py +197 -0
  585. vllm/model_executor/layers/fused_moe/utils.py +270 -0
  586. vllm/model_executor/layers/layernorm.py +381 -0
  587. vllm/model_executor/layers/lightning_attn.py +661 -0
  588. vllm/model_executor/layers/linear.py +1567 -0
  589. vllm/model_executor/layers/logits_processor.py +199 -0
  590. vllm/model_executor/layers/mamba/__init__.py +0 -0
  591. vllm/model_executor/layers/mamba/abstract.py +45 -0
  592. vllm/model_executor/layers/mamba/linear_attn.py +432 -0
  593. vllm/model_executor/layers/mamba/mamba2_metadata.py +186 -0
  594. vllm/model_executor/layers/mamba/mamba_mixer.py +517 -0
  595. vllm/model_executor/layers/mamba/mamba_mixer2.py +803 -0
  596. vllm/model_executor/layers/mamba/mamba_utils.py +202 -0
  597. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  598. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +982 -0
  599. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +168 -0
  600. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +414 -0
  601. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +262 -0
  602. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +574 -0
  603. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +751 -0
  604. vllm/model_executor/layers/mamba/ops/ssd_combined.py +248 -0
  605. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +248 -0
  606. vllm/model_executor/layers/mamba/short_conv.py +270 -0
  607. vllm/model_executor/layers/mla.py +158 -0
  608. vllm/model_executor/layers/pooler.py +732 -0
  609. vllm/model_executor/layers/quantization/__init__.py +157 -0
  610. vllm/model_executor/layers/quantization/auto_round.py +388 -0
  611. vllm/model_executor/layers/quantization/awq.py +228 -0
  612. vllm/model_executor/layers/quantization/awq_marlin.py +548 -0
  613. vllm/model_executor/layers/quantization/awq_triton.py +320 -0
  614. vllm/model_executor/layers/quantization/base_config.py +164 -0
  615. vllm/model_executor/layers/quantization/bitblas.py +464 -0
  616. vllm/model_executor/layers/quantization/bitsandbytes.py +621 -0
  617. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +0 -0
  618. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +795 -0
  619. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +1651 -0
  620. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +27 -0
  621. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +366 -0
  622. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  623. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +160 -0
  624. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +105 -0
  625. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +161 -0
  626. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +169 -0
  627. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +135 -0
  628. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +121 -0
  629. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +156 -0
  630. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +111 -0
  631. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +201 -0
  632. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +227 -0
  633. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +135 -0
  634. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +21 -0
  635. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  636. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +206 -0
  637. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  638. vllm/model_executor/layers/quantization/deepgemm.py +81 -0
  639. vllm/model_executor/layers/quantization/deepspeedfp.py +196 -0
  640. vllm/model_executor/layers/quantization/experts_int8.py +215 -0
  641. vllm/model_executor/layers/quantization/fbgemm_fp8.py +172 -0
  642. vllm/model_executor/layers/quantization/fp8.py +1179 -0
  643. vllm/model_executor/layers/quantization/gguf.py +597 -0
  644. vllm/model_executor/layers/quantization/gptq.py +300 -0
  645. vllm/model_executor/layers/quantization/gptq_bitblas.py +448 -0
  646. vllm/model_executor/layers/quantization/gptq_marlin.py +700 -0
  647. vllm/model_executor/layers/quantization/gptq_marlin_24.py +297 -0
  648. vllm/model_executor/layers/quantization/hqq_marlin.py +333 -0
  649. vllm/model_executor/layers/quantization/inc.py +61 -0
  650. vllm/model_executor/layers/quantization/input_quant_fp8.py +103 -0
  651. vllm/model_executor/layers/quantization/ipex_quant.py +410 -0
  652. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  653. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +91 -0
  654. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +93 -0
  655. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +116 -0
  656. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +302 -0
  657. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +92 -0
  658. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +117 -0
  659. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +92 -0
  660. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +143 -0
  661. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +144 -0
  662. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +139 -0
  663. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +67 -0
  664. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +89 -0
  665. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +163 -0
  666. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +206 -0
  667. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +137 -0
  668. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +41 -0
  669. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +104 -0
  670. vllm/model_executor/layers/quantization/kv_cache.py +139 -0
  671. vllm/model_executor/layers/quantization/modelopt.py +1548 -0
  672. vllm/model_executor/layers/quantization/moe_wna16.py +473 -0
  673. vllm/model_executor/layers/quantization/mxfp4.py +951 -0
  674. vllm/model_executor/layers/quantization/petit.py +306 -0
  675. vllm/model_executor/layers/quantization/ptpc_fp8.py +129 -0
  676. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  677. vllm/model_executor/layers/quantization/quark/quark.py +431 -0
  678. vllm/model_executor/layers/quantization/quark/quark_moe.py +434 -0
  679. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  680. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  681. vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +112 -0
  682. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +163 -0
  683. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +122 -0
  684. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  685. vllm/model_executor/layers/quantization/rtn.py +456 -0
  686. vllm/model_executor/layers/quantization/schema.py +86 -0
  687. vllm/model_executor/layers/quantization/torchao.py +214 -0
  688. vllm/model_executor/layers/quantization/tpu_int8.py +125 -0
  689. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  690. vllm/model_executor/layers/quantization/utils/allspark_utils.py +52 -0
  691. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +210 -0
  692. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  693. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  694. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  695. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  696. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  697. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  698. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  699. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  700. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  701. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  702. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  703. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  704. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  705. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  706. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  707. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  708. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  709. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  710. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  711. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  712. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  713. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  714. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  715. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  716. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  717. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  718. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  719. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  720. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  721. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  722. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  723. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  724. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  725. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  726. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  727. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  728. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  729. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  730. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  731. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  732. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  733. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  734. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  735. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  736. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  737. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  738. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  739. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  740. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  741. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  742. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  743. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  744. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  745. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  746. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  747. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  748. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  749. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  750. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  751. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  752. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  753. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  754. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  755. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  756. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  757. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  758. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  759. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  760. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  761. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  762. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  763. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  764. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  765. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  766. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  767. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  768. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  769. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  770. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  771. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  772. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  773. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  774. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  775. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  776. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  777. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  778. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  779. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  780. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  781. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  786. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  902. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  903. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +85 -0
  904. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +258 -0
  905. vllm/model_executor/layers/quantization/utils/fp8_utils.py +795 -0
  906. vllm/model_executor/layers/quantization/utils/gptq_utils.py +96 -0
  907. vllm/model_executor/layers/quantization/utils/int8_utils.py +492 -0
  908. vllm/model_executor/layers/quantization/utils/layer_utils.py +40 -0
  909. vllm/model_executor/layers/quantization/utils/machete_utils.py +50 -0
  910. vllm/model_executor/layers/quantization/utils/marlin_utils.py +479 -0
  911. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +396 -0
  912. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +345 -0
  913. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +165 -0
  914. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +464 -0
  915. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +132 -0
  916. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +20 -0
  917. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +137 -0
  918. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +59 -0
  919. vllm/model_executor/layers/quantization/utils/petit_utils.py +122 -0
  920. vllm/model_executor/layers/quantization/utils/quant_utils.py +627 -0
  921. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +458 -0
  922. vllm/model_executor/layers/resampler.py +270 -0
  923. vllm/model_executor/layers/rotary_embedding/__init__.py +190 -0
  924. vllm/model_executor/layers/rotary_embedding/base.py +156 -0
  925. vllm/model_executor/layers/rotary_embedding/common.py +105 -0
  926. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +140 -0
  927. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +197 -0
  928. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +41 -0
  929. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +67 -0
  930. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +80 -0
  931. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  932. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  933. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +81 -0
  934. vllm/model_executor/layers/rotary_embedding/mrope.py +1140 -0
  935. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +42 -0
  936. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +129 -0
  937. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +68 -0
  938. vllm/model_executor/layers/sampler.py +1198 -0
  939. vllm/model_executor/layers/shared_fused_moe/__init__.py +6 -0
  940. vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py +56 -0
  941. vllm/model_executor/layers/utils.py +196 -0
  942. vllm/model_executor/layers/vocab_parallel_embedding.py +487 -0
  943. vllm/model_executor/model_loader/__init__.py +138 -0
  944. vllm/model_executor/model_loader/base_loader.py +52 -0
  945. vllm/model_executor/model_loader/bitsandbytes_loader.py +787 -0
  946. vllm/model_executor/model_loader/default_loader.py +278 -0
  947. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  948. vllm/model_executor/model_loader/gguf_loader.py +155 -0
  949. vllm/model_executor/model_loader/runai_streamer_loader.py +104 -0
  950. vllm/model_executor/model_loader/sharded_state_loader.py +199 -0
  951. vllm/model_executor/model_loader/tensorizer.py +743 -0
  952. vllm/model_executor/model_loader/tensorizer_loader.py +143 -0
  953. vllm/model_executor/model_loader/tpu.py +114 -0
  954. vllm/model_executor/model_loader/utils.py +271 -0
  955. vllm/model_executor/model_loader/weight_utils.py +946 -0
  956. vllm/model_executor/models/__init__.py +30 -0
  957. vllm/model_executor/models/adapters.py +542 -0
  958. vllm/model_executor/models/aimv2.py +246 -0
  959. vllm/model_executor/models/apertus.py +582 -0
  960. vllm/model_executor/models/arcee.py +423 -0
  961. vllm/model_executor/models/arctic.py +560 -0
  962. vllm/model_executor/models/aria.py +662 -0
  963. vllm/model_executor/models/aya_vision.py +470 -0
  964. vllm/model_executor/models/baichuan.py +475 -0
  965. vllm/model_executor/models/bailing_moe.py +529 -0
  966. vllm/model_executor/models/bamba.py +582 -0
  967. vllm/model_executor/models/bart.py +1343 -0
  968. vllm/model_executor/models/bert.py +613 -0
  969. vllm/model_executor/models/bert_with_rope.py +687 -0
  970. vllm/model_executor/models/blip.py +339 -0
  971. vllm/model_executor/models/blip2.py +716 -0
  972. vllm/model_executor/models/bloom.py +374 -0
  973. vllm/model_executor/models/chameleon.py +1141 -0
  974. vllm/model_executor/models/chatglm.py +479 -0
  975. vllm/model_executor/models/clip.py +407 -0
  976. vllm/model_executor/models/cohere2_vision.py +484 -0
  977. vllm/model_executor/models/commandr.py +467 -0
  978. vllm/model_executor/models/config.py +434 -0
  979. vllm/model_executor/models/constant_size_cache.py +137 -0
  980. vllm/model_executor/models/dbrx.py +473 -0
  981. vllm/model_executor/models/deepseek.py +491 -0
  982. vllm/model_executor/models/deepseek_eagle.py +241 -0
  983. vllm/model_executor/models/deepseek_mtp.py +282 -0
  984. vllm/model_executor/models/deepseek_v2.py +1058 -0
  985. vllm/model_executor/models/deepseek_vl2.py +661 -0
  986. vllm/model_executor/models/donut.py +387 -0
  987. vllm/model_executor/models/dots1.py +547 -0
  988. vllm/model_executor/models/ernie45.py +43 -0
  989. vllm/model_executor/models/ernie45_moe.py +608 -0
  990. vllm/model_executor/models/ernie45_vl.py +1510 -0
  991. vllm/model_executor/models/ernie45_vl_moe.py +728 -0
  992. vllm/model_executor/models/ernie_mtp.py +287 -0
  993. vllm/model_executor/models/exaone.py +552 -0
  994. vllm/model_executor/models/exaone4.py +535 -0
  995. vllm/model_executor/models/fairseq2_llama.py +154 -0
  996. vllm/model_executor/models/falcon.py +511 -0
  997. vllm/model_executor/models/falcon_h1.py +739 -0
  998. vllm/model_executor/models/florence2.py +1107 -0
  999. vllm/model_executor/models/fuyu.py +401 -0
  1000. vllm/model_executor/models/gemma.py +428 -0
  1001. vllm/model_executor/models/gemma2.py +425 -0
  1002. vllm/model_executor/models/gemma3.py +542 -0
  1003. vllm/model_executor/models/gemma3_mm.py +723 -0
  1004. vllm/model_executor/models/gemma3n.py +830 -0
  1005. vllm/model_executor/models/gemma3n_mm.py +767 -0
  1006. vllm/model_executor/models/glm.py +23 -0
  1007. vllm/model_executor/models/glm4.py +305 -0
  1008. vllm/model_executor/models/glm4_1v.py +1669 -0
  1009. vllm/model_executor/models/glm4_moe.py +703 -0
  1010. vllm/model_executor/models/glm4_moe_mtp.py +306 -0
  1011. vllm/model_executor/models/glm4v.py +654 -0
  1012. vllm/model_executor/models/gpt2.py +383 -0
  1013. vllm/model_executor/models/gpt_bigcode.py +346 -0
  1014. vllm/model_executor/models/gpt_j.py +340 -0
  1015. vllm/model_executor/models/gpt_neox.py +333 -0
  1016. vllm/model_executor/models/gpt_oss.py +687 -0
  1017. vllm/model_executor/models/granite.py +498 -0
  1018. vllm/model_executor/models/granite_speech.py +799 -0
  1019. vllm/model_executor/models/granitemoe.py +541 -0
  1020. vllm/model_executor/models/granitemoehybrid.py +684 -0
  1021. vllm/model_executor/models/granitemoeshared.py +342 -0
  1022. vllm/model_executor/models/gritlm.py +262 -0
  1023. vllm/model_executor/models/grok1.py +550 -0
  1024. vllm/model_executor/models/h2ovl.py +536 -0
  1025. vllm/model_executor/models/hunyuan_v1.py +937 -0
  1026. vllm/model_executor/models/hyperclovax_vision.py +1206 -0
  1027. vllm/model_executor/models/idefics2_vision_model.py +416 -0
  1028. vllm/model_executor/models/idefics3.py +758 -0
  1029. vllm/model_executor/models/interfaces.py +854 -0
  1030. vllm/model_executor/models/interfaces_base.py +195 -0
  1031. vllm/model_executor/models/intern_vit.py +481 -0
  1032. vllm/model_executor/models/internlm2.py +453 -0
  1033. vllm/model_executor/models/internlm2_ve.py +148 -0
  1034. vllm/model_executor/models/interns1.py +832 -0
  1035. vllm/model_executor/models/interns1_vit.py +418 -0
  1036. vllm/model_executor/models/internvl.py +1423 -0
  1037. vllm/model_executor/models/jais.py +374 -0
  1038. vllm/model_executor/models/jamba.py +630 -0
  1039. vllm/model_executor/models/jina_vl.py +144 -0
  1040. vllm/model_executor/models/keye.py +1684 -0
  1041. vllm/model_executor/models/keye_vl1_5.py +601 -0
  1042. vllm/model_executor/models/kimi_vl.py +620 -0
  1043. vllm/model_executor/models/lfm2.py +558 -0
  1044. vllm/model_executor/models/llama.py +671 -0
  1045. vllm/model_executor/models/llama4.py +732 -0
  1046. vllm/model_executor/models/llama4_eagle.py +241 -0
  1047. vllm/model_executor/models/llama_eagle.py +171 -0
  1048. vllm/model_executor/models/llama_eagle3.py +292 -0
  1049. vllm/model_executor/models/llava.py +872 -0
  1050. vllm/model_executor/models/llava_next.py +572 -0
  1051. vllm/model_executor/models/llava_next_video.py +479 -0
  1052. vllm/model_executor/models/llava_onevision.py +945 -0
  1053. vllm/model_executor/models/mamba.py +310 -0
  1054. vllm/model_executor/models/mamba2.py +346 -0
  1055. vllm/model_executor/models/mamba_cache.py +83 -0
  1056. vllm/model_executor/models/medusa.py +219 -0
  1057. vllm/model_executor/models/midashenglm.py +788 -0
  1058. vllm/model_executor/models/mimo.py +191 -0
  1059. vllm/model_executor/models/mimo_mtp.py +273 -0
  1060. vllm/model_executor/models/minicpm.py +593 -0
  1061. vllm/model_executor/models/minicpm3.py +230 -0
  1062. vllm/model_executor/models/minicpm_eagle.py +391 -0
  1063. vllm/model_executor/models/minicpmo.py +804 -0
  1064. vllm/model_executor/models/minicpmv.py +1786 -0
  1065. vllm/model_executor/models/minimax_cache.py +36 -0
  1066. vllm/model_executor/models/minimax_text_01.py +1027 -0
  1067. vllm/model_executor/models/minimax_vl_01.py +431 -0
  1068. vllm/model_executor/models/mistral3.py +628 -0
  1069. vllm/model_executor/models/mixtral.py +494 -0
  1070. vllm/model_executor/models/mllama.py +1697 -0
  1071. vllm/model_executor/models/mllama4.py +1079 -0
  1072. vllm/model_executor/models/mlp_speculator.py +206 -0
  1073. vllm/model_executor/models/modernbert.py +374 -0
  1074. vllm/model_executor/models/module_mapping.py +72 -0
  1075. vllm/model_executor/models/molmo.py +1569 -0
  1076. vllm/model_executor/models/moonvit.py +663 -0
  1077. vllm/model_executor/models/motif.py +345 -0
  1078. vllm/model_executor/models/mpt.py +332 -0
  1079. vllm/model_executor/models/nano_nemotron_vl.py +1395 -0
  1080. vllm/model_executor/models/nemotron.py +509 -0
  1081. vllm/model_executor/models/nemotron_h.py +633 -0
  1082. vllm/model_executor/models/nemotron_nas.py +484 -0
  1083. vllm/model_executor/models/nemotron_vl.py +655 -0
  1084. vllm/model_executor/models/nvlm_d.py +203 -0
  1085. vllm/model_executor/models/olmo.py +406 -0
  1086. vllm/model_executor/models/olmo2.py +428 -0
  1087. vllm/model_executor/models/olmoe.py +485 -0
  1088. vllm/model_executor/models/opt.py +413 -0
  1089. vllm/model_executor/models/orion.py +350 -0
  1090. vllm/model_executor/models/ovis.py +572 -0
  1091. vllm/model_executor/models/ovis2_5.py +644 -0
  1092. vllm/model_executor/models/paligemma.py +414 -0
  1093. vllm/model_executor/models/persimmon.py +345 -0
  1094. vllm/model_executor/models/phi.py +357 -0
  1095. vllm/model_executor/models/phi3.py +19 -0
  1096. vllm/model_executor/models/phi3v.py +701 -0
  1097. vllm/model_executor/models/phi4_multimodal.py +1478 -0
  1098. vllm/model_executor/models/phi4flash.py +737 -0
  1099. vllm/model_executor/models/phi4mm.py +1281 -0
  1100. vllm/model_executor/models/phi4mm_audio.py +1254 -0
  1101. vllm/model_executor/models/phi4mm_utils.py +1875 -0
  1102. vllm/model_executor/models/phimoe.py +681 -0
  1103. vllm/model_executor/models/pixtral.py +1348 -0
  1104. vllm/model_executor/models/plamo2.py +1126 -0
  1105. vllm/model_executor/models/qwen.py +363 -0
  1106. vllm/model_executor/models/qwen2.py +526 -0
  1107. vllm/model_executor/models/qwen2_5_omni_thinker.py +985 -0
  1108. vllm/model_executor/models/qwen2_5_vl.py +1256 -0
  1109. vllm/model_executor/models/qwen2_audio.py +492 -0
  1110. vllm/model_executor/models/qwen2_moe.py +558 -0
  1111. vllm/model_executor/models/qwen2_rm.py +122 -0
  1112. vllm/model_executor/models/qwen2_vl.py +1512 -0
  1113. vllm/model_executor/models/qwen3.py +344 -0
  1114. vllm/model_executor/models/qwen3_moe.py +704 -0
  1115. vllm/model_executor/models/qwen3_next.py +1298 -0
  1116. vllm/model_executor/models/qwen3_next_mtp.py +285 -0
  1117. vllm/model_executor/models/qwen_vl.py +795 -0
  1118. vllm/model_executor/models/registry.py +891 -0
  1119. vllm/model_executor/models/roberta.py +252 -0
  1120. vllm/model_executor/models/rvl.py +103 -0
  1121. vllm/model_executor/models/seed_oss.py +488 -0
  1122. vllm/model_executor/models/siglip.py +524 -0
  1123. vllm/model_executor/models/siglip2navit.py +688 -0
  1124. vllm/model_executor/models/skyworkr1v.py +914 -0
  1125. vllm/model_executor/models/smolvlm.py +44 -0
  1126. vllm/model_executor/models/solar.py +506 -0
  1127. vllm/model_executor/models/stablelm.py +344 -0
  1128. vllm/model_executor/models/starcoder2.py +357 -0
  1129. vllm/model_executor/models/step3_text.py +521 -0
  1130. vllm/model_executor/models/step3_vl.py +1091 -0
  1131. vllm/model_executor/models/swin.py +475 -0
  1132. vllm/model_executor/models/tarsier.py +649 -0
  1133. vllm/model_executor/models/telechat2.py +151 -0
  1134. vllm/model_executor/models/teleflm.py +79 -0
  1135. vllm/model_executor/models/terratorch.py +294 -0
  1136. vllm/model_executor/models/transformers.py +883 -0
  1137. vllm/model_executor/models/ultravox.py +667 -0
  1138. vllm/model_executor/models/utils.py +770 -0
  1139. vllm/model_executor/models/vision.py +125 -0
  1140. vllm/model_executor/models/voxtral.py +789 -0
  1141. vllm/model_executor/models/whisper.py +966 -0
  1142. vllm/model_executor/models/zamba2.py +1056 -0
  1143. vllm/model_executor/parameter.py +599 -0
  1144. vllm/model_executor/sampling_metadata.py +597 -0
  1145. vllm/model_executor/utils.py +97 -0
  1146. vllm/model_executor/warmup/__init__.py +0 -0
  1147. vllm/model_executor/warmup/deep_gemm_warmup.py +223 -0
  1148. vllm/model_executor/warmup/kernel_warmup.py +83 -0
  1149. vllm/multimodal/__init__.py +35 -0
  1150. vllm/multimodal/audio.py +116 -0
  1151. vllm/multimodal/base.py +219 -0
  1152. vllm/multimodal/cache.py +507 -0
  1153. vllm/multimodal/hasher.py +110 -0
  1154. vllm/multimodal/image.py +130 -0
  1155. vllm/multimodal/inputs.py +979 -0
  1156. vllm/multimodal/parse.py +496 -0
  1157. vllm/multimodal/processing.py +1921 -0
  1158. vllm/multimodal/profiling.py +313 -0
  1159. vllm/multimodal/registry.py +375 -0
  1160. vllm/multimodal/utils.py +754 -0
  1161. vllm/multimodal/video.py +312 -0
  1162. vllm/outputs.py +517 -0
  1163. vllm/platforms/__init__.py +263 -0
  1164. vllm/platforms/cpu.py +353 -0
  1165. vllm/platforms/cuda.py +731 -0
  1166. vllm/platforms/interface.py +599 -0
  1167. vllm/platforms/rocm.py +504 -0
  1168. vllm/platforms/tpu.py +236 -0
  1169. vllm/platforms/xpu.py +243 -0
  1170. vllm/plugins/__init__.py +72 -0
  1171. vllm/plugins/io_processors/__init__.py +68 -0
  1172. vllm/plugins/io_processors/interface.py +67 -0
  1173. vllm/plugins/lora_resolvers/README.md +16 -0
  1174. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1175. vllm/plugins/lora_resolvers/filesystem_resolver.py +50 -0
  1176. vllm/pooling_params.py +183 -0
  1177. vllm/profiler/__init__.py +0 -0
  1178. vllm/profiler/layerwise_profile.py +375 -0
  1179. vllm/profiler/utils.py +148 -0
  1180. vllm/py.typed +2 -0
  1181. vllm/ray/__init__.py +0 -0
  1182. vllm/ray/lazy_utils.py +22 -0
  1183. vllm/ray/ray_env.py +72 -0
  1184. vllm/reasoning/__init__.py +25 -0
  1185. vllm/reasoning/abs_reasoning_parsers.py +202 -0
  1186. vllm/reasoning/deepseek_r1_reasoning_parser.py +173 -0
  1187. vllm/reasoning/glm4_moe_reasoning_parser.py +151 -0
  1188. vllm/reasoning/gptoss_reasoning_parser.py +87 -0
  1189. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1190. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +245 -0
  1191. vllm/reasoning/mistral_reasoning_parser.py +47 -0
  1192. vllm/reasoning/qwen3_reasoning_parser.py +151 -0
  1193. vllm/reasoning/step3_reasoning_parser.py +109 -0
  1194. vllm/sampling_params.py +577 -0
  1195. vllm/scalar_type.py +349 -0
  1196. vllm/scripts.py +15 -0
  1197. vllm/sequence.py +1465 -0
  1198. vllm/tasks.py +11 -0
  1199. vllm/test_utils.py +130 -0
  1200. vllm/third_party/__init__.py +0 -0
  1201. vllm/third_party/pynvml.py +6140 -0
  1202. vllm/tracing.py +136 -0
  1203. vllm/transformers_utils/__init__.py +24 -0
  1204. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1205. vllm/transformers_utils/chat_templates/registry.py +71 -0
  1206. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1207. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1208. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1209. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1210. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1211. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1212. vllm/transformers_utils/config.py +1043 -0
  1213. vllm/transformers_utils/config_parser_base.py +20 -0
  1214. vllm/transformers_utils/configs/__init__.py +55 -0
  1215. vllm/transformers_utils/configs/arctic.py +207 -0
  1216. vllm/transformers_utils/configs/chatglm.py +72 -0
  1217. vllm/transformers_utils/configs/deepseek_vl2.py +216 -0
  1218. vllm/transformers_utils/configs/eagle.py +84 -0
  1219. vllm/transformers_utils/configs/falcon.py +90 -0
  1220. vllm/transformers_utils/configs/jais.py +238 -0
  1221. vllm/transformers_utils/configs/kimi_vl.py +37 -0
  1222. vllm/transformers_utils/configs/medusa.py +63 -0
  1223. vllm/transformers_utils/configs/midashenglm.py +101 -0
  1224. vllm/transformers_utils/configs/mistral.py +165 -0
  1225. vllm/transformers_utils/configs/mlp_speculator.py +68 -0
  1226. vllm/transformers_utils/configs/moonvit.py +33 -0
  1227. vllm/transformers_utils/configs/nemotron.py +205 -0
  1228. vllm/transformers_utils/configs/nemotron_h.py +259 -0
  1229. vllm/transformers_utils/configs/nemotron_vl.py +56 -0
  1230. vllm/transformers_utils/configs/ovis.py +176 -0
  1231. vllm/transformers_utils/configs/qwen3_next.py +275 -0
  1232. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1233. vllm/transformers_utils/configs/speculators/algos.py +32 -0
  1234. vllm/transformers_utils/configs/speculators/base.py +91 -0
  1235. vllm/transformers_utils/configs/step3_vl.py +123 -0
  1236. vllm/transformers_utils/configs/ultravox.py +120 -0
  1237. vllm/transformers_utils/detokenizer.py +169 -0
  1238. vllm/transformers_utils/detokenizer_utils.py +199 -0
  1239. vllm/transformers_utils/dynamic_module.py +60 -0
  1240. vllm/transformers_utils/processor.py +245 -0
  1241. vllm/transformers_utils/processors/__init__.py +16 -0
  1242. vllm/transformers_utils/processors/deepseek_vl2.py +363 -0
  1243. vllm/transformers_utils/processors/ovis.py +420 -0
  1244. vllm/transformers_utils/processors/ovis2_5.py +458 -0
  1245. vllm/transformers_utils/runai_utils.py +99 -0
  1246. vllm/transformers_utils/s3_utils.py +90 -0
  1247. vllm/transformers_utils/tokenizer.py +293 -0
  1248. vllm/transformers_utils/tokenizer_base.py +149 -0
  1249. vllm/transformers_utils/tokenizer_group.py +132 -0
  1250. vllm/transformers_utils/tokenizers/__init__.py +10 -0
  1251. vllm/transformers_utils/tokenizers/mistral.py +520 -0
  1252. vllm/transformers_utils/utils.py +99 -0
  1253. vllm/triton_utils/__init__.py +16 -0
  1254. vllm/triton_utils/importing.py +95 -0
  1255. vllm/usage/__init__.py +0 -0
  1256. vllm/usage/usage_lib.py +259 -0
  1257. vllm/utils/__init__.py +3438 -0
  1258. vllm/utils/deep_gemm.py +212 -0
  1259. vllm/utils/flashinfer.py +372 -0
  1260. vllm/utils/jsontree.py +90 -0
  1261. vllm/utils/tensor_schema.py +236 -0
  1262. vllm/v1/__init__.py +0 -0
  1263. vllm/v1/attention/__init__.py +0 -0
  1264. vllm/v1/attention/backends/__init__.py +0 -0
  1265. vllm/v1/attention/backends/cpu_attn.py +922 -0
  1266. vllm/v1/attention/backends/flash_attn.py +800 -0
  1267. vllm/v1/attention/backends/flashinfer.py +1128 -0
  1268. vllm/v1/attention/backends/flex_attention.py +796 -0
  1269. vllm/v1/attention/backends/gdn_attn.py +320 -0
  1270. vllm/v1/attention/backends/linear_attn.py +68 -0
  1271. vllm/v1/attention/backends/mamba1_attn.py +81 -0
  1272. vllm/v1/attention/backends/mamba2_attn.py +224 -0
  1273. vllm/v1/attention/backends/mamba_attn.py +52 -0
  1274. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1275. vllm/v1/attention/backends/mla/common.py +1608 -0
  1276. vllm/v1/attention/backends/mla/cutlass_mla.py +301 -0
  1277. vllm/v1/attention/backends/mla/flashattn_mla.py +273 -0
  1278. vllm/v1/attention/backends/mla/flashinfer_mla.py +110 -0
  1279. vllm/v1/attention/backends/mla/flashmla.py +213 -0
  1280. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +255 -0
  1281. vllm/v1/attention/backends/mla/triton_mla.py +175 -0
  1282. vllm/v1/attention/backends/pallas.py +413 -0
  1283. vllm/v1/attention/backends/rocm_aiter_fa.py +548 -0
  1284. vllm/v1/attention/backends/short_conv_attn.py +82 -0
  1285. vllm/v1/attention/backends/tree_attn.py +450 -0
  1286. vllm/v1/attention/backends/triton_attn.py +430 -0
  1287. vllm/v1/attention/backends/utils.py +834 -0
  1288. vllm/v1/attention/backends/xformers.py +437 -0
  1289. vllm/v1/core/__init__.py +0 -0
  1290. vllm/v1/core/block_pool.py +330 -0
  1291. vllm/v1/core/encoder_cache_manager.py +333 -0
  1292. vllm/v1/core/kv_cache_coordinator.py +440 -0
  1293. vllm/v1/core/kv_cache_manager.py +398 -0
  1294. vllm/v1/core/kv_cache_utils.py +1169 -0
  1295. vllm/v1/core/sched/__init__.py +0 -0
  1296. vllm/v1/core/sched/async_scheduler.py +47 -0
  1297. vllm/v1/core/sched/interface.py +158 -0
  1298. vllm/v1/core/sched/output.py +162 -0
  1299. vllm/v1/core/sched/request_queue.py +224 -0
  1300. vllm/v1/core/sched/scheduler.py +1287 -0
  1301. vllm/v1/core/sched/utils.py +69 -0
  1302. vllm/v1/core/single_type_kv_cache_manager.py +670 -0
  1303. vllm/v1/cudagraph_dispatcher.py +121 -0
  1304. vllm/v1/engine/__init__.py +202 -0
  1305. vllm/v1/engine/async_llm.py +757 -0
  1306. vllm/v1/engine/coordinator.py +357 -0
  1307. vllm/v1/engine/core.py +1245 -0
  1308. vllm/v1/engine/core_client.py +1333 -0
  1309. vllm/v1/engine/detokenizer.py +300 -0
  1310. vllm/v1/engine/exceptions.py +17 -0
  1311. vllm/v1/engine/llm_engine.py +332 -0
  1312. vllm/v1/engine/logprobs.py +201 -0
  1313. vllm/v1/engine/output_processor.py +558 -0
  1314. vllm/v1/engine/parallel_sampling.py +133 -0
  1315. vllm/v1/engine/processor.py +524 -0
  1316. vllm/v1/engine/utils.py +857 -0
  1317. vllm/v1/executor/__init__.py +0 -0
  1318. vllm/v1/executor/abstract.py +126 -0
  1319. vllm/v1/executor/multiproc_executor.py +683 -0
  1320. vllm/v1/executor/ray_distributed_executor.py +109 -0
  1321. vllm/v1/kv_cache_interface.py +275 -0
  1322. vllm/v1/metrics/__init__.py +0 -0
  1323. vllm/v1/metrics/loggers.py +717 -0
  1324. vllm/v1/metrics/prometheus.py +82 -0
  1325. vllm/v1/metrics/ray_wrappers.py +133 -0
  1326. vllm/v1/metrics/reader.py +246 -0
  1327. vllm/v1/metrics/stats.py +248 -0
  1328. vllm/v1/outputs.py +147 -0
  1329. vllm/v1/pool/__init__.py +0 -0
  1330. vllm/v1/pool/metadata.py +77 -0
  1331. vllm/v1/request.py +237 -0
  1332. vllm/v1/sample/__init__.py +0 -0
  1333. vllm/v1/sample/logits_processor/__init__.py +294 -0
  1334. vllm/v1/sample/logits_processor/builtin.py +273 -0
  1335. vllm/v1/sample/logits_processor/interface.py +97 -0
  1336. vllm/v1/sample/logits_processor/state.py +161 -0
  1337. vllm/v1/sample/metadata.py +43 -0
  1338. vllm/v1/sample/ops/__init__.py +0 -0
  1339. vllm/v1/sample/ops/bad_words.py +39 -0
  1340. vllm/v1/sample/ops/logprobs.py +26 -0
  1341. vllm/v1/sample/ops/penalties.py +43 -0
  1342. vllm/v1/sample/ops/topk_topp_sampler.py +254 -0
  1343. vllm/v1/sample/rejection_sampler.py +623 -0
  1344. vllm/v1/sample/sampler.py +281 -0
  1345. vllm/v1/sample/tpu/__init__.py +0 -0
  1346. vllm/v1/sample/tpu/metadata.py +124 -0
  1347. vllm/v1/sample/tpu/sampler.py +213 -0
  1348. vllm/v1/serial_utils.py +395 -0
  1349. vllm/v1/spec_decode/__init__.py +0 -0
  1350. vllm/v1/spec_decode/eagle.py +740 -0
  1351. vllm/v1/spec_decode/medusa.py +66 -0
  1352. vllm/v1/spec_decode/metadata.py +62 -0
  1353. vllm/v1/spec_decode/metrics.py +191 -0
  1354. vllm/v1/spec_decode/ngram_proposer.py +157 -0
  1355. vllm/v1/spec_decode/utils.py +14 -0
  1356. vllm/v1/structured_output/__init__.py +297 -0
  1357. vllm/v1/structured_output/backend_guidance.py +245 -0
  1358. vllm/v1/structured_output/backend_lm_format_enforcer.py +167 -0
  1359. vllm/v1/structured_output/backend_outlines.py +320 -0
  1360. vllm/v1/structured_output/backend_types.py +134 -0
  1361. vllm/v1/structured_output/backend_xgrammar.py +323 -0
  1362. vllm/v1/structured_output/request.py +86 -0
  1363. vllm/v1/structured_output/utils.py +373 -0
  1364. vllm/v1/utils.py +382 -0
  1365. vllm/v1/worker/__init__.py +0 -0
  1366. vllm/v1/worker/block_table.py +221 -0
  1367. vllm/v1/worker/cpu_model_runner.py +163 -0
  1368. vllm/v1/worker/cpu_worker.py +183 -0
  1369. vllm/v1/worker/gpu_input_batch.py +821 -0
  1370. vllm/v1/worker/gpu_model_runner.py +3743 -0
  1371. vllm/v1/worker/gpu_worker.py +697 -0
  1372. vllm/v1/worker/kv_connector_model_runner_mixin.py +122 -0
  1373. vllm/v1/worker/lora_model_runner_mixin.py +192 -0
  1374. vllm/v1/worker/tpu_input_batch.py +585 -0
  1375. vllm/v1/worker/tpu_model_runner.py +1947 -0
  1376. vllm/v1/worker/tpu_worker.py +340 -0
  1377. vllm/v1/worker/utils.py +290 -0
  1378. vllm/v1/worker/worker_base.py +65 -0
  1379. vllm/v1/worker/xpu_model_runner.py +53 -0
  1380. vllm/v1/worker/xpu_worker.py +179 -0
  1381. vllm/version.py +41 -0
  1382. vllm/vllm_flash_attn/.gitkeep +0 -0
  1383. vllm/worker/__init__.py +0 -0
  1384. vllm/worker/cache_engine.py +145 -0
  1385. vllm/worker/enc_dec_model_runner.py +553 -0
  1386. vllm/worker/model_runner.py +2016 -0
  1387. vllm/worker/model_runner_base.py +307 -0
  1388. vllm/worker/utils.py +49 -0
  1389. vllm/worker/worker.py +670 -0
  1390. vllm/worker/worker_base.py +651 -0
  1391. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/METADATA +326 -0
  1392. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/RECORD +1395 -0
  1393. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/WHEEL +5 -0
  1394. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/entry_points.txt +5 -0
  1395. vllm_cpu_avx512vnni-0.10.2.post2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2651 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+ """
4
+ This module defines a framework for sampling benchmark requests from various
5
+ datasets. Each dataset subclass of BenchmarkDataset must implement sample
6
+ generation. Supported dataset types include:
7
+ - ShareGPT
8
+ - Random (synthetic)
9
+ - Sonnet
10
+ - BurstGPT
11
+ - HuggingFace
12
+ - VisionArena
13
+ """
14
+ import ast
15
+ import base64
16
+ import io
17
+ import json
18
+ import logging
19
+ import math
20
+ import random
21
+ from abc import ABC, abstractmethod
22
+ from collections.abc import Iterator, Mapping
23
+ from contextlib import suppress
24
+ from copy import deepcopy
25
+ from dataclasses import dataclass
26
+ from functools import cache
27
+ from io import BytesIO
28
+ from typing import Any, Callable, Optional, Union, cast
29
+
30
+ import numpy as np
31
+ from PIL import Image
32
+ from transformers import PreTrainedTokenizerBase
33
+ from typing_extensions import deprecated
34
+
35
+ from vllm.lora.request import LoRARequest
36
+ from vllm.lora.utils import get_adapter_absolute_path
37
+ from vllm.multimodal import MultiModalDataDict
38
+ from vllm.multimodal.image import convert_image_mode
39
+ from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
40
+ from vllm.utils import PlaceholderModule
41
+
42
+ try:
43
+ from datasets import load_dataset
44
+ except ImportError:
45
+ datasets = PlaceholderModule("datasets")
46
+ load_dataset = datasets.placeholder_attr("load_dataset")
47
+
48
+ try:
49
+ import pandas as pd
50
+ except ImportError:
51
+ pd = PlaceholderModule("pandas")
52
+
53
+ try:
54
+ import librosa
55
+ except ImportError:
56
+ librosa = PlaceholderModule("librosa")
57
+
58
+ try:
59
+ from vllm.utils import FlexibleArgumentParser
60
+ except ImportError:
61
+ from argparse import ArgumentParser as FlexibleArgumentParser
62
+
63
+ logger = logging.getLogger(__name__)
64
+
65
+ # -----------------------------------------------------------------------------
66
+ # Data Classes
67
+ # -----------------------------------------------------------------------------
68
+
69
+
70
@dataclass
class SampleRequest:
    """
    One inference request used by the benchmarks.

    Carries the prompt together with its length, the number of output tokens
    the benchmark expects, and optional multimodal, LoRA and request-id
    information.
    """

    # Prompt text, or a list of prompt strings.
    prompt: Union[str, list[str]]
    # Length of the prompt (presumably in tokens -- confirm against samplers).
    prompt_len: int
    # Number of output tokens expected for this request.
    expected_output_len: int
    # Optional multimodal payload; None when the request is text-only.
    multi_modal_data: Union[MultiModalDataDict, dict, list[dict], None] = None
    # Optional LoRA adapter request associated with this sample.
    lora_request: Union[LoRARequest, None] = None
    # Optional identifier for the request.
    request_id: Union[str, None] = None
84
+
85
+
86
+ # -----------------------------------------------------------------------------
87
+ # Benchmark Dataset Base Class
88
+ # -----------------------------------------------------------------------------
89
+
90
+
91
class BenchmarkDataset(ABC):
    """
    Abstract base class for datasets that produce benchmark requests.

    Subclasses must implement `sample` and are expected to override
    `load_data`. The base class stores the dataset path and random seed and
    provides shared helpers for chat formatting, random LoRA selection and
    oversampling.
    """
    DEFAULT_SEED = 0
    IS_MULTIMODAL = False

    def __init__(
        self,
        dataset_path: Optional[str] = None,
        random_seed: int = DEFAULT_SEED,
    ) -> None:
        """
        Initialize the BenchmarkDataset with an optional dataset path and random
        seed.

        Args:
            dataset_path (Optional[str]): Path to the dataset. If None, it
                indicates that a default or random dataset might be used.
            random_seed (int): Seed value for reproducible shuffling or
                sampling. Defaults to DEFAULT_SEED.
        """
        self.dataset_path = dataset_path
        # Set the random seed, ensuring that a None value is replaced with the
        # default seed.
        self.random_seed = (random_seed
                            if random_seed is not None else self.DEFAULT_SEED)
        self.data = None

    def apply_multimodal_chat_transformation(
            self,
            prompt: str,
            mm_content: Optional[
                Union[MultiModalDataDict, dict, list[dict]]
            ] = None) -> list[dict]:
        """
        Transform a prompt and optional multimodal content into a chat format.
        This method is used for chat models that expect a specific conversation
        format.

        Args:
            prompt (str): The text of the user message.
            mm_content: A single multimodal content dict, or a list of them,
                appended after the text entry. `None` means text only.

        Returns:
            list[dict]: A one-element conversation holding a single "user"
            message whose content starts with the text entry.

        Raises:
            TypeError: If `mm_content` is neither a list nor a dict.
        """
        content = [{"text": prompt, "type": "text"}]
        if mm_content is not None:
            if isinstance(mm_content, list):
                content.extend(cast(list[dict[str, Any]], mm_content))
            elif isinstance(mm_content, dict):
                content.append(mm_content)
            else:
                raise TypeError(
                    "Could not process multimodal content of type: " +
                    f"{type(mm_content)}"
                )
        return [{"role": "user", "content": content}]

    def load_data(self) -> None:
        """
        Load data from the dataset path into self.data.

        This method must be overridden by subclasses since the method to load
        data will vary depending on the dataset format and source.

        Raises:
            NotImplementedError: If a subclass does not implement this method.
        """
        # TODO (jenniferzhao): add support for downloading data
        raise NotImplementedError(
            "load_data must be implemented in subclasses.")

    def get_random_lora_request(
        self,
        tokenizer: PreTrainedTokenizerBase,
        max_loras: Optional[int] = None,
        lora_path: Optional[str] = None,
    ) -> tuple[Optional[LoRARequest], AnyTokenizer]:
        """
        Optionally select a random LoRA request and return its associated
        tokenizer.

        This method is used when LoRA parameters are provided. It randomly
        selects a LoRA based on max_loras and retrieves a cached tokenizer for
        that LoRA if available. Otherwise, it returns the base tokenizer.

        Args:
            tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
                LoRA is selected.
            max_loras (Optional[int]): The maximum number of LoRAs available.
                If `None`, LoRA is not used.
            lora_path (Optional[str]): Path to the LoRA parameters on disk.
                If `None`, LoRA is not used.

        Returns:
            A tuple with the following elements:
                - A new [LoRARequest][] (or `None` if not applicable).
                - The tokenizer associated with the LoRA request
                  (or the base tokenizer).
        """
        if max_loras is None or lora_path is None:
            return None, tokenizer

        # Generate a random LoRA ID in the range [1, max_loras].
        lora_id = random.randint(1, max_loras)
        lora_request = LoRARequest(
            lora_name=str(lora_id),
            lora_int_id=lora_id,
            lora_path=lora_path_on_disk(lora_path),
        )
        if lora_id not in lora_tokenizer_cache:
            lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
        # Return lora_request and the cached tokenizer if available; otherwise,
        # return the base tokenizer
        return lora_request, lora_tokenizer_cache[lora_id] or tokenizer

    @abstractmethod
    def sample(self, tokenizer: PreTrainedTokenizerBase,
               num_requests: int,
               request_id_prefix: str = "",
               no_oversample: bool = False) -> list[SampleRequest]:
        """
        Abstract method to generate sample requests from the dataset.

        Subclasses must override this method to implement dataset-specific logic
        for generating a list of SampleRequest objects.

        Args:
            tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
                for processing the dataset's text.
            num_requests (int): The number of sample requests to generate.
            request_id_prefix (str): The prefix of request_id.
            no_oversample (bool): Flag with the same name as the one accepted
                by `maybe_oversample_requests`, where it disables
                oversampling.

        Returns:
            list[SampleRequest]: A list of sample requests generated from the
            dataset.
        """
        raise NotImplementedError("sample must be implemented in subclasses.")

    def maybe_oversample_requests(
        self,
        requests: list[SampleRequest],
        num_requests: int,
        request_id_prefix: str = "",
        no_oversample: bool = False,
    ) -> None:
        """
        Oversamples the list of requests if its size is less than the desired
        number. The list is extended in place.

        Args:
            requests (List[SampleRequest]): The current list of sampled
                requests.
            num_requests (int): The target number of requests.
            request_id_prefix (str): The prefix of the request ids.
            no_oversample (bool): If True, only log the current number of
                samples and leave `requests` untouched.
        """
        if no_oversample:
            logger.info("Skipping oversampling. " \
                        "Total samples: %d.", len(requests))
            return

        if len(requests) < num_requests:
            random.seed(self.random_seed)
            existing = len(requests)
            picked = random.choices(requests, k=num_requests - existing)
            additional = []
            for offset, source in enumerate(picked):
                # Copy every pick on its own. Deep-copying the whole list at
                # once would map a request chosen several times to a single
                # shared copy, so the request ids assigned below would
                # overwrite each other and end up duplicated.
                clone = deepcopy(source)
                clone.request_id = request_id_prefix + str(existing + offset)
                additional.append(clone)
            requests.extend(additional)
            logger.info("Oversampled requests to reach %d total samples.",
                        num_requests)
257
+
258
+
259
+ # -----------------------------------------------------------------------------
260
+ # Utility Functions and Global Caches
261
+ # -----------------------------------------------------------------------------
262
+
263
+
264
def is_valid_sequence(
    prompt_len: int,
    output_len: int,
    min_len: int = 4,
    max_prompt_len: int = 1024,
    max_total_len: int = 2048,
    skip_min_output_len_check: bool = False,
) -> bool:
    """
    Decide whether a (prompt, output) length pair should be kept.

    A pair is rejected when the prompt is shorter than `min_len`, when the
    output is shorter than `min_len` (unless `skip_min_output_len_check` is
    set), when the prompt exceeds `max_prompt_len`, or when prompt plus output
    exceeds `max_total_len`.

    The default thresholds are copied from the original `sample_hf_requests`
    and `sample_sharegpt_requests` functions in benchmark_serving.py, as well
    as from `sample_requests` in benchmark_throughput.py.
    """
    # Reject as soon as any single criterion fails.
    if prompt_len < min_len:
        return False
    if not skip_min_output_len_check and output_len < min_len:
        return False
    if prompt_len > max_prompt_len:
        return False
    # Last criterion: the combined length must stay within the total budget.
    return not (prompt_len + output_len > max_total_len)
289
+
290
+
291
@cache
def lora_path_on_disk(lora_path: str) -> str:
    """
    Return the result of `get_adapter_absolute_path` for `lora_path`.

    The result is memoised with `functools.cache`, so each distinct path is
    resolved only once per process.
    """
    resolved_path = get_adapter_absolute_path(lora_path)
    return resolved_path


# Module-wide tokenizer cache, keyed by the integer LoRA id.
lora_tokenizer_cache: dict[int, AnyTokenizer] = {}
298
+
299
+
300
def process_image(image: Any) -> Mapping[str, Any]:
    """
    Process a single image input and return a multimedia content dictionary.

    Supports the following input types:

    1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
       containing raw image data. - Loads the bytes as a PIL.Image.Image.

    2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
       a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
       a dictionary with the image as a base64 data URL.

    3. String input: - Treats the string as a URL or local file path. -
       Prepends "file://" if the string doesn't start with "http://",
       "https://" or "file://". - Returns a dictionary with the image URL.

    Raises:
        ValueError: If the input is not a supported type.
    """
    # Strings are handled first; a str never matches the dict or PIL branches
    # below, so the order does not affect the result.
    if isinstance(image, str):
        # "https://" must be accepted explicitly: it does not start with
        # "http://", and would otherwise be mangled into "file://https://...".
        image_url = (image if image.startswith(
            ("http://", "https://", "file://")) else f"file://{image}")
        return {"type": "image_url", "image_url": {"url": image_url}}

    if isinstance(image, dict) and 'bytes' in image:
        image = Image.open(BytesIO(image['bytes']))
    if isinstance(image, Image.Image):
        image = convert_image_mode(image, "RGB")
        with io.BytesIO() as image_data:
            image.save(image_data, format="JPEG")
            image_base64 = base64.b64encode(
                image_data.getvalue()).decode("utf-8")
        return {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{image_base64}"
            },
        }

    raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
                     " or str or dictionary with raw image bytes.")
342
+
343
+
344
def process_video(video: Any) -> Mapping[str, Any]:
    """
    Process a single video input and return a multimedia content dictionary.

    Supports the following input types:

    1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key
       containing raw video data. - Encodes the bytes as base64 and returns a
       dictionary with the video as a "data:video/mp4" URL.

    2. String input: - Treats the string as a URL or local file path. -
       Prepends "file://" if the string doesn't start with "http://",
       "https://" or "file://". - Returns a dictionary with the video URL.

    Raises:
        ValueError: If the input is not a supported type.
    """
    if isinstance(video, dict) and 'bytes' in video:
        video_bytes = video['bytes']
        video_base64 = base64.b64encode(video_bytes).decode("utf-8")
        return {
            "type": "video_url",
            "video_url": {
                "url": f"data:video/mp4;base64,{video_base64}"
            },
        }

    if isinstance(video, str):
        # "https://" must be accepted explicitly: it does not start with
        # "http://", and would otherwise be mangled into "file://https://...".
        video_url = (video if video.startswith(
            ("http://", "https://", "file://")) else f"file://{video}")
        return {"type": "video_url", "video_url": {"url": video_url}}

    raise ValueError(
        f"Invalid video input {video}. Must be a string of local path/remote "
        "url, or a dictionary with raw video bytes in the form of "
        "`{'bytes': raw_video_bytes}`."
    )
378
+
379
+ # -----------------------------------------------------------------------------
380
+ # Random Dataset Implementation (Synthetic Data)
381
+ # -----------------------------------------------------------------------------
382
+
383
+
384
class RandomDataset(BenchmarkDataset):
    """
    Synthetic text-only dataset for serving/throughput benchmarks.

    How prompts are built:
    - Per-request input/output token lengths are drawn from integer-uniform
      ranges centred on the configured means; `range_ratio` sets the width.
    - A single random prefix of `prefix_len` tokens is shared by all
      requests of one `sample` call.
    - The remainder of each prompt is the deterministic sequence
      (offset + index + arange(input_len)) % vocab_size.
    - The token ids are decoded, re-encoded and truncated so the reported
      prompt length matches what the tokenizer actually produces.
    - All randomness comes from a numpy Generator seeded with `random_seed`.
    """
    # Default values copied from benchmark_serving.py for the random dataset.
    DEFAULT_PREFIX_LEN = 0
    DEFAULT_RANGE_RATIO = 0.0
    DEFAULT_INPUT_LEN = 1024
    DEFAULT_OUTPUT_LEN = 128

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        # A private Generator keeps sampling independent of the global
        # `random` / `np.random` state; do not reseed those in this class.
        self._rng = np.random.default_rng(self.random_seed)

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        prefix_len: int = DEFAULT_PREFIX_LEN,
        range_ratio: float = DEFAULT_RANGE_RATIO,
        input_len: int = DEFAULT_INPUT_LEN,
        output_len: int = DEFAULT_OUTPUT_LEN,
        batchsize: int = 1,
        **kwargs,
    ) -> list[SampleRequest]:
        """
        Build `num_requests` synthetic requests.

        When `batchsize` is greater than 1 the individual requests are
        grouped into chunks of `batchsize`; each chunk becomes a single
        request whose prompt is a list of prompts and whose expected output
        length is 0 (only used for the embeddings benchmark).
        """
        # NOTE: the lengths/offsets are drawn before the prefix so the
        # sequence of RNG draws stays fixed.
        input_lens, output_lens, offsets = self.get_sampling_params(
            num_requests, range_ratio, input_len, output_len, tokenizer
        )

        # The prefix is generated once and shared by every request.
        prefix_token_ids = self.get_prefix(tokenizer, prefix_len)
        vocab_size = tokenizer.vocab_size

        requests = []
        for req_idx in range(num_requests):
            text, n_tokens = self.generate_token_sequence(
                tokenizer=tokenizer,
                prefix_token_ids=prefix_token_ids,
                prefix_len=prefix_len,
                vocab_size=vocab_size,
                input_len=int(input_lens[req_idx]),
                offset=int(offsets[req_idx]),
                index=req_idx,
            )
            single = SampleRequest(
                prompt=text,
                prompt_len=n_tokens,
                expected_output_len=int(output_lens[req_idx]),
                request_id=request_id_prefix + str(req_idx),
            )
            requests.append(single)

        if batchsize <= 1:
            return requests

        # only used for embeddings benchmark.
        grouped = []
        for start in range(0, num_requests, batchsize):
            chunk = requests[start:start + batchsize]
            merged = SampleRequest(
                prompt=[member.prompt for member in chunk],
                prompt_len=sum(member.prompt_len for member in chunk),
                expected_output_len=0,
                request_id=request_id_prefix + str(start // batchsize),
            )
            grouped.append(merged)
        return grouped

    def get_prefix(
        self, tokenizer: PreTrainedTokenizerBase, prefix_len: int
    ) -> list[int]:
        """
        Return `prefix_len` random token ids, or an empty list when
        `prefix_len` is not positive (no RNG draw happens in that case).
        """
        if prefix_len > 0:
            drawn = self._rng.integers(
                0, tokenizer.vocab_size, size=prefix_len)
            return drawn.tolist()
        return []

    def get_sampling_params(
        self,
        num_requests: int,
        range_ratio: float,
        input_len: int,
        output_len: int,
        tokenizer: PreTrainedTokenizerBase,
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Draw per-request input lengths, output lengths and vocab offsets.

        Returns three arrays of size `num_requests`, in that order.

        Raises:
            ValueError: If `range_ratio` is outside [0, 1) or a computed
                sampling interval is empty.
        """
        # Enforce range_ratio < 1
        if not (0.0 <= range_ratio < 1.0):
            raise ValueError("range_ratio must be in [0, 1).")

        # Special tokens the tokenizer would add are subtracted from the
        # requested input length.
        special_count = int(tokenizer.num_special_tokens_to_add())
        real_input_len = max(0, int(input_len) - special_count)

        # Lower bounds are floored, upper bounds are ceiled.
        input_low = math.floor(real_input_len * (1 - range_ratio))
        input_high = math.ceil(real_input_len * (1 + range_ratio))
        # The output lower bound is clamped to 1 so that 0 output tokens
        # can never be sampled.
        output_low = max(math.floor(output_len * (1 - range_ratio)), 1)
        output_high = math.ceil(output_len * (1 + range_ratio))

        if input_low > input_high:
            raise ValueError(
                "Invalid input sampling interval: "
                f"low={input_low} > high={input_high}"
            )
        if output_low > output_high:
            raise ValueError(
                "Invalid output sampling interval: "
                f"low={output_low} > high={output_high}"
            )

        logger.info(
            "Sampling input_len from [%s, %s] and output_len from [%s, %s]",
            input_low,
            input_high,
            output_low,
            output_high,
        )

        # `integers` excludes the upper bound, hence the +1 for closed
        # intervals.
        input_lens = self._rng.integers(
            input_low, input_high + 1, size=num_requests)
        output_lens = self._rng.integers(
            output_low, output_high + 1, size=num_requests)
        offsets = self._rng.integers(
            0, tokenizer.vocab_size, size=num_requests)
        return input_lens, output_lens, offsets

    def generate_token_sequence(
        self,
        *,
        tokenizer: PreTrainedTokenizerBase,
        prefix_token_ids: list[int],
        prefix_len: int,
        vocab_size: int,
        input_len: int,
        offset: int,
        index: int,
    ) -> tuple[str, int]:
        """
        Returns (prompt, total_input_len).

        NOTE: A decoded run of N token ids does not always tokenize back
        into N tokens. For example for GPT2Tokenizer:
        [6880, 6881] -> ['Ġcalls', 'here'] ->
        [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
        To keep the prompt length under control the decoded text is
        re-encoded, truncated to the target length, and decoded again.
        """
        # Consecutive vocab ids starting at (offset + index), wrapped
        # around the vocabulary size.
        positions = np.arange(input_len)
        body_ids = ((offset + index + positions) % vocab_size).tolist()

        first_pass_text = tokenizer.decode(prefix_token_ids + body_ids)
        target_len = prefix_len + int(input_len)

        # Re-encode without special tokens and cut to the target length;
        # the returned length is whatever survived the truncation.
        kept_ids = tokenizer.encode(
            first_pass_text, add_special_tokens=False)[:target_len]
        return tokenizer.decode(kept_ids), len(kept_ids)
571
+
572
+
573
+ # -----------------------------------------------------------------------------
574
+ # MultiModalDataset Implementation
575
+ # -----------------------------------------------------------------------------
576
+
577
class RandomMultiModalDataset(RandomDataset):
    """
    Synthetic multimodal dataset (text + images) that extends RandomDataset.

    Status:
    - Images: supported via synthetic RGB data.
    - Video: not yet supported (TODO: implement video generation method).
    - Audio: not yet supported.

    Sampling overview:
    1) The number of items per request is sampled uniformly from the integer
       range [floor(n*(1-r)), ceil(n*(1+r))], where n is the base count and
       r is `num_mm_items_range_ratio` in [0, 1]. r=0 keeps it fixed; r=1
       allows 0. The maximum is further clamped to the sum of per-modality
       limits.
    2) Each item's modality and shape is sampled from `bucket_config`, a
       dict mapping (height, width, num_frames) -> probability.
       `num_frames`=1 is treated as an image and `num_frames` > 1 as a
       video. Entries with zero probability are removed and the rest are
       renormalized to sum to 1.
    3) Per-modality hard caps are enforced via `limit_mm_per_prompt`.
       When a modality reaches its cap, all of its buckets are excluded and
       the remaining probabilities are renormalized.

    Example bucket configuration:
    {(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.1}
      - Two image buckets (`num_frames`=1) and one video bucket
        (`num_frames`=16).
    OBS.: Only image sampling is supported for now.
    """

    IS_MULTIMODAL = True
    # NOTE: video sampling is WIP. Setting it to 0.
    DEFAULT_LIMIT_MM_PER_PROMPT = {"image": 255, "video": 0}

    DEFAULT_BASE_ITEMS_PER_REQUEST = 1
    DEFAULT_NUM_MM_ITEMS_RANGE_RATIO = 0.0
    DEFAULT_MM_ITEM_BUCKET_CONFIG = {
        (256, 256, 1): 0.5,
        (720, 1280, 1): 0.5,
        (720, 1280, 16): 0.0,
    }
    DEFAULT_ENABLE_MULTIMODAL_CHAT = False

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)

    def generate_synthetic_image(self, width: int, height: int) -> Image.Image:
        """Generate a synthetic PIL image with random RGB values.

        NOTE: iid pixel sampling gives worst-case compression (good for
        stressing I/O) but is very unlike real photos. A "low-freq" mode
        (e.g., noise blur) could be considered to emulate network realism
        instead of max stress.
        """
        shape = (height, width, 3)
        pixels = self._rng.integers(0, 256, shape, dtype=np.uint8)
        return Image.fromarray(pixels)

    def generate_synthetic_video(self, width: int,
                                 height: int,
                                 num_frames: int) -> Any:
        """Generate synthetic video with random values.

        TODO: Finish this method.
        """
        raise NotImplementedError("Video sampling is WIP.")

    def map_config_to_modality(self, config: tuple[int, int, int]) -> str:
        """Map a bucket key to its modality using the last entry
        (number of frames): 1 is an image, more than 1 is a video."""
        num_frames = config[-1]
        if num_frames == 1:
            return "image"
        if num_frames > 1:
            return "video"
        raise ValueError(f"Invalid multimodal item configuration: {config}")

    def normalize_bucket_config(self, bucket_config: dict[tuple[int, int, int],
                                float]) -> dict[tuple[int, int, int], float]:
        """
        Drop zero-probability buckets and rescale the remaining
        probabilities so that they sum to 1.

        Raises:
            ValueError: If any value is negative or nothing is left after
                dropping the zero entries.
        """
        for weight in bucket_config.values():
            if weight < 0:
                raise ValueError("Bucket config values must be non-negative.")

        positive = {
            key: weight
            for key, weight in bucket_config.items()
            if weight > 0
        }
        if not positive:
            raise ValueError("Got invalid bucket config. "
                             "Bucket config values must be non-zero.")

        total = sum(positive.values())
        return {key: weight / total for key, weight in positive.items()}

    def generate_mm_item(self,
                         mm_item_config: tuple[int, int, int],
                         ) -> Mapping[str, Any]:
        """
        Create a synthetic image or video for the given
        (height, width, num_frames) bucket and pass it through
        process_image/process_video respectively.
        This follows the OpenAI API chat completions
        https://github.com/openai/openai-python
        """
        modality = self.map_config_to_modality(mm_item_config)
        height, width = mm_item_config[0], mm_item_config[1]

        if modality == "image":
            return process_image(self.generate_synthetic_image(width, height))
        if modality == "video":
            return process_video(self.generate_synthetic_video(
                width, height, mm_item_config[2]))
        raise ValueError(f"Invalid multimodal item configuration: "
                         f"{mm_item_config}")

    def get_mm_item_sampling_params(
        self,
        base_items_per_request: int,
        num_mm_items_range_ratio: float,
        limit_mm_per_prompt: dict[str, int],
        bucket_config: dict[tuple[int, int, int], float],
    ) -> tuple[int, int, dict[str, int], dict[tuple[int, int, int], float]]:
        """
        Validate the multimodal sampling inputs and derive the effective
        parameters.

        Returns:
            (min_num_mm_items, max_num_mm_items, limit_mm_per_prompt
            restricted to modalities with a non-zero bucket, normalized
            bucket_config).
        """
        # Enforce num_mm_items_range_ratio <= 1
        if not (0.0 <= num_mm_items_range_ratio <= 1.0):
            raise ValueError("num_mm_items_range_ratio must be in [0, 1].")

        # Every modality mentioned in the bucket config (even with zero
        # probability) must have a limit entry.
        for cfg in bucket_config:
            modality = self.map_config_to_modality(cfg)
            if modality not in limit_mm_per_prompt:
                raise ValueError(f"Modality {modality} is not in "
                                 f"limit_mm_per_prompt: "
                                 f"{limit_mm_per_prompt.keys()}")

        normalized = self.normalize_bucket_config(bucket_config)
        logger.info(
            "Normalized bucket config: %s", normalized,
        )

        # Keep only the limits of modalities that can actually be sampled.
        active_modalities = {
            self.map_config_to_modality(cfg) for cfg in normalized
        }
        active_limits = {
            name: cap
            for name, cap in limit_mm_per_prompt.items()
            if name in active_modalities
        }
        if not active_limits:
            raise ValueError("No valid limits for modalities present in "
                             "bucket_config.")

        logger.info(
            "Updated mm-limit-per-prompt: %s", active_limits,
        )

        # The upper bound can never exceed the sum of the per-modality caps;
        # the lower bound is never below 0.
        upper = math.ceil(
            base_items_per_request * (1 + num_mm_items_range_ratio))
        max_num_mm_items = min(sum(active_limits.values()), upper)
        lower = math.floor(
            base_items_per_request * (1 - num_mm_items_range_ratio))
        min_num_mm_items = max(0, lower)

        if min_num_mm_items > max_num_mm_items:
            raise ValueError(f"Min num mm items is greater than max mm items: "
                             f"{min_num_mm_items} > {max_num_mm_items}")

        logger.info(
            "Sampling number of multimodal items from [%s, %s]",
            min_num_mm_items, max_num_mm_items,
        )

        return min_num_mm_items, max_num_mm_items, active_limits, normalized

    def get_mm_item_iterator(
        self,
        min_num_mm_items: int,
        max_num_mm_items: int,
        bucket_config: dict[tuple[int, int, int], float],
        limit_mm_per_prompt: dict[str, int],
    ) -> Iterator[tuple[int, int, int]]:
        """
        Yield the bucket configs of the multimodal items for one request.

        The number of items is drawn between min_num_mm_items and
        max_num_mm_items (inclusive). Buckets are then sampled until that
        many items were yielded, or until every modality hit its
        per-prompt limit.

        Note:
            - This function works on a per-request shallow copy of
              `bucket_config` (tuple->float). The dict passed in is not
              mutated.
        """
        target_count = int(
            self._rng.integers(min_num_mm_items, max_num_mm_items + 1)
        )
        # Nothing to yield for this request.
        if target_count == 0:
            return

        per_modality = {
            self.map_config_to_modality(cfg): 0 for cfg in bucket_config
        }
        remaining = bucket_config.copy()

        while sum(per_modality.values()) < target_count:
            picked = self._rng.choice(list(remaining.keys()),
                                      p=list(remaining.values()))
            modality = self.map_config_to_modality(picked)

            if per_modality[modality] < limit_mm_per_prompt[modality]:
                per_modality[modality] += 1
                yield picked
                continue

            # The modality is at its cap: zero out all of its buckets.
            for cfg in remaining:
                if self.map_config_to_modality(cfg) == modality:
                    remaining[cfg] = 0

            # This should not happen as the target count is at most
            # the sum of limit_mm_per_prompt for all modalities.
            if all(weight == 0 for weight in remaining.values()):
                logger.warning("Exhausted all multimodal items "
                               "of modality %s",
                               modality)
                break

            remaining = self.normalize_bucket_config(remaining)

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        prefix_len: int = RandomDataset.DEFAULT_PREFIX_LEN,
        range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO,
        input_len: int = RandomDataset.DEFAULT_INPUT_LEN,
        output_len: int = RandomDataset.DEFAULT_OUTPUT_LEN,
        limit_mm_per_prompt: dict[str, int] = DEFAULT_LIMIT_MM_PER_PROMPT,
        base_items_per_request: int = DEFAULT_BASE_ITEMS_PER_REQUEST,
        num_mm_items_range_ratio: float = DEFAULT_NUM_MM_ITEMS_RANGE_RATIO,
        bucket_config: dict[tuple[int, int, int], float] =
        DEFAULT_MM_ITEM_BUCKET_CONFIG,
        enable_multimodal_chat: bool = DEFAULT_ENABLE_MULTIMODAL_CHAT,
        **kwargs,
    ) -> list[SampleRequest]:
        """
        Build `num_requests` requests, each made of a synthetic text prompt
        plus a sampled list of synthetic multimodal items.

        Raises:
            NotImplementedError: If any video bucket has a non-zero
                probability (video sampling is WIP).
        """
        # NOTE: Video sampling is WIP. Reject video buckets with non-zero
        # probability up front.
        for cfg, prob in bucket_config.items():
            if self.map_config_to_modality(cfg) == "video" and prob > 0:
                raise NotImplementedError("Video sampling not implemented; "
                                          "set its probability to 0.")

        # Text sampling parameters come first, then the multimodal ones.
        input_lens, output_lens, offsets = self.get_sampling_params(
            num_requests, range_ratio, input_len, output_len, tokenizer
        )
        mm_params = self.get_mm_item_sampling_params(
            base_items_per_request,
            num_mm_items_range_ratio,
            limit_mm_per_prompt,
            bucket_config,
        )
        min_items, max_items, active_limits, normalized_buckets = mm_params

        # The prefix is generated once and shared by every request.
        prefix_token_ids = self.get_prefix(tokenizer, prefix_len)
        vocab_size = tokenizer.vocab_size

        mm_requests = []
        for req_idx in range(num_requests):
            prompt, total_input_len = self.generate_token_sequence(
                tokenizer=tokenizer,
                prefix_token_ids=prefix_token_ids,
                prefix_len=prefix_len,
                vocab_size=vocab_size,
                input_len=int(input_lens[req_idx]),
                offset=int(offsets[req_idx]),
                index=req_idx,
            )

            # Sample and materialize the multimodal items of this request.
            item_configs = self.get_mm_item_iterator(
                min_items,
                max_items,
                normalized_buckets,
                active_limits,
            )
            mm_content = cast(list[dict[str, Any]], [
                self.generate_mm_item(item_cfg) for item_cfg in item_configs
            ])

            if enable_multimodal_chat:
                # NOTE: For now this option is only provided for
                # completeness given that the serve.py benchmark currently
                # does not use it. The items are folded into the prompt, so
                # no separate multimodal data is attached.
                request_prompt: Any = self.apply_multimodal_chat_transformation(
                    prompt, mm_content)
                request_mm_data = None
            else:
                request_prompt = prompt
                request_mm_data = mm_content

            mm_requests.append(
                SampleRequest(
                    prompt=request_prompt,
                    prompt_len=total_input_len,
                    expected_output_len=int(output_lens[req_idx]),
                    multi_modal_data=request_mm_data,
                    request_id=request_id_prefix + str(req_idx),
                ))
        return mm_requests
932
+
933
+ # -----------------------------------------------------------------------------
934
+ # ShareGPT Dataset Implementation
935
+ # -----------------------------------------------------------------------------
936
+
937
+
938
class ShareGPTDataset(BenchmarkDataset):
    """
    ShareGPT dataset: reads conversations from a JSON file and turns the
    first two turns of each conversation into a sample request.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        """
        Load the JSON file at `dataset_path`, keep only the entries with at
        least two conversation turns, and shuffle them using `random_seed`.

        Raises:
            ValueError: If `dataset_path` is None.
        """
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        with open(self.dataset_path, encoding="utf-8") as handle:
            self.data = json.load(handle)

        # A prompt and a completion are both needed, hence two turns.
        kept = []
        for record in self.data:
            if "conversations" in record and len(
                    record["conversations"]) >= 2:
                kept.append(record)
        self.data = kept

        # NOTE: this seeds the global `random` module.
        random.seed(self.random_seed)
        random.shuffle(self.data)

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        lora_path: Optional[str] = None,
        max_loras: Optional[int] = None,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """
        Collect up to `num_requests` valid samples from the loaded data.

        The first turn is the prompt and the second the completion. When
        `output_len` is given it replaces the completion's token count and
        the minimum-output-length check is skipped. Entries with an "image"
        or "video" field get the matching multimodal content attached.
        """
        samples: list = []
        for entry in self.data:
            if len(samples) >= num_requests:
                break

            turns = entry["conversations"]
            prompt = turns[0]["value"]
            completion = turns[1]["value"]

            lora_request, tokenizer = self.get_random_lora_request(
                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)

            prompt_len = len(tokenizer(prompt).input_ids)
            completion_len = len(tokenizer(completion).input_ids)
            if output_len is None:
                new_output_len = completion_len
            else:
                new_output_len = output_len

            is_valid = is_valid_sequence(
                prompt_len,
                new_output_len,
                skip_min_output_len_check=output_len is not None)
            if not is_valid:
                continue

            # An image takes precedence over a video when both are present.
            mm_content = None
            if image_path := entry.get("image"):
                mm_content = process_image(image_path)
            elif video_path := entry.get("video"):
                mm_content = process_video(video_path)

            if enable_multimodal_chat:
                prompt = self.apply_multimodal_chat_transformation(
                    prompt, mm_content)

            # The id suffix is the number of samples accepted so far.
            accepted = SampleRequest(
                prompt=prompt,
                prompt_len=prompt_len,
                expected_output_len=new_output_len,
                lora_request=lora_request,
                multi_modal_data=mm_content,
                request_id=request_id_prefix + str(len(samples)),
            )
            samples.append(accepted)

        self.maybe_oversample_requests(samples,
                                       num_requests,
                                       request_id_prefix,
                                       no_oversample)
        return samples
1020
+
1021
+
1022
def add_dataset_parser(parser: FlexibleArgumentParser):
    """
    Register the dataset-related command line options on `parser`.

    Adds the general options (--seed, --num-prompts, --dataset-name,
    --no-stream, --dataset-path, --no-oversample) followed by one argument
    group per dataset family (custom, spec bench, sonnet, sharegpt,
    blazedit, random, random multimodal, hf, prefix repetition).

    The parser is modified in place; nothing is returned.
    """
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument(
        "--num-prompts",
        type=int,
        default=1000,
        help="Number of prompts to process.",
    )
    parser.add_argument(
        "--dataset-name",
        type=str,
        default="random",
        choices=[
            "sharegpt", "burstgpt", "sonnet", "random", "random-mm", "hf",
            "custom", "prefix_repetition", "spec_bench"
        ],
        help="Name of the dataset to benchmark on.",
    )
    parser.add_argument(
        "--no-stream",
        action="store_true",
        help="Do not load the dataset in streaming mode.",
    )
    parser.add_argument(
        "--dataset-path",
        type=str,
        default=None,
        help="Path to the sharegpt/sonnet dataset. "
        "Or the huggingface dataset ID if using HF dataset.",
    )
    parser.add_argument(
        "--no-oversample",
        action="store_true",
        help="Do not oversample if the dataset has "
        "fewer samples than num-prompts.",
    )

    # group for dataset specific arguments
    custom_group = parser.add_argument_group("custom dataset options")
    custom_group.add_argument(
        "--custom-output-len",
        type=int,
        default=256,
        help=
        "Number of output tokens per request, used only for custom dataset.",
    )
    custom_group.add_argument(
        "--custom-skip-chat-template",
        action="store_true",
        help=
        "Skip applying chat template to prompt, used only for custom dataset.",
    )

    spec_bench_group = parser.add_argument_group("spec bench dataset options")
    spec_bench_group.add_argument(
        "--spec-bench-output-len",
        type=int,
        default=256,
        help=
        "Num of output tokens per request, used only for spec bench dataset.",
    )
    spec_bench_group.add_argument(
        "--spec-bench-category",
        type=str,
        default=None,
        help=
        "Category for spec bench dataset. If None, use all categories.",
    )

    sonnet_group = parser.add_argument_group("sonnet dataset options")
    sonnet_group.add_argument(
        "--sonnet-input-len",
        type=int,
        default=550,
        help=
        "Number of input tokens per request, used only for sonnet dataset.",
    )
    sonnet_group.add_argument(
        "--sonnet-output-len",
        type=int,
        default=150,
        help=
        "Number of output tokens per request, used only for sonnet dataset.",
    )
    sonnet_group.add_argument(
        "--sonnet-prefix-len",
        type=int,
        default=200,
        help=
        "Number of prefix tokens per request, used only for sonnet dataset.",
    )

    sharegpt_group = parser.add_argument_group("sharegpt dataset options")
    sharegpt_group.add_argument(
        "--sharegpt-output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the output length "
        "from the ShareGPT dataset.",
    )

    blazedit_group = parser.add_argument_group("blazedit dataset options")
    blazedit_group.add_argument(
        "--blazedit-min-distance",
        type=float,
        default=0.0,
        help=
        "Minimum distance for blazedit dataset. Min: 0, Max: 1.0",
    )
    blazedit_group.add_argument(
        "--blazedit-max-distance",
        type=float,
        default=1.0,
        help=
        "Maximum distance for blazedit dataset. Min: 0, Max: 1.0",
    )

    random_group = parser.add_argument_group("random dataset options")
    random_group.add_argument(
        "--random-input-len",
        type=int,
        default=1024,
        help=
        "Number of input tokens per request, used only for random sampling.",
    )
    random_group.add_argument(
        "--random-output-len",
        type=int,
        default=128,
        help=
        "Number of output tokens per request, used only for random sampling.",
    )
    random_group.add_argument(
        "--random-range-ratio",
        type=float,
        default=0.0,
        # Adjacent literals are concatenated verbatim, so each fragment
        # must carry its own trailing space.
        help="Range ratio for sampling input/output length, "
        "used only for random sampling. Must be in the range [0, 1) to define "
        "a symmetric sampling range "
        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
    )
    random_group.add_argument(
        "--random-prefix-len",
        type=int,
        default=0,
        help=("Number of fixed prefix tokens before the random context "
              "in a request. "
              "The total input length is the sum of `random-prefix-len` and "
              "a random "
              "context length sampled from [input_len * (1 - range_ratio), "
              "input_len * (1 + range_ratio)]."),
    )
    random_group.add_argument(
        "--random-batch-size",
        type=int,
        default=1,
        help=("Batch size for random sampling. "
              "Only used for embeddings benchmark."),
    )

    # random multimodal dataset options
    random_mm_group = parser.add_argument_group(
        "random multimodal dataset options extended from random dataset")
    random_mm_group.add_argument(
        "--random-mm-base-items-per-request",
        type=int,
        default=RandomMultiModalDataset.DEFAULT_BASE_ITEMS_PER_REQUEST,
        help=(
            "Base number of multimodal items per request for random-mm. "
            "Actual per-request count is sampled around this base using "
            "--random-mm-num-mm-items-range-ratio."
        ),
    )
    random_mm_group.add_argument(
        "--random-mm-num-mm-items-range-ratio",
        type=float,
        default=RandomMultiModalDataset.DEFAULT_NUM_MM_ITEMS_RANGE_RATIO,
        help=(
            "Range ratio r in [0, 1] for sampling items per request. "
            "We sample uniformly from the closed integer range "
            "[floor(n*(1-r)), ceil(n*(1+r))] "
            "where n is the base items per request. "
            "r=0 keeps it fixed; r=1 allows 0 items. The maximum is clamped "
            "to the sum of per-modality limits from "
            "--random-mm-limit-mm-per-prompt. "
            "An error is raised if the computed min exceeds the max."
        ),
    )
    random_mm_group.add_argument(
        "--random-mm-limit-mm-per-prompt",
        type=json.loads,
        default=RandomMultiModalDataset.DEFAULT_LIMIT_MM_PER_PROMPT,
        help=(
            "Per-modality hard caps for items attached per request, e.g. "
            "'{\"image\": 3, \"video\": 0}'. The sampled per-request item "
            "count is clamped to the sum of these limits. When a modality "
            "reaches its cap, its buckets are excluded and probabilities are "
            "renormalized. "
            "OBS.: Only image sampling is supported for now."
        ),
    )

    def _parse_mm_bucket_config(v: object) -> dict[tuple[int, int, int], float]:
        """
        Convert a bucket config given as a dict or as a Python-literal
        string into a dict keyed by (H, W, T) int tuples with float values.

        Only ValueError is raised for bad input so that argparse reports a
        usage error instead of a traceback.
        """

        # If already a dict (e.g., programmatic call), normalize keys
        def normalize(d: dict) -> dict[tuple[int, int, int], float]:
            out: dict[tuple[int, int, int], float] = {}
            for k, val in d.items():
                key = k
                if isinstance(key, str):
                    # A string key such as "(256, 256, 1)" is parsed on a
                    # best-effort basis; if that fails the check below
                    # rejects it.
                    with suppress(Exception):
                        key = ast.literal_eval(key)
                if not (isinstance(key, tuple) and len(key) == 3
                        and all(isinstance(x, int) for x in key)):
                    raise ValueError(
                        f"Invalid bucket key {k!r}. Expected tuple (H, W, T)."
                    )
                out[(int(key[0]), int(key[1]), int(key[2]))] = float(val)
            return out

        if isinstance(v, dict):
            return normalize(v)
        if isinstance(v, str):
            # Python literal (supports tuple keys). literal_eval may raise
            # SyntaxError or TypeError on malformed text; argparse only
            # turns ValueError/TypeError/ArgumentTypeError into a usage
            # error, so everything is funnelled into ValueError.
            try:
                parsed = ast.literal_eval(v)
            except (SyntaxError, TypeError, ValueError) as err:
                raise ValueError(
                    f"Could not parse bucket config {v!r} as a Python "
                    "literal.") from err
            if not isinstance(parsed, dict):
                raise ValueError("Bucket config must parse to a dict.")
            return normalize(parsed)
        raise ValueError("Unsupported value for --random-mm-bucket-config.")

    random_mm_group.add_argument(
        "--random-mm-bucket-config",
        type=_parse_mm_bucket_config,
        default=RandomMultiModalDataset.DEFAULT_MM_ITEM_BUCKET_CONFIG,
        help=(
            "The bucket config is a dictionary mapping a multimodal item "
            "sampling configuration to a probability. "
            "Currently allows for 2 modalities: images and videos. "
            "A bucket key is a tuple of (height, width, num_frames). "
            "The value is the probability of sampling that specific item. "
            "Example: "
            "--random-mm-bucket-config "
            "{(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.10} "
            "First item: images with resolution 256x256 w.p. 0.5. "
            "Second item: images with resolution 720x1280 w.p. 0.4. "
            "Third item: videos with resolution 720x1280 and 16 frames "
            "w.p. 0.1. "
            "OBS.: If the probabilities do not sum to 1, they are normalized. "
            "OBS bis.: Only image sampling is supported for now."
        ),
    )

    hf_group = parser.add_argument_group("hf dataset options")
    hf_group.add_argument("--hf-subset",
                          type=str,
                          default=None,
                          help="Subset of the HF dataset.")
    hf_group.add_argument("--hf-split",
                          type=str,
                          default=None,
                          help="Split of the HF dataset.")
    hf_group.add_argument(
        "--hf-name",
        type=str,
        default=None,
        help=(
            "Name of the dataset on HuggingFace "
            "(e.g., 'lmarena-ai/VisionArena-Chat'). "
            "Specify this if your dataset-path is a local path."
        ),
    )
    hf_group.add_argument(
        "--hf-output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the output lengths "
        "from the sampled HF dataset.",
    )

    prefix_repetition_group = parser.add_argument_group(
        "prefix repetition dataset options")
    prefix_repetition_group.add_argument(
        "--prefix-repetition-prefix-len",
        type=int,
        default=256,
        help="Number of prefix tokens per request, used only for prefix "
        "repetition dataset.",
    )
    prefix_repetition_group.add_argument(
        "--prefix-repetition-suffix-len",
        type=int,
        default=256,
        help="Number of suffix tokens per request, used only for prefix "
        "repetition dataset. Total input length is prefix_len + suffix_len.",
    )
    prefix_repetition_group.add_argument(
        "--prefix-repetition-num-prefixes",
        type=int,
        default=10,
        help="Number of prefixes to generate, used only for prefix repetition "
        "dataset. Prompts per prefix is num_requests // num_prefixes.",
    )
    prefix_repetition_group.add_argument(
        "--prefix-repetition-output-len",
        type=int,
        default=128,
        help="Number of output tokens per request, used only for prefix "
        "repetition dataset.",
    )
1329
+
1330
+
1331
def get_samples(args, tokenizer) -> list[SampleRequest]:
    """Create the benchmark requests for the dataset selected on the CLI.

    The dataset is chosen by ``args.dataset_name``; the matching dataset class
    is built from ``args`` and its ``sample`` method is called.

    Note that this function mutates ``args``: ``request_id_prefix`` is set to
    ``""`` when it is missing, and for most recognised HuggingFace datasets
    ``hf_split`` (and, for VisionArena, ``hf_subset``) is overwritten.

    Args:
        args: Parsed command-line namespace holding the dataset options.
        tokenizer: Tokenizer handed to the dataset's ``sample`` method.

    Returns:
        Whatever the selected dataset's ``sample`` call returns.

    Raises:
        ValueError: If the dataset name is unknown, the HuggingFace dataset
            path is unsupported, or a multimodal dataset is combined with an
            incompatible ``args.endpoint_type``.
    """
    if not hasattr(args, "request_id_prefix"):
        args.request_id_prefix = ""

    if args.dataset_name == "custom":
        dataset = CustomDataset(dataset_path=args.dataset_path)
        input_requests = dataset.sample(
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
            output_len=args.custom_output_len,
            skip_chat_template=args.custom_skip_chat_template,
            request_id_prefix=args.request_id_prefix,
            no_oversample=args.no_oversample,
        )

    elif args.dataset_name == "sonnet":
        dataset = SonnetDataset(dataset_path=args.dataset_path)
        # For the "sonnet" dataset, formatting depends on the backend: the
        # chat endpoint receives the raw prompt, every other endpoint receives
        # the chat-template-formatted prompt.
        is_chat_endpoint = args.endpoint_type == "openai-chat"
        if not is_chat_endpoint:
            # getattr: do not assume every tokenizer object still exposes a
            # `default_chat_template` attribute.
            assert (tokenizer.chat_template
                    or getattr(tokenizer, "default_chat_template", None)), (
                "Tokenizer/model must have chat template for sonnet dataset.")
        input_requests = dataset.sample(
            num_requests=args.num_prompts,
            input_len=args.sonnet_input_len,
            output_len=args.sonnet_output_len,
            prefix_len=args.sonnet_prefix_len,
            tokenizer=tokenizer,
            return_prompt_formatted=not is_chat_endpoint,
            request_id_prefix=args.request_id_prefix,
            no_oversample=args.no_oversample,
        )

    elif args.dataset_name == "hf":
        # all following datasets are implemented from the
        # HuggingFaceDataset base class
        hf_kwargs = {}

        def _is_supported(dataset_cls) -> bool:
            # A dataset is recognised either by its hub path or, when
            # dataset_path is a local path, by the explicit --hf-name.
            supported = dataset_cls.SUPPORTED_DATASET_PATHS
            return (args.dataset_path in supported
                    or args.hf_name in supported)

        if _is_supported(VisionArenaDataset):
            dataset_class = VisionArenaDataset
            args.hf_split = "train"
            args.hf_subset = None
        elif _is_supported(InstructCoderDataset):
            dataset_class = InstructCoderDataset
            args.hf_split = "train"
        elif _is_supported(MTBenchDataset):
            dataset_class = MTBenchDataset
            args.hf_split = "train"
        elif _is_supported(ConversationDataset):
            dataset_class = ConversationDataset
        elif _is_supported(AIMODataset):
            dataset_class = AIMODataset
            args.hf_split = "train"
        elif _is_supported(NextEditPredictionDataset):
            dataset_class = NextEditPredictionDataset
            args.hf_split = "train"
        elif _is_supported(ASRDataset):
            dataset_class = ASRDataset
            args.hf_split = "train"
        elif _is_supported(BlazeditDataset):
            # Also honours --hf-name, consistent with the other branches.
            dataset_class = BlazeditDataset
            args.hf_split = "train"
            hf_kwargs = {
                "min_distance": args.blazedit_min_distance,
                "max_distance": args.blazedit_max_distance,
            }
        elif _is_supported(MLPerfDataset):
            dataset_class = MLPerfDataset
            args.hf_split = "train"
        else:
            supported_datasets = {
                dataset_name
                for cls in HuggingFaceDataset.__subclasses__()
                for dataset_name in cls.SUPPORTED_DATASET_PATHS
            }
            raise ValueError(
                f"Unsupported dataset path: {args.dataset_path}. "
                "Huggingface dataset only supports dataset_path"
                f" from one of following: {supported_datasets}. "
                "Please consider contributing if you would "
                "like to add support for additional dataset formats.")

        if dataset_class.IS_MULTIMODAL and args.endpoint_type not in [
                "openai-chat",
                "openai-audio",
        ]:
            # multi-modal benchmark is only available on the OpenAI Chat and
            # OpenAI Audio endpoint types.
            raise ValueError(
                "Multi-modal content is only supported on 'openai-chat' and "
                "'openai-audio' endpoint-type.")
        input_requests = dataset_class(
            dataset_path=args.dataset_path,
            dataset_subset=args.hf_subset,
            dataset_split=args.hf_split,
            random_seed=args.seed,
            no_stream=args.no_stream,
            hf_name=args.hf_name,
        ).sample(
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
            output_len=args.hf_output_len,
            request_id_prefix=args.request_id_prefix,
            no_oversample=args.no_oversample,
            **hf_kwargs
        )

    else:
        # For datasets that follow a similar structure, use a mapping. The
        # values are lambdas so that only the selected dataset is built.
        dataset_mapping = {
            "spec_bench":
            lambda: SpecBench(dataset_path=args.dataset_path,
                              category=args.spec_bench_category).sample(
                num_requests=args.num_prompts,
                tokenizer=tokenizer,
                output_len=args.spec_bench_output_len,
                request_id_prefix=args.request_id_prefix,
                no_oversample=args.no_oversample,
            ),
            "sharegpt": lambda: ShareGPTDataset(
                random_seed=args.seed, dataset_path=args.dataset_path
            ).sample(
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
                output_len=args.sharegpt_output_len,
                request_id_prefix=args.request_id_prefix,
                no_oversample=args.no_oversample,
            ),
            "burstgpt": lambda: BurstGPTDataset(
                random_seed=args.seed, dataset_path=args.dataset_path
            ).sample(
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
                request_id_prefix=args.request_id_prefix,
                no_oversample=args.no_oversample,
            ),
            "random": lambda: RandomDataset(
                random_seed=args.seed, dataset_path=args.dataset_path
            ).sample(
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
                prefix_len=args.random_prefix_len,
                input_len=args.random_input_len,
                output_len=args.random_output_len,
                range_ratio=args.random_range_ratio,
                request_id_prefix=args.request_id_prefix,
                batchsize=args.random_batch_size,
                no_oversample=args.no_oversample,
            ),
            "random-mm":
            lambda: RandomMultiModalDataset(
                random_seed=args.seed, dataset_path=args.dataset_path
            ).sample(
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
                prefix_len=args.random_prefix_len,
                range_ratio=args.random_range_ratio,
                input_len=args.random_input_len,
                output_len=args.random_output_len,
                base_items_per_request=args.random_mm_base_items_per_request,
                limit_mm_per_prompt=args.random_mm_limit_mm_per_prompt,
                num_mm_items_range_ratio=args.random_mm_num_mm_items_range_ratio,
                bucket_config=args.random_mm_bucket_config,
                request_id_prefix=args.request_id_prefix,
                no_oversample=args.no_oversample,
            ),
            "prefix_repetition":
            lambda: PrefixRepetitionRandomDataset(
                random_seed=args.seed, dataset_path=args.dataset_path
            ).sample(
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
                prefix_len=args.prefix_repetition_prefix_len,
                suffix_len=args.prefix_repetition_suffix_len,
                num_prefixes=args.prefix_repetition_num_prefixes,
                output_len=args.prefix_repetition_output_len,
                request_id_prefix=args.request_id_prefix,
                no_oversample=args.no_oversample,
            ),
        }

        # Check the name up front rather than catching KeyError around the
        # call: a KeyError raised while building or sampling the dataset must
        # not be misreported as an unknown dataset name.
        if args.dataset_name not in dataset_mapping:
            raise ValueError(f"Unknown dataset: {args.dataset_name}")

        # Enforce endpoint compatibility for multimodal datasets.
        if args.dataset_name == "random-mm" and args.endpoint_type not in [
                "openai-chat"]:
            raise ValueError(
                "Multi-modal content (images) is only supported on "
                "'openai-chat' backend."
            )
        input_requests = dataset_mapping[args.dataset_name]()

    return input_requests
1559
+
1560
+
1561
+ # -----------------------------------------------------------------------------
1562
+ # Custom Dataset Implementation
1563
+ # -----------------------------------------------------------------------------
1564
+
1565
+
1566
class CustomDataset(BenchmarkDataset):
    """
    Benchmark dataset backed by a user-supplied JSONL file.

    Each line of the file is a JSON object that must provide a "prompt"
    field, e.g.
    ```
    {"prompt": "What is the capital of India?"}
    {"prompt": "What is the capital of Iran?"}
    {"prompt": "What is the capital of China?"}
    ```
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        """Read the JSONL file into ``self.data`` and shuffle it.

        ``self.data`` ends up as a list of dicts, one per JSONL row (for
        example ``[{"prompt": "What is the capital of India?"}, ...]``). This
        is the standardized format that ``sample()`` relies on.

        Raises:
            ValueError: If ``dataset_path`` is ``None`` or the file has no
                'prompt' column.
            NotImplementedError: If the path does not end with ``.jsonl``.
        """
        path = self.dataset_path
        if path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        self.data = []

        # Only the JSONL file type is handled; reject anything else early.
        if not path.endswith(".jsonl"):
            raise NotImplementedError(
                "Only JSONL format is supported for CustomDataset.")

        frame = pd.read_json(path_or_buf=path, lines=True)
        if "prompt" not in frame.columns:
            raise ValueError("JSONL file must contain a 'prompt' column.")

        # One dict per DataFrame row.
        self.data.extend(record.to_dict() for _, record in frame.iterrows())

        # Seeds the module-level RNG before shuffling in place.
        random.seed(self.random_seed)
        random.shuffle(self.data)

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        lora_path: Optional[str] = None,
        max_loras: Optional[int] = None,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        skip_chat_template: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """Turn the first ``num_requests`` loaded rows into requests.

        Args:
            tokenizer: Used to apply the chat template (unless skipped) and
                to count prompt tokens.
            num_requests: Number of requests wanted; a value ``<= 0`` means
                "use every loaded row".
            lora_path: Not used by this implementation.
            max_loras: Not used by this implementation.
            output_len: Stored as ``expected_output_len`` on every request.
            enable_multimodal_chat: Not used by this implementation.
            skip_chat_template: When true, the raw prompt is used as-is.
            request_id_prefix: Prepended to the row index to form the id.
            no_oversample: Forwarded to ``maybe_oversample_requests``.

        Returns:
            The list of sampled requests.
        """
        self.num_available_samples = len(self.data)
        if num_requests <= 0:
            num_requests = self.num_available_samples
            logger.info("num_requests is set to 0 or negative, "
                        "so using all available samples: %d",
                        num_requests)

        sampled_requests = []
        # Every row yields exactly one request, so taking the first
        # `num_requests` rows caps the output at `num_requests`.
        for index, record in enumerate(self.data[:num_requests]):
            text = record["prompt"]

            if not skip_chat_template:
                messages = [{"role": "user", "content": text}]
                text = tokenizer.apply_chat_template(
                    messages,
                    add_generation_prompt=True,
                    tokenize=False,
                )

            request = SampleRequest(
                prompt=text,
                prompt_len=len(tokenizer(text).input_ids),
                expected_output_len=output_len,
                request_id=request_id_prefix + str(index),
            )
            sampled_requests.append(request)

        self.maybe_oversample_requests(sampled_requests, num_requests,
                                       request_id_prefix, no_oversample)

        return sampled_requests
1664
+
1665
+
1666
+ # -----------------------------------------------------------------------------
1667
+ # Spec Bench Dataset Implementation
1668
+ # -----------------------------------------------------------------------------
1669
+
1670
+
1671
class SpecBench(CustomDataset):
    """
    Implements the SpecBench dataset: https://github.com/hemingkx/Spec-Bench
    Download the dataset using:
    wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl

    Accepts an optional ``category`` keyword argument; when it is set, only
    rows whose 'category' field equals it are kept.
    """  # noqa: E501

    def __init__(self, **kwargs) -> None:
        # `category` must be set before the parent constructor runs:
        # CustomDataset.__init__ already calls self.load_data(), which reads
        # it. Calling load_data() again here would only re-read and
        # re-shuffle the same file, so it is not repeated.
        self.category = kwargs.pop("category", None)
        super().__init__(**kwargs)

    def load_data(self) -> None:
        """Load the JSONL file and keep the first turn of each row as prompt.

        ``self.data`` becomes a shuffled list of ``{"prompt": ...}`` dicts,
        optionally restricted to ``self.category``.

        Raises:
            ValueError: If ``dataset_path`` is ``None`` or the file has no
                'turns' column.
        """
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        self.data = []

        # Load the JSONL file
        jsonl_data = pd.read_json(path_or_buf=self.dataset_path,
                                  lines=True)

        # check if the JSONL file has a 'turns' column
        if "turns" not in jsonl_data.columns:
            raise ValueError("JSONL file must contain a 'turns' column.")

        for _, row in jsonl_data.iterrows():
            # sample only from a specific category if specified
            if (not self.category) or (self.category == row['category']):
                prompt = row["turns"][0]
                self.data.append({"prompt": prompt})

        random.seed(self.random_seed)
        random.shuffle(self.data)

    def sample(self, **kwargs) -> list:
        """Delegate to ``CustomDataset.sample`` with the chat template on.

        ``skip_chat_template`` is always forced to ``False``, overriding any
        value supplied by the caller.
        """
        kwargs["skip_chat_template"] = False
        return super().sample(**kwargs)
1710
+
1711
+
1712
+ # -----------------------------------------------------------------------------
1713
+ # Sonnet Dataset Implementation
1714
+ # -----------------------------------------------------------------------------
1715
+
1716
@deprecated(
    "SonnetDataset is deprecated and will be removed in a future version.",
)
class SonnetDataset(BenchmarkDataset):
    """
    Simplified implementation of the Sonnet dataset. Loads poem lines from a
    text file and generates sample requests. Default values here copied from
    `benchmark_serving.py` for the sonnet dataset.
    """

    DEFAULT_PREFIX_LEN = 200
    DEFAULT_INPUT_LEN = 550
    DEFAULT_OUTPUT_LEN = 150

    def __init__(
        self,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        """Read every line of the text file at ``dataset_path``.

        Raises:
            ValueError: If ``dataset_path`` is empty or ``None``.
        """
        if not self.dataset_path:
            raise ValueError("dataset_path must be provided.")
        with open(self.dataset_path, encoding="utf-8") as f:
            self.data = f.readlines()

    def sample(
        self,
        tokenizer,
        num_requests: int,
        prefix_len: int = DEFAULT_PREFIX_LEN,
        input_len: int = DEFAULT_INPUT_LEN,
        output_len: int = DEFAULT_OUTPUT_LEN,
        return_prompt_formatted: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """Build ``num_requests`` prompts out of randomly chosen poem lines.

        Every prompt starts with the same leading lines of the file (the
        shared prefix) followed by randomly drawn lines. Candidates whose
        chat-formatted token count exceeds ``input_len`` are discarded and
        redrawn.

        Args:
            tokenizer: Used for the chat template and for token counting.
            num_requests: Number of requests to produce.
            prefix_len: Target token length of the shared prefix.
            input_len: Upper bound on the formatted prompt's token count.
            output_len: Stored as ``expected_output_len`` on every request.
            return_prompt_formatted: Return the chat-formatted prompt rather
                than the raw one.
            request_id_prefix: Prepended to the running index to form the id.
            no_oversample: Not used by this implementation.

        Returns:
            The list of sampled requests.

        Raises:
            ValueError: If the dataset has no lines, or ``input_len`` does
                not exceed the token length of the bare base prompt.
        """
        # Guard the average below against an empty file, which would
        # otherwise surface as an unexplained ZeroDivisionError.
        if not self.data:
            raise ValueError(
                "Sonnet dataset is empty; cannot build prompts from "
                f"{self.dataset_path!r}.")

        # Calculate average token length for a poem line.
        line_lengths = [len(tokenizer(line).input_ids) for line in self.data]
        avg_len = sum(line_lengths) / len(line_lengths)

        # Build the base prompt.
        base_prompt = "Pick as many lines as you can from these poem lines:\n"
        base_msg = [{"role": "user", "content": base_prompt}]
        base_fmt = tokenizer.apply_chat_template(base_msg,
                                                 add_generation_prompt=True,
                                                 tokenize=False)
        base_offset = len(tokenizer(base_fmt).input_ids)
        if input_len <= base_offset:
            raise ValueError(
                f"'input_len' must be higher than the base prompt length "
                f"({base_offset}).")

        # Determine how many poem lines to use.
        num_input_lines = round((input_len - base_offset) / avg_len)
        num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
        prefix_lines = self.data[:num_prefix_lines]
        num_extra_lines = num_input_lines - num_prefix_lines

        samples = []
        ind = 0
        while len(samples) < num_requests:
            extra_lines = random.choices(self.data, k=num_extra_lines)
            prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
            msg = [{"role": "user", "content": prompt}]
            prompt_formatted = tokenizer.apply_chat_template(
                msg, add_generation_prompt=True, tokenize=False)
            prompt_len = len(tokenizer(prompt_formatted).input_ids)
            # Over-long candidates are dropped; the loop draws again.
            if prompt_len <= input_len:
                samples.append(
                    SampleRequest(
                        prompt=prompt_formatted
                        if return_prompt_formatted else prompt,
                        prompt_len=prompt_len,
                        expected_output_len=output_len,
                        request_id=request_id_prefix + str(ind),
                    ))
                ind += 1
        return samples
1798
+
1799
+
1800
+ # -----------------------------------------------------------------------------
1801
+ # BurstGPT Dataset Implementation
1802
+ # -----------------------------------------------------------------------------
1803
+
1804
+
1805
class BurstGPTDataset(BenchmarkDataset):
    """
    BurstGPT trace dataset read from a CSV file.

    Only rows whose "Model" is "GPT-4" and whose "Response tokens" value is
    positive are kept. Prompts are synthetic: they are decoded from generated
    token ids rather than taken from the file.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self):
        """Read the CSV and store the filtered DataFrame in ``self.data``.

        Raises:
            ValueError: If ``dataset_path`` is ``None``.
        """
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        frame = pd.read_csv(self.dataset_path)
        # Keep only the GPT-4 rows ...
        gpt4_rows = frame[frame["Model"] == "GPT-4"]
        # ... and among them drop failed requests (Response tokens <= 0).
        self.data = gpt4_rows[gpt4_rows["Response tokens"] > 0]

    def _sample_loaded_data(self, num_requests: int) -> list:
        """Draw ``num_requests`` rows and return them as a list of lists.

        Rows are drawn with replacement only when more rows are requested
        than the filtered data contains.
        """
        with_replacement = num_requests > len(self.data)
        picked = self.data.sample(
            n=num_requests,
            random_state=self.random_seed,
            replace=with_replacement,
        )
        return picked.values.tolist()

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        max_loras: Optional[int] = None,
        lora_path: Optional[str] = None,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list[SampleRequest]:
        """Create one synthetic request per drawn trace row.

        Args:
            tokenizer: Base tokenizer; it is rebound on every iteration to
                the tokenizer returned by ``get_random_lora_request``.
            num_requests: Number of requests to produce.
            max_loras: Forwarded to ``get_random_lora_request``.
            lora_path: Forwarded to ``get_random_lora_request``.
            request_id_prefix: Prepended to the row index to form the id.
            no_oversample: Not used by this implementation.

        Returns:
            A list with ``num_requests`` requests.
        """
        rows = self._sample_loaded_data(num_requests=num_requests)
        samples = []
        for i, row in enumerate(rows):
            # Columns are addressed by position; presumably index 2 holds the
            # request token count and index 3 the response token count --
            # TODO confirm against the CSV header.
            input_len = int(row[2])
            output_len = int(row[3])
            lora_req, tokenizer = self.get_random_lora_request(
                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
            vocab_size = tokenizer.vocab_size
            # Synthetic prompt: token ids (i + j) modulo vocab_size.
            token_ids = [(i + j) % vocab_size for j in range(input_len)]
            request = SampleRequest(
                prompt=tokenizer.decode(token_ids),
                prompt_len=input_len,
                expected_output_len=output_len,
                lora_request=lora_req,
                request_id=request_id_prefix + str(i),
            )
            samples.append(request)
        return samples
1872
+
1873
+
1874
+ # -----------------------------------------------------------------------------
1875
+ # HuggingFace Dataset Base Implementation
1876
+ # -----------------------------------------------------------------------------
1877
class HuggingFaceDataset(BenchmarkDataset):
    """Common base for benchmark datasets loaded through `load_dataset`."""

    # Subclasses override this with the dataset identifiers they accept:
    # either a plain set, or a dict mapping each identifier to a callable.
    SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()

    def __init__(
        self,
        dataset_path: str,
        dataset_split: str,
        no_stream: bool = False,
        dataset_subset: Optional[str] = None,
        hf_name: Optional[str] = None,
        **kwargs,
    ) -> None:
        """Store the loading options and load the data immediately.

        Args:
            dataset_path: Passed to the base class and to `load_dataset`.
            dataset_split: Split handed to `load_dataset`.
            no_stream: When true, streaming is turned off.
            dataset_subset: Passed to `load_dataset` as ``name``.
            hf_name: Dataset identifier; falls back to ``dataset_path`` when
                empty or ``None``.
            **kwargs: Forwarded to the base class constructor.
        """
        super().__init__(dataset_path=dataset_path, **kwargs)

        self.hf_name = hf_name if hf_name else dataset_path
        self.load_stream = not no_stream
        self.dataset_subset = dataset_subset
        self.dataset_split = dataset_split
        self.load_data()

    def load_data(self) -> None:
        """Load the dataset and shuffle it with ``self.random_seed``."""
        loaded = load_dataset(
            self.dataset_path,
            name=self.dataset_subset,
            split=self.dataset_split,
            streaming=self.load_stream,
        )
        self.data = loaded.shuffle(seed=self.random_seed)
1908
+
1909
+
1910
+ # -----------------------------------------------------------------------------
1911
+ # Conversation Dataset Implementation
1912
+ # -----------------------------------------------------------------------------
1913
+
1914
+
1915
class ConversationDataset(HuggingFaceDataset):
    """Dataset for conversation data with multimodal support."""
    SUPPORTED_DATASET_PATHS = {
        'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
    }
    IS_MULTIMODAL = True

    def sample(self,
               tokenizer: PreTrainedTokenizerBase,
               num_requests: int,
               output_len: Optional[int] = None,
               enable_multimodal_chat: bool = False,
               request_id_prefix: str = "",
               no_oversample: bool = False,
               **kwargs) -> list:
        """Sample requests from the first two turns of each conversation.

        The first turn's "value" is the prompt and the second turn's "value"
        is the reference completion.

        Args:
            tokenizer: Used to count prompt and completion tokens.
            num_requests: Maximum number of requests taken from the data.
            output_len: Fixed expected output length. When ``None``, the
                completion's token count is used per item and items rejected
                by ``is_valid_sequence`` are skipped.
            enable_multimodal_chat: Replace the prompt with the result of
                ``apply_multimodal_chat_transformation``.
            request_id_prefix: Prepended to the running index to form the id.
            no_oversample: Forwarded to ``maybe_oversample_requests``.

        Returns:
            The list of sampled requests.
        """
        # Filter examples with at least 2 conversations
        filtered_data = self.data.filter(
            lambda x: len(x["conversations"]) >= 2)
        sampled_requests = []
        ind = 0
        dynamic_output = output_len is None

        for item in filtered_data:
            if len(sampled_requests) >= num_requests:
                break
            conv = item["conversations"]
            prompt, completion = conv[0]["value"], conv[1]["value"]
            prompt_len = len(tokenizer(prompt).input_ids)

            if dynamic_output:
                # The completion length is only needed when no fixed
                # output_len was requested.
                completion_len = len(tokenizer(completion).input_ids)
                # Filter before asserting, so that a row with an empty
                # completion is skipped instead of aborting the whole run.
                # NOTE(review): is_valid_sequence is defined elsewhere;
                # presumably it rejects too-short completions -- confirm.
                if not is_valid_sequence(prompt_len, completion_len):
                    continue
                expected_len = completion_len
            else:
                expected_len = output_len
            assert isinstance(expected_len, int) and expected_len > 0

            mm_content = process_image(
                item["image"]) if "image" in item else None
            if enable_multimodal_chat:
                # Note: when chat is enabled the request prompt_len is no
                # longer accurate and we will be using request output to
                # count the actual prompt len and output len
                prompt = self.apply_multimodal_chat_transformation(
                    prompt, mm_content)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=expected_len,
                    multi_modal_data=mm_content,
                    request_id=request_id_prefix + str(ind),
                ))
            ind += 1
        self.maybe_oversample_requests(sampled_requests, num_requests,
                                       request_id_prefix, no_oversample)
        return sampled_requests
1972
+
1973
+
1974
+ # -----------------------------------------------------------------------------
1975
+ # Vision Arena Dataset Implementation
1976
+ # -----------------------------------------------------------------------------
1977
+
1978
+
1979
class VisionArenaDataset(HuggingFaceDataset):
    """
    Vision Arena dataset: one image plus a text prompt per request.
    """

    DEFAULT_OUTPUT_LEN = 128
    # Maps each supported dataset name to the function that pulls the prompt
    # text out of a dataset item.
    SUPPORTED_DATASET_PATHS = {
        "lmarena-ai/VisionArena-Chat":
        lambda x: x["conversation"][0][0]["content"],
        "lmarena-ai/vision-arena-bench-v0.1":
        lambda x: x["turns"][0][0]["content"]
    }
    IS_MULTIMODAL = True

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """Sample up to ``num_requests`` image+prompt requests.

        Args:
            tokenizer: Used to count the prompt's tokens.
            num_requests: Maximum number of requests taken from the data.
            output_len: Expected output length; ``DEFAULT_OUTPUT_LEN`` when
                ``None``.
            enable_multimodal_chat: Replace the prompt with the result of
                ``apply_multimodal_chat_transformation``.
            request_id_prefix: Prepended to the item index to form the id.
            no_oversample: Forwarded to ``maybe_oversample_requests``.

        Returns:
            The list of sampled requests.

        Raises:
            ValueError: If ``self.hf_name`` has no registered prompt parser
                (raised when the first item is processed).
        """
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN

        sampled_requests = []
        for index, entry in enumerate(self.data):
            if len(sampled_requests) >= num_requests:
                break

            extract_prompt = self.SUPPORTED_DATASET_PATHS.get(self.hf_name)
            if extract_prompt is None:
                raise ValueError(f"Unsupported dataset path: {self.hf_name}")

            text = extract_prompt(entry)
            image = process_image(entry["images"][0])
            # Token count is taken on the raw text, before any chat
            # transformation below.
            token_count = len(tokenizer(text).input_ids)
            if enable_multimodal_chat:
                # Note: when chat is enabled the request prompt_len is no
                # longer accurate and we will be using request output to
                # count the actual prompt len
                text = self.apply_multimodal_chat_transformation(text, image)

            request = SampleRequest(
                prompt=text,
                prompt_len=token_count,
                expected_output_len=output_len,
                multi_modal_data=image,
                request_id=request_id_prefix + str(index),
            )
            sampled_requests.append(request)

        self.maybe_oversample_requests(sampled_requests, num_requests,
                                       request_id_prefix, no_oversample)
        return sampled_requests
2032
+
2033
+
2034
+ # -----------------------------------------------------------------------------
2035
+ # Instruct Coder Dataset Implementation
2036
+ # -----------------------------------------------------------------------------
2037
+
2038
+
2039
class InstructCoderDataset(HuggingFaceDataset):
    """
    InstructCoder Dataset.
    https://huggingface.co/datasets/likaixin/InstructCoder

    InstructCoder is a dataset designed for general code editing. It consists
    of 114,239 instruction-input-output triplets and covers multiple distinct
    code editing scenarios.
    """

    DEFAULT_OUTPUT_LEN = 200  # this is the average default output length
    SUPPORTED_DATASET_PATHS = {
        "likaixin/InstructCoder",
    }

    def sample(self,
               tokenizer: PreTrainedTokenizerBase,
               num_requests: int,
               output_len: Optional[int] = None,
               enable_multimodal_chat: bool = False,
               request_id_prefix: str = "",
               no_oversample: bool = False,
               **kwargs) -> list:
        """Sample up to ``num_requests`` code-editing requests.

        Each prompt is the item's 'input' followed by its 'instruction' and a
        fixed "just output the code" suffix, wrapped in the chat template.

        Args:
            tokenizer: Used for the chat template and for token counting.
            num_requests: Maximum number of requests taken from the data.
            output_len: Expected output length; ``DEFAULT_OUTPUT_LEN`` when
                ``None``.
            enable_multimodal_chat: Not used by this implementation.
            request_id_prefix: Prepended to the item index to form the id.
            no_oversample: Forwarded to ``maybe_oversample_requests``.

        Returns:
            The list of sampled requests.
        """
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN

        sampled_requests = []
        for index, entry in enumerate(self.data):
            if len(sampled_requests) >= num_requests:
                break

            user_text = (
                f"{entry['input']}\n\n{entry['instruction']} Just output "
                "the code, do not include any explanation."
            )
            messages = [{"role": "user", "content": user_text}]
            formatted = tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=False,
            )

            request = SampleRequest(
                prompt=formatted,
                prompt_len=len(tokenizer(formatted).input_ids),
                expected_output_len=output_len,
                request_id=request_id_prefix + str(index),
            )
            sampled_requests.append(request)

        self.maybe_oversample_requests(sampled_requests, num_requests,
                                       request_id_prefix, no_oversample)
        return sampled_requests
2094
+
2095
+
2096
+ # -----------------------------------------------------------------------------
2097
+ # MT-Bench Dataset Implementation
2098
+ # -----------------------------------------------------------------------------
2099
+
2100
+
2101
class MTBenchDataset(HuggingFaceDataset):
    """
    MT-Bench Dataset.
    https://huggingface.co/datasets/philschmid/mt-bench

    Only the first turn of each item is used, giving a single-turn dataset.
    This is similar to the Spec decoding benchmark setup in vLLM
    https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
    """  # noqa: E501

    DEFAULT_OUTPUT_LEN = 256  # avg len used in SD bench in vLLM
    SUPPORTED_DATASET_PATHS = {
        "philschmid/mt-bench",
    }

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """Sample up to ``num_requests`` single-turn requests.

        Args:
            tokenizer: Used for the chat template and for token counting.
            num_requests: Maximum number of requests taken from the data.
            output_len: Expected output length; ``DEFAULT_OUTPUT_LEN`` when
                ``None``.
            enable_multimodal_chat: Not used by this implementation.
            request_id_prefix: Prepended to the item index to form the id.
            no_oversample: Forwarded to ``maybe_oversample_requests``.

        Returns:
            The list of sampled requests.
        """
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN

        sampled_requests = []
        for index, entry in enumerate(self.data):
            if len(sampled_requests) >= num_requests:
                break

            # First turn only, wrapped in the chat template.
            messages = [{"role": "user", "content": entry["turns"][0]}]
            formatted = tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=False,
            )

            request = SampleRequest(
                prompt=formatted,
                prompt_len=len(tokenizer(formatted).input_ids),
                expected_output_len=output_len,
                request_id=request_id_prefix + str(index),
            )
            sampled_requests.append(request)

        self.maybe_oversample_requests(sampled_requests, num_requests,
                                       request_id_prefix, no_oversample)
        return sampled_requests
2156
+
2157
+
2158
+ # -----------------------------------------------------------------------------
2159
+ # Blazedit Dataset Implementation
2160
+ # -----------------------------------------------------------------------------
2161
+
2162
+
2163
class BlazeditDataset(HuggingFaceDataset):
    """
    Blazedit Dataset.
    https://github.com/ise-uiuc/blazedit

    5k char version: vdaita/edit_5k_char
    10k char version: vdaita/edit_10k_char
    """  # noqa: E501

    # 5k char version will have output as ~5k chars
    # 10k char version will have output as ~10k chars
    # Assuming 3 char per token, 10k chars will be 3333 tokens
    # We set default to 4000 to be safe
    DEFAULT_OUTPUT_LEN = 4000
    SUPPORTED_DATASET_PATHS = {
        "vdaita/edit_5k_char",
        "vdaita/edit_10k_char",
    }

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: Optional[int] = None,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        min_distance: float = 0.0,
        max_distance: float = 1.0,
        **kwargs,
    ) -> list:
        """Build code-edit requests from items within a distance range.

        Args:
            tokenizer: Renders the chat template and counts prompt tokens.
            num_requests: Stop collecting once this many requests exist.
            output_len: Expected output length for every request; falls
                back to ``DEFAULT_OUTPUT_LEN`` when ``None``.
            request_id_prefix: Prepended to the running request index.
            no_oversample: Forwarded to ``maybe_oversample_requests``.
            min_distance: Items whose ``norm_distance`` is below this
                value are skipped.
            max_distance: Items whose ``norm_distance`` is above this
                value are skipped.
            **kwargs: Ignored.

        Returns:
            The collected list of ``SampleRequest`` objects.
        """
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN
        sampled_requests = []

        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            code = item["code"]
            change_request = item["change_request"]
            norm_distance = item["norm_distance"]

            # compare the levenshtein distance normalized by code length
            if norm_distance < min_distance or norm_distance > max_distance:
                continue

            # template copied from
            # https://github.com/ise-uiuc/blazedit/blob/7765137e656fd62de877422d2e4cf8de51228054/dataset/create_refined_dataset.py#L94-L105 # noqa: E501
            instruction = f"""Given a code file, please apply the change requests and generate the new file.

Original file:
```python
{code}
```

Change request:
{change_request}

Please generate the new code file in the "New file" section below."""  # noqa: E501

            # apply template
            prompt = tokenizer.apply_chat_template(
                [{
                    "role": "user",
                    "content": instruction
                }],
                add_generation_prompt=True,
                tokenize=False,
            )

            prompt_len = len(tokenizer(prompt).input_ids)

            # Number requests by how many were accepted so far, not by the
            # raw dataset index: filtered-out items would otherwise leave
            # gaps in the ids (the other datasets in this file that skip
            # items also count only accepted requests).
            request_index = len(sampled_requests)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    request_id=request_id_prefix + str(request_index),
                ))
        self.maybe_oversample_requests(sampled_requests, num_requests,
                                       request_id_prefix, no_oversample)

        return sampled_requests
2245
+
2246
+
2247
+ # -----------------------------------------------------------------------------
2248
+ # AIMO Dataset Implementation
2249
+ # -----------------------------------------------------------------------------
2250
+
2251
+
2252
class AIMODataset(HuggingFaceDataset):
    """
    Dataset class for processing a AIMO dataset with reasoning questions.

    Each item supplies a ``problem`` (used verbatim as the prompt) and a
    ``solution`` (used only to derive the output length when the caller
    does not fix one).
    """
    SUPPORTED_DATASET_PATHS = {
        "AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5",
        "AI-MO/NuminaMath-CoT"
    }

    def sample(self,
               tokenizer: PreTrainedTokenizerBase,
               num_requests: int,
               output_len: Optional[int] = None,
               request_id_prefix: str = "",
               no_oversample: bool = False,
               **kwargs) -> list:
        """Collect up to ``num_requests`` requests from the dataset.

        Args:
            tokenizer: Counts tokens of the problem and of the solution.
            num_requests: Stop collecting once this many requests exist.
            output_len: Fixed expected output length. When ``None`` the
                token length of each item's solution is used instead and
                items rejected by ``is_valid_sequence`` are skipped.
            request_id_prefix: Prepended to the running request index.
            no_oversample: Forwarded to ``maybe_oversample_requests``.
            **kwargs: Ignored.

        Returns:
            The collected list of ``SampleRequest`` objects.
        """
        use_solution_len = output_len is None
        collected = []

        for record in self.data:
            if len(collected) >= num_requests:
                break

            problem = record['problem']
            solution = record["solution"]

            prompt_len = len(tokenizer(problem).input_ids)
            solution_len = len(tokenizer(solution).input_ids)

            if use_solution_len:
                output_len = solution_len
            # Checked before the length filter, exactly as upstream does.
            assert isinstance(output_len, int) and output_len > 0

            if use_solution_len and not is_valid_sequence(
                    prompt_len,
                    solution_len,
                    max_prompt_len=2048,
                    max_total_len=32000):
                continue

            # len(collected) is the count of accepted requests so far, so
            # ids stay contiguous even when items are skipped.
            collected.append(
                SampleRequest(
                    prompt=problem,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=None,
                    request_id=request_id_prefix + str(len(collected)),
                ))

        self.maybe_oversample_requests(collected, num_requests,
                                       request_id_prefix, no_oversample)
        return collected
2301
+
2302
+
2303
+ # -----------------------------------------------------------------------------
2304
+ # Next Edit Prediction Dataset Implementation
2305
+ # -----------------------------------------------------------------------------
2306
+
2307
+
2308
# Prompt template for the zeta Next Edit Prediction dataset. The two `{}`
# placeholders are filled, in order, with the user's edit events and the
# excerpt to rewrite.
zeta_prompt = """### Instruction:
You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.

### User Edits:

{}

### User Excerpt:

{}

### Response:

"""  # noqa: E501


def _format_zeta_prompt(
        sample: dict,
        original_start_marker: str = "<|editable_region_start|>") -> dict:
    """Format the zeta prompt for the Next Edit Prediction (NEP) dataset.

    This function formats examples from the NEP dataset
    into prompts and expected outputs. It could be
    further extended to support more NEP datasets.

    Args:
        sample: The dataset sample containing events,
            inputs, and outputs.
        original_start_marker: The marker indicating the
            start of the editable region. Defaults to
            "<|editable_region_start|>".

    Returns:
        A dictionary with the formatted prompts and expected outputs.
        The expected output is the part of ``sample["output"]`` starting
        at the first occurrence of the marker; when the marker is absent
        the whole output is returned.

    Raises:
        KeyError: If ``sample`` lacks "events", "input" or "output".
    """
    events = sample["events"]
    excerpt = sample["input"]
    output = sample["output"]
    prompt = zeta_prompt.format(events, excerpt)

    # following the original implementation, extract the focused region
    # from the raw output
    marker_index = output.find(original_start_marker)
    if marker_index == -1:
        # str.find signals "not found" with -1; slicing with it would keep
        # only the last character, so fall back to the full output.
        expected_output = output
    else:
        expected_output = output[marker_index:]

    return {"prompt": prompt, "expected_output": expected_output}
2355
+
2356
+
2357
class NextEditPredictionDataset(HuggingFaceDataset):
    """
    Dataset class for processing a Next Edit Prediction dataset.

    Each supported dataset path maps to a formatting function that turns
    a raw item into a dict with "prompt" and "expected_output" strings.
    """

    SUPPORTED_DATASET_PATHS = {
        "zed-industries/zeta",
    }
    MAPPING_PROMPT_FUNCS = {
        "zed-industries/zeta": _format_zeta_prompt,
    }

    def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int,
               request_id_prefix: str = "",
               no_oversample: bool = False,
               **kwargs) -> list:
        """Collect up to ``num_requests`` next-edit-prediction requests.

        Args:
            tokenizer: Counts tokens of the prompt and expected output.
            num_requests: Stop collecting once this many requests exist.
            request_id_prefix: Prepended to the item index to form the id.
            no_oversample: Forwarded to ``maybe_oversample_requests``.
            **kwargs: Ignored.

        Returns:
            The collected list of ``SampleRequest`` objects; the expected
            output length of each is the token count of the formatted
            expected output.

        Raises:
            ValueError: If ``self.hf_name`` has no formatting function.
        """
        formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.hf_name)
        if formatting_prompt_func is None:
            raise ValueError(f"Unsupported dataset path: {self.hf_name}")
        samples = []
        for i, item in enumerate(self.data):
            # Check the limit before doing any work so that a
            # non-positive num_requests yields no requests, matching the
            # other dataset classes in this file.
            if len(samples) >= num_requests:
                break
            formatted = formatting_prompt_func(item)
            samples.append(
                SampleRequest(
                    prompt=formatted["prompt"],
                    prompt_len=len(tokenizer(formatted["prompt"]).input_ids),
                    expected_output_len=len(
                        tokenizer(formatted["expected_output"]).input_ids),
                    request_id=request_id_prefix + str(i),
                ))
        self.maybe_oversample_requests(samples,
                                       num_requests,
                                       request_id_prefix,
                                       no_oversample)
        return samples
2394
+
2395
+
2396
+ # -----------------------------------------------------------------------------
2397
+ # ASR Dataset Implementation
2398
+ # -----------------------------------------------------------------------------
2399
+
2400
+
2401
class ASRDataset(HuggingFaceDataset):
    """
    Dataset class for processing a ASR dataset for transcription.
    Tested on the following set:

    +----------------+----------------------------------------+--------------------------+-----------------------------+
    | Dataset        | Domain                                 | Speaking Style           | hf-subset                   |
    +----------------+----------------------------------------+--------------------------+-----------------------------+
    | TED-LIUM       | TED talks                              | Oratory                  | release1, release2, release3|
    |                |                                        |                          | release3-speaker-adaptation |
    | VoxPopuli      | European Parliament                    | Oratory                  | en, de, it, fr, ...         |
    | LibriSpeech    | Audiobook                              | Narrated                 | "LIUM/tedlium"              |
    | GigaSpeech     | Audiobook, podcast, YouTube            | Narrated, spontaneous    | xs, s, m, l, xl, dev, test  |
    | SPGISpeech     | Financial meetings                     | Oratory, spontaneous     | S, M, L, dev, test          |
    | AMI            | Meetings                               | Spontaneous              | ihm, sdm                    |
    +----------------+----------------------------------------+--------------------------+-----------------------------+

    """  # noqa: E501

    SUPPORTED_DATASET_PATHS = {
        "openslr/librispeech_asr",
        "facebook/voxpopuli",
        "LIUM/tedlium",
        "edinburghcstr/ami",
        "speechcolab/gigaspeech",
        "kensho/spgispeech",
    }

    DEFAULT_OUTPUT_LEN = 128
    IS_MULTIMODAL = True

    # TODO Whisper-specific. Abstract interface when more models are supported.
    TRANSCRIPTION_PREAMBLE = (
        "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>")
    # When true, clips longer than 30 seconds are dropped during sampling.
    skip_long_audios: bool = True

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: Optional[int] = None,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """Collect up to ``num_requests`` transcription requests.

        Every request shares the same text prompt
        (``TRANSCRIPTION_PREAMBLE``) and carries the item's audio as
        ``multi_modal_data``.

        Args:
            tokenizer: Counts the tokens of the preamble prompt.
            num_requests: Stop collecting once this many requests exist.
            output_len: Expected output length for every request; falls
                back to ``DEFAULT_OUTPUT_LEN`` when ``None``.
            request_id_prefix: Prepended to the running request index.
            no_oversample: Forwarded to ``maybe_oversample_requests``.
            **kwargs: Ignored.

        Returns:
            The collected list of ``SampleRequest`` objects.
        """
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN

        # The prompt is constant, so tokenize it once up front.
        preamble = ASRDataset.TRANSCRIPTION_PREAMBLE
        preamble_len = len(tokenizer(preamble).input_ids)

        requests = []
        num_too_long = 0
        for record in self.data:
            if len(requests) >= num_requests:
                break

            clip = record["audio"]
            waveform, sample_rate = clip["array"], clip["sampling_rate"]
            seconds = librosa.get_duration(y=waveform, sr=sample_rate)
            # Whisper max supported duration
            if self.skip_long_audios and seconds > 30:
                num_too_long += 1
                continue

            # len(requests) counts accepted clips only, so ids remain
            # contiguous when clips are skipped.
            requests.append(
                SampleRequest(
                    prompt=preamble,
                    prompt_len=preamble_len,
                    expected_output_len=output_len,
                    multi_modal_data={"audio": (waveform, sample_rate)},
                    request_id=request_id_prefix + str(len(requests)),
                ))

        if num_too_long:
            logger.warning(
                "%d samples discarded from dataset due to"
                " their length being greater than"
                " what Whisper supports.",
                num_too_long,
            )
        self.maybe_oversample_requests(requests, num_requests,
                                       request_id_prefix, no_oversample)
        return requests
2484
+
2485
+
2486
+ # -----------------------------------------------------------------------------
2487
+ # MLPerf Dataset Implementation
2488
+ # -----------------------------------------------------------------------------
2489
+
2490
+
2491
class MLPerfDataset(HuggingFaceDataset):
    """
    MLPerf Inference Dataset.

    Dataset on HF:
    https://huggingface.co/datasets/mgoin/mlperf-inference-llama2-data
    https://huggingface.co/datasets/mgoin/mlperf-inference-llama3.1-data

    Each record contains:
    - "system_prompt": system role instruction.
    - "question": user question.
    - "output": reference answer.

    The system prompt and question are rendered into one chat-formatted
    prompt with the tokenizer's chat template. Unless the caller fixes an
    output length, the expected output length is the token count of the
    reference answer.
    """

    SUPPORTED_DATASET_PATHS = {
        "mgoin/mlperf-inference-llama2-data",
        "mgoin/mlperf-inference-llama3.1-data",
    }

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: Optional[int] = None,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list[SampleRequest]:
        """Collect up to ``num_requests`` chat requests.

        Args:
            tokenizer: Renders the chat template and counts tokens.
            num_requests: Stop collecting once this many requests exist.
            output_len: Fixed expected output length; when ``None`` the
                reference answer's token count is used per item.
            request_id_prefix: Prepended to the running request index.
            no_oversample: Forwarded to ``maybe_oversample_requests``.
            **kwargs: Ignored.

        Returns:
            The collected list of ``SampleRequest`` objects. Items that
            ``is_valid_sequence`` rejects are left out.
        """
        use_reference_len = output_len is None
        requests: list[SampleRequest] = []

        for record in self.data:
            if len(requests) >= num_requests:
                break

            system_text = record["system_prompt"]
            user_text = record["question"]
            answer_text = record["output"]

            # Render system + user turns with the tokenizer's template.
            chat = [
                {"role": "system", "content": system_text},
                {"role": "user", "content": user_text},
            ]
            rendered = tokenizer.apply_chat_template(
                chat, add_generation_prompt=True, tokenize=False
            )
            prompt_len = len(tokenizer(rendered).input_ids)

            # The reference answer is tokenized for every item, even when
            # the caller supplied a fixed output length.
            answer_len = len(
                tokenizer(answer_text, add_special_tokens=False).input_ids
            )
            if use_reference_len:
                expected_output_len = answer_len
            else:
                expected_output_len = output_len

            if not is_valid_sequence(prompt_len, expected_output_len):
                continue

            # len(requests) counts accepted items only, keeping ids
            # contiguous when items are rejected.
            requests.append(
                SampleRequest(
                    prompt=rendered,
                    prompt_len=prompt_len,
                    expected_output_len=expected_output_len,
                    request_id=request_id_prefix + str(len(requests)),
                )
            )

        self.maybe_oversample_requests(requests, num_requests,
                                       request_id_prefix, no_oversample)
        return requests
2569
+
2570
+
2571
+ # -----------------------------------------------------------------------------
2572
+ # Prefix Repetition Dataset Implementation
2573
+ # -----------------------------------------------------------------------------
2574
+
2575
+
2576
class PrefixRepetitionRandomDataset(BenchmarkDataset):
    """Synthetic dataset of random prompts that share a few prefixes.

    ``num_prefixes`` random token prefixes are generated; each one is
    combined with ``num_requests // num_prefixes`` freshly generated
    random suffixes, and the resulting prompts are shuffled.
    """

    # Default values copied from benchmark_serving.py for the repeated prefix
    # dataset.
    DEFAULT_PREFIX_LEN = 256
    DEFAULT_SUFFIX_LEN = 256
    DEFAULT_NUM_PREFIXES = 10
    DEFAULT_OUTPUT_LEN = 128

    def __init__(
        self,
        **kwargs,
    ) -> None:
        """Initialise the base dataset and seed both global RNGs.

        Note that this seeds the process-wide ``random`` and
        ``np.random`` generators with ``self.random_seed``.
        """
        super().__init__(**kwargs)
        random.seed(self.random_seed)
        np.random.seed(self.random_seed)

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        prefix_len: int = DEFAULT_PREFIX_LEN,
        suffix_len: int = DEFAULT_SUFFIX_LEN,
        num_prefixes: int = DEFAULT_NUM_PREFIXES,
        output_len: int = DEFAULT_OUTPUT_LEN,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list[SampleRequest]:
        """Generate shuffled random prompts with repeated prefixes.

        Args:
            tokenizer: Supplies ``vocab_size`` and decodes/encodes tokens.
            num_requests: Requested number of prompts. The number actually
                produced is ``(num_requests // num_prefixes) *
                num_prefixes``.
            prefix_len: Token length of each shared prefix.
            suffix_len: Token length of each per-prompt suffix.
            num_prefixes: Number of distinct prefixes; must be positive.
            output_len: Expected output length for every request.
            request_id_prefix: Prepended to each request's position in the
                returned (shuffled) list to form its id.
            no_oversample: Accepted for interface compatibility; this
                dataset never oversamples.
            **kwargs: Ignored.

        Returns:
            The shuffled list of ``SampleRequest`` objects.

        Raises:
            ValueError: If ``num_prefixes`` is not positive, or if
                ``num_requests`` is smaller than ``num_prefixes``.
        """
        # Guard first: a zero value would otherwise surface as a bare
        # ZeroDivisionError and a negative one would silently yield [].
        if num_prefixes <= 0:
            raise ValueError(
                f"num_prefixes ({num_prefixes}) must be a positive integer"
            )

        vocab_size = tokenizer.vocab_size
        prompts_per_prefix = num_requests // num_prefixes
        if prompts_per_prefix == 0:
            raise ValueError(
                f"num_requests ({num_requests}) must be greater than or equal "
                f"to num_prefixes ({num_prefixes})"
            )

        def _generate_exact_length_tokens(target_length: int) -> list[int]:
            """Generate tokens that decode and re-encode to exactly
            target_length."""
            # Generate random tokens
            tokens = np.random.randint(
                0, vocab_size, size=target_length).tolist()
            text = tokenizer.decode(tokens)
            re_encoded = tokenizer.encode(text, add_special_tokens=False)

            if len(re_encoded) == target_length:
                return re_encoded
            elif len(re_encoded) < target_length:
                # Recursively generate additional consistent tokens
                needed = target_length - len(re_encoded)
                extra_tokens = _generate_exact_length_tokens(needed)
                return re_encoded + extra_tokens
            else:
                # Truncate to target length
                return re_encoded[:target_length]

        # Build (prompt, prompt_len) pairs first; the requests themselves
        # are created after shuffling so their ids follow the final order.
        prompts: list[tuple[str, int]] = []
        for _ in range(num_prefixes):
            prefix_tokens = _generate_exact_length_tokens(prefix_len)

            for _ in range(prompts_per_prefix):
                suffix_tokens = _generate_exact_length_tokens(suffix_len)

                combined_tokens = prefix_tokens + suffix_tokens
                prompt = tokenizer.decode(combined_tokens)
                prompts.append((prompt, len(combined_tokens)))

        # Shuffling depends only on the list length and RNG state, so the
        # prompt order is the same as when shuffling request objects.
        random.shuffle(prompts)

        # Honor request_id_prefix like the other datasets in this file do.
        return [
            SampleRequest(
                prompt=prompt,
                prompt_len=prompt_len,
                expected_output_len=output_len,
                request_id=request_id_prefix + str(position),
            )
            for position, (prompt, prompt_len) in enumerate(prompts)
        ]