vllm-cpu-amxbf16 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1536) hide show
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +225 -0
  3. vllm/_aiter_ops.py +983 -0
  4. vllm/_bc_linter.py +54 -0
  5. vllm/_custom_ops.py +2863 -0
  6. vllm/_ipex_ops.py +457 -0
  7. vllm/_version.py +34 -0
  8. vllm/assets/__init__.py +0 -0
  9. vllm/assets/audio.py +43 -0
  10. vllm/assets/base.py +40 -0
  11. vllm/assets/image.py +59 -0
  12. vllm/assets/video.py +149 -0
  13. vllm/attention/__init__.py +18 -0
  14. vllm/attention/backends/__init__.py +0 -0
  15. vllm/attention/backends/abstract.py +391 -0
  16. vllm/attention/backends/registry.py +195 -0
  17. vllm/attention/backends/utils.py +33 -0
  18. vllm/attention/layer.py +1052 -0
  19. vllm/attention/layers/__init__.py +0 -0
  20. vllm/attention/layers/chunked_local_attention.py +121 -0
  21. vllm/attention/layers/cross_attention.py +178 -0
  22. vllm/attention/layers/encoder_only_attention.py +103 -0
  23. vllm/attention/ops/__init__.py +0 -0
  24. vllm/attention/ops/chunked_prefill_paged_decode.py +401 -0
  25. vllm/attention/ops/common.py +414 -0
  26. vllm/attention/ops/flashmla.py +251 -0
  27. vllm/attention/ops/merge_attn_states.py +47 -0
  28. vllm/attention/ops/paged_attn.py +262 -0
  29. vllm/attention/ops/pallas_kv_cache_update.py +130 -0
  30. vllm/attention/ops/prefix_prefill.py +814 -0
  31. vllm/attention/ops/rocm_aiter_paged_attn.py +123 -0
  32. vllm/attention/ops/triton_decode_attention.py +712 -0
  33. vllm/attention/ops/triton_merge_attn_states.py +105 -0
  34. vllm/attention/ops/triton_reshape_and_cache_flash.py +184 -0
  35. vllm/attention/ops/triton_unified_attention.py +941 -0
  36. vllm/attention/ops/vit_attn_wrappers.py +178 -0
  37. vllm/attention/selector.py +231 -0
  38. vllm/attention/utils/__init__.py +0 -0
  39. vllm/attention/utils/fa_utils.py +109 -0
  40. vllm/attention/utils/kv_sharing_utils.py +33 -0
  41. vllm/attention/utils/kv_transfer_utils.py +60 -0
  42. vllm/beam_search.py +88 -0
  43. vllm/benchmarks/__init__.py +0 -0
  44. vllm/benchmarks/datasets.py +3222 -0
  45. vllm/benchmarks/latency.py +172 -0
  46. vllm/benchmarks/lib/__init__.py +3 -0
  47. vllm/benchmarks/lib/endpoint_request_func.py +777 -0
  48. vllm/benchmarks/lib/ready_checker.py +72 -0
  49. vllm/benchmarks/lib/utils.py +79 -0
  50. vllm/benchmarks/serve.py +1531 -0
  51. vllm/benchmarks/sweep/__init__.py +0 -0
  52. vllm/benchmarks/sweep/cli.py +38 -0
  53. vllm/benchmarks/sweep/param_sweep.py +91 -0
  54. vllm/benchmarks/sweep/plot.py +580 -0
  55. vllm/benchmarks/sweep/serve.py +416 -0
  56. vllm/benchmarks/sweep/serve_sla.py +492 -0
  57. vllm/benchmarks/sweep/server.py +114 -0
  58. vllm/benchmarks/sweep/sla_sweep.py +132 -0
  59. vllm/benchmarks/sweep/utils.py +4 -0
  60. vllm/benchmarks/throughput.py +799 -0
  61. vllm/collect_env.py +857 -0
  62. vllm/compilation/__init__.py +0 -0
  63. vllm/compilation/activation_quant_fusion.py +209 -0
  64. vllm/compilation/backends.py +759 -0
  65. vllm/compilation/base_static_graph.py +57 -0
  66. vllm/compilation/caching.py +178 -0
  67. vllm/compilation/collective_fusion.py +1234 -0
  68. vllm/compilation/compiler_interface.py +639 -0
  69. vllm/compilation/counter.py +48 -0
  70. vllm/compilation/cuda_graph.py +208 -0
  71. vllm/compilation/decorators.py +571 -0
  72. vllm/compilation/fix_functionalization.py +253 -0
  73. vllm/compilation/fusion.py +374 -0
  74. vllm/compilation/fusion_attn.py +359 -0
  75. vllm/compilation/fx_utils.py +91 -0
  76. vllm/compilation/inductor_pass.py +133 -0
  77. vllm/compilation/matcher_utils.py +317 -0
  78. vllm/compilation/monitor.py +62 -0
  79. vllm/compilation/noop_elimination.py +134 -0
  80. vllm/compilation/partition_rules.py +72 -0
  81. vllm/compilation/pass_manager.py +135 -0
  82. vllm/compilation/piecewise_backend.py +121 -0
  83. vllm/compilation/post_cleanup.py +21 -0
  84. vllm/compilation/qk_norm_rope_fusion.py +238 -0
  85. vllm/compilation/sequence_parallelism.py +363 -0
  86. vllm/compilation/torch25_custom_graph_pass.py +44 -0
  87. vllm/compilation/vllm_inductor_pass.py +173 -0
  88. vllm/compilation/wrapper.py +238 -0
  89. vllm/config/__init__.py +102 -0
  90. vllm/config/cache.py +207 -0
  91. vllm/config/compilation.py +975 -0
  92. vllm/config/device.py +75 -0
  93. vllm/config/ec_transfer.py +110 -0
  94. vllm/config/kv_events.py +56 -0
  95. vllm/config/kv_transfer.py +114 -0
  96. vllm/config/load.py +124 -0
  97. vllm/config/lora.py +112 -0
  98. vllm/config/model.py +2162 -0
  99. vllm/config/multimodal.py +248 -0
  100. vllm/config/observability.py +123 -0
  101. vllm/config/parallel.py +655 -0
  102. vllm/config/pooler.py +122 -0
  103. vllm/config/scheduler.py +298 -0
  104. vllm/config/speculative.py +654 -0
  105. vllm/config/speech_to_text.py +38 -0
  106. vllm/config/structured_outputs.py +92 -0
  107. vllm/config/utils.py +178 -0
  108. vllm/config/vllm.py +1166 -0
  109. vllm/connections.py +189 -0
  110. vllm/device_allocator/__init__.py +0 -0
  111. vllm/device_allocator/cumem.py +327 -0
  112. vllm/distributed/__init__.py +6 -0
  113. vllm/distributed/communication_op.py +43 -0
  114. vllm/distributed/device_communicators/__init__.py +0 -0
  115. vllm/distributed/device_communicators/all2all.py +490 -0
  116. vllm/distributed/device_communicators/all_reduce_utils.py +344 -0
  117. vllm/distributed/device_communicators/base_device_communicator.py +297 -0
  118. vllm/distributed/device_communicators/cpu_communicator.py +209 -0
  119. vllm/distributed/device_communicators/cuda_communicator.py +340 -0
  120. vllm/distributed/device_communicators/cuda_wrapper.py +216 -0
  121. vllm/distributed/device_communicators/custom_all_reduce.py +326 -0
  122. vllm/distributed/device_communicators/mnnvl_compat.py +27 -0
  123. vllm/distributed/device_communicators/pynccl.py +386 -0
  124. vllm/distributed/device_communicators/pynccl_allocator.py +191 -0
  125. vllm/distributed/device_communicators/pynccl_wrapper.py +564 -0
  126. vllm/distributed/device_communicators/quick_all_reduce.py +290 -0
  127. vllm/distributed/device_communicators/ray_communicator.py +259 -0
  128. vllm/distributed/device_communicators/shm_broadcast.py +733 -0
  129. vllm/distributed/device_communicators/shm_object_storage.py +660 -0
  130. vllm/distributed/device_communicators/symm_mem.py +156 -0
  131. vllm/distributed/device_communicators/tpu_communicator.py +107 -0
  132. vllm/distributed/device_communicators/xpu_communicator.py +95 -0
  133. vllm/distributed/ec_transfer/__init__.py +14 -0
  134. vllm/distributed/ec_transfer/ec_connector/__init__.py +0 -0
  135. vllm/distributed/ec_transfer/ec_connector/base.py +247 -0
  136. vllm/distributed/ec_transfer/ec_connector/factory.py +88 -0
  137. vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py +201 -0
  138. vllm/distributed/ec_transfer/ec_transfer_state.py +42 -0
  139. vllm/distributed/eplb/__init__.py +8 -0
  140. vllm/distributed/eplb/eplb_state.py +837 -0
  141. vllm/distributed/eplb/rebalance_algo.py +260 -0
  142. vllm/distributed/eplb/rebalance_execute.py +431 -0
  143. vllm/distributed/kv_events.py +371 -0
  144. vllm/distributed/kv_transfer/README.md +29 -0
  145. vllm/distributed/kv_transfer/__init__.py +20 -0
  146. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  147. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  148. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  149. vllm/distributed/kv_transfer/kv_connector/factory.py +192 -0
  150. vllm/distributed/kv_transfer/kv_connector/utils.py +268 -0
  151. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +19 -0
  152. vllm/distributed/kv_transfer/kv_connector/v1/base.py +546 -0
  153. vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py +419 -0
  154. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +216 -0
  155. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py +18 -0
  156. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +379 -0
  157. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py +221 -0
  158. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +1411 -0
  159. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +867 -0
  160. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +189 -0
  161. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +454 -0
  162. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +2440 -0
  163. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +504 -0
  164. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  165. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +531 -0
  166. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +632 -0
  167. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +273 -0
  168. vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +450 -0
  169. vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py +0 -0
  170. vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +179 -0
  171. vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +164 -0
  172. vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +242 -0
  173. vllm/distributed/kv_transfer/kv_pipe/__init__.py +0 -0
  174. vllm/distributed/kv_transfer/kv_pipe/base.py +66 -0
  175. vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +295 -0
  176. vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +285 -0
  177. vllm/distributed/kv_transfer/kv_transfer_state.py +78 -0
  178. vllm/distributed/parallel_state.py +1759 -0
  179. vllm/distributed/tpu_distributed_utils.py +188 -0
  180. vllm/distributed/utils.py +543 -0
  181. vllm/engine/__init__.py +0 -0
  182. vllm/engine/arg_utils.py +2144 -0
  183. vllm/engine/async_llm_engine.py +6 -0
  184. vllm/engine/llm_engine.py +6 -0
  185. vllm/engine/protocol.py +170 -0
  186. vllm/entrypoints/__init__.py +0 -0
  187. vllm/entrypoints/anthropic/__init__.py +0 -0
  188. vllm/entrypoints/anthropic/protocol.py +162 -0
  189. vllm/entrypoints/anthropic/serving_messages.py +460 -0
  190. vllm/entrypoints/api_server.py +184 -0
  191. vllm/entrypoints/chat_utils.py +1690 -0
  192. vllm/entrypoints/cli/__init__.py +13 -0
  193. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  194. vllm/entrypoints/cli/benchmark/base.py +25 -0
  195. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  196. vllm/entrypoints/cli/benchmark/main.py +56 -0
  197. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  198. vllm/entrypoints/cli/benchmark/sweep.py +21 -0
  199. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  200. vllm/entrypoints/cli/collect_env.py +38 -0
  201. vllm/entrypoints/cli/main.py +79 -0
  202. vllm/entrypoints/cli/openai.py +256 -0
  203. vllm/entrypoints/cli/run_batch.py +68 -0
  204. vllm/entrypoints/cli/serve.py +249 -0
  205. vllm/entrypoints/cli/types.py +29 -0
  206. vllm/entrypoints/constants.py +10 -0
  207. vllm/entrypoints/context.py +572 -0
  208. vllm/entrypoints/dynamic_lora.py +57 -0
  209. vllm/entrypoints/harmony_utils.py +535 -0
  210. vllm/entrypoints/launcher.py +175 -0
  211. vllm/entrypoints/llm.py +1768 -0
  212. vllm/entrypoints/logger.py +84 -0
  213. vllm/entrypoints/openai/__init__.py +0 -0
  214. vllm/entrypoints/openai/api_server.py +2096 -0
  215. vllm/entrypoints/openai/cli_args.py +302 -0
  216. vllm/entrypoints/openai/orca_metrics.py +120 -0
  217. vllm/entrypoints/openai/protocol.py +3299 -0
  218. vllm/entrypoints/openai/run_batch.py +547 -0
  219. vllm/entrypoints/openai/serving_chat.py +1772 -0
  220. vllm/entrypoints/openai/serving_classification.py +235 -0
  221. vllm/entrypoints/openai/serving_completion.py +715 -0
  222. vllm/entrypoints/openai/serving_embedding.py +695 -0
  223. vllm/entrypoints/openai/serving_engine.py +1433 -0
  224. vllm/entrypoints/openai/serving_models.py +304 -0
  225. vllm/entrypoints/openai/serving_pooling.py +346 -0
  226. vllm/entrypoints/openai/serving_responses.py +2021 -0
  227. vllm/entrypoints/openai/serving_score.py +503 -0
  228. vllm/entrypoints/openai/serving_tokenization.py +203 -0
  229. vllm/entrypoints/openai/serving_tokens.py +269 -0
  230. vllm/entrypoints/openai/serving_transcription.py +148 -0
  231. vllm/entrypoints/openai/speech_to_text.py +405 -0
  232. vllm/entrypoints/openai/tool_parsers/__init__.py +142 -0
  233. vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +273 -0
  234. vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py +390 -0
  235. vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +390 -0
  236. vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py +210 -0
  237. vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +200 -0
  238. vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +273 -0
  239. vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +253 -0
  240. vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +494 -0
  241. vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py +420 -0
  242. vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +227 -0
  243. vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +323 -0
  244. vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +590 -0
  245. vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +341 -0
  246. vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +290 -0
  247. vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +37 -0
  248. vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py +643 -0
  249. vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +849 -0
  250. vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +390 -0
  251. vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py +366 -0
  252. vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +97 -0
  253. vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +120 -0
  254. vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +332 -0
  255. vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +781 -0
  256. vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py +1316 -0
  257. vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +744 -0
  258. vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py +303 -0
  259. vllm/entrypoints/openai/tool_parsers/utils.py +229 -0
  260. vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +556 -0
  261. vllm/entrypoints/renderer.py +409 -0
  262. vllm/entrypoints/responses_utils.py +77 -0
  263. vllm/entrypoints/sagemaker/__init__.py +4 -0
  264. vllm/entrypoints/sagemaker/routes.py +72 -0
  265. vllm/entrypoints/score_utils.py +242 -0
  266. vllm/entrypoints/ssl.py +78 -0
  267. vllm/entrypoints/tool.py +143 -0
  268. vllm/entrypoints/tool_server.py +209 -0
  269. vllm/entrypoints/utils.py +319 -0
  270. vllm/env_override.py +378 -0
  271. vllm/envs.py +1659 -0
  272. vllm/forward_context.py +356 -0
  273. vllm/inputs/__init__.py +44 -0
  274. vllm/inputs/data.py +359 -0
  275. vllm/inputs/parse.py +137 -0
  276. vllm/inputs/preprocess.py +727 -0
  277. vllm/logger.py +267 -0
  278. vllm/logging_utils/__init__.py +10 -0
  279. vllm/logging_utils/dump_input.py +83 -0
  280. vllm/logging_utils/formatter.py +77 -0
  281. vllm/logging_utils/log_time.py +34 -0
  282. vllm/logits_process.py +121 -0
  283. vllm/logprobs.py +208 -0
  284. vllm/lora/__init__.py +0 -0
  285. vllm/lora/layers/__init__.py +41 -0
  286. vllm/lora/layers/base.py +67 -0
  287. vllm/lora/layers/base_linear.py +164 -0
  288. vllm/lora/layers/column_parallel_linear.py +578 -0
  289. vllm/lora/layers/fused_moe.py +472 -0
  290. vllm/lora/layers/logits_processor.py +252 -0
  291. vllm/lora/layers/replicated_linear.py +70 -0
  292. vllm/lora/layers/row_parallel_linear.py +181 -0
  293. vllm/lora/layers/utils.py +65 -0
  294. vllm/lora/layers/vocal_parallel_embedding.py +166 -0
  295. vllm/lora/lora_weights.py +198 -0
  296. vllm/lora/models.py +890 -0
  297. vllm/lora/ops/__init__.py +0 -0
  298. vllm/lora/ops/ipex_ops/__init__.py +6 -0
  299. vllm/lora/ops/ipex_ops/lora_ops.py +57 -0
  300. vllm/lora/ops/torch_ops/__init__.py +20 -0
  301. vllm/lora/ops/torch_ops/lora_ops.py +128 -0
  302. vllm/lora/ops/triton_ops/README_TUNING.md +60 -0
  303. vllm/lora/ops/triton_ops/__init__.py +21 -0
  304. vllm/lora/ops/triton_ops/fused_moe_lora_op.py +641 -0
  305. vllm/lora/ops/triton_ops/kernel_utils.py +340 -0
  306. vllm/lora/ops/triton_ops/lora_expand_op.py +310 -0
  307. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +154 -0
  308. vllm/lora/ops/triton_ops/lora_shrink_op.py +287 -0
  309. vllm/lora/ops/triton_ops/utils.py +295 -0
  310. vllm/lora/ops/xla_ops/__init__.py +6 -0
  311. vllm/lora/ops/xla_ops/lora_ops.py +141 -0
  312. vllm/lora/peft_helper.py +128 -0
  313. vllm/lora/punica_wrapper/__init__.py +10 -0
  314. vllm/lora/punica_wrapper/punica_base.py +492 -0
  315. vllm/lora/punica_wrapper/punica_cpu.py +351 -0
  316. vllm/lora/punica_wrapper/punica_gpu.py +411 -0
  317. vllm/lora/punica_wrapper/punica_selector.py +21 -0
  318. vllm/lora/punica_wrapper/punica_tpu.py +359 -0
  319. vllm/lora/punica_wrapper/punica_xpu.py +279 -0
  320. vllm/lora/punica_wrapper/utils.py +150 -0
  321. vllm/lora/request.py +100 -0
  322. vllm/lora/resolver.py +88 -0
  323. vllm/lora/utils.py +293 -0
  324. vllm/lora/worker_manager.py +279 -0
  325. vllm/model_executor/__init__.py +11 -0
  326. vllm/model_executor/custom_op.py +194 -0
  327. vllm/model_executor/layers/__init__.py +0 -0
  328. vllm/model_executor/layers/activation.py +569 -0
  329. vllm/model_executor/layers/attention_layer_base.py +35 -0
  330. vllm/model_executor/layers/batch_invariant.py +854 -0
  331. vllm/model_executor/layers/conv.py +236 -0
  332. vllm/model_executor/layers/fla/__init__.py +8 -0
  333. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  334. vllm/model_executor/layers/fla/ops/chunk.py +240 -0
  335. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +344 -0
  336. vllm/model_executor/layers/fla/ops/chunk_o.py +183 -0
  337. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +154 -0
  338. vllm/model_executor/layers/fla/ops/cumsum.py +280 -0
  339. vllm/model_executor/layers/fla/ops/fused_recurrent.py +390 -0
  340. vllm/model_executor/layers/fla/ops/index.py +41 -0
  341. vllm/model_executor/layers/fla/ops/kda.py +1351 -0
  342. vllm/model_executor/layers/fla/ops/l2norm.py +146 -0
  343. vllm/model_executor/layers/fla/ops/layernorm_guard.py +396 -0
  344. vllm/model_executor/layers/fla/ops/op.py +60 -0
  345. vllm/model_executor/layers/fla/ops/solve_tril.py +556 -0
  346. vllm/model_executor/layers/fla/ops/utils.py +194 -0
  347. vllm/model_executor/layers/fla/ops/wy_fast.py +158 -0
  348. vllm/model_executor/layers/fused_moe/__init__.py +106 -0
  349. vllm/model_executor/layers/fused_moe/all2all_utils.py +160 -0
  350. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +406 -0
  351. vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py +180 -0
  352. vllm/model_executor/layers/fused_moe/config.py +916 -0
  353. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  354. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  355. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  356. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  357. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  358. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  359. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  360. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  361. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  362. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  363. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json +146 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json +147 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json +213 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json +147 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json +201 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json +164 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json +147 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json +147 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  559. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  560. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  561. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  562. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  563. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  564. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  565. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  566. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  567. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  568. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  569. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  570. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  571. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  572. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  573. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  574. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  575. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  576. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  577. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  578. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  579. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  580. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  581. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  582. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  583. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  584. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  585. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  586. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  587. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  588. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  589. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  590. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  591. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  592. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  593. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  594. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  595. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  596. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  597. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  598. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  599. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  600. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  601. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  602. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  603. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  604. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  605. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  606. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  607. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  608. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  609. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  610. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  611. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  612. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  613. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  614. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  615. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  616. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  617. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  618. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  619. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  620. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  621. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  622. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  623. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  624. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  625. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +354 -0
  626. vllm/model_executor/layers/fused_moe/cutlass_moe.py +1052 -0
  627. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +387 -0
  628. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +416 -0
  629. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +420 -0
  630. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +367 -0
  631. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +307 -0
  632. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +362 -0
  633. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +192 -0
  634. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1012 -0
  635. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +792 -0
  636. vllm/model_executor/layers/fused_moe/fused_moe.py +2175 -0
  637. vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +112 -0
  638. vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +164 -0
  639. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +316 -0
  640. vllm/model_executor/layers/fused_moe/layer.py +1944 -0
  641. vllm/model_executor/layers/fused_moe/modular_kernel.py +1222 -0
  642. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +174 -0
  643. vllm/model_executor/layers/fused_moe/moe_pallas.py +83 -0
  644. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +229 -0
  645. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  646. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +362 -0
  647. vllm/model_executor/layers/fused_moe/prepare_finalize.py +77 -0
  648. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +265 -0
  649. vllm/model_executor/layers/fused_moe/routing_simulator.py +310 -0
  650. vllm/model_executor/layers/fused_moe/shared_fused_moe.py +97 -0
  651. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +171 -0
  652. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +163 -0
  653. vllm/model_executor/layers/fused_moe/trtllm_moe.py +143 -0
  654. vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +578 -0
  655. vllm/model_executor/layers/fused_moe/utils.py +332 -0
  656. vllm/model_executor/layers/kda.py +448 -0
  657. vllm/model_executor/layers/layernorm.py +442 -0
  658. vllm/model_executor/layers/lightning_attn.py +729 -0
  659. vllm/model_executor/layers/linear.py +1424 -0
  660. vllm/model_executor/layers/logits_processor.py +106 -0
  661. vllm/model_executor/layers/mamba/__init__.py +0 -0
  662. vllm/model_executor/layers/mamba/abstract.py +71 -0
  663. vllm/model_executor/layers/mamba/linear_attn.py +402 -0
  664. vllm/model_executor/layers/mamba/mamba_mixer.py +535 -0
  665. vllm/model_executor/layers/mamba/mamba_mixer2.py +928 -0
  666. vllm/model_executor/layers/mamba/mamba_utils.py +225 -0
  667. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  668. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1240 -0
  669. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +172 -0
  670. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +478 -0
  671. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +211 -0
  672. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +456 -0
  673. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +700 -0
  674. vllm/model_executor/layers/mamba/ops/ssd_combined.py +230 -0
  675. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +157 -0
  676. vllm/model_executor/layers/mamba/short_conv.py +264 -0
  677. vllm/model_executor/layers/mla.py +168 -0
  678. vllm/model_executor/layers/pooler.py +817 -0
  679. vllm/model_executor/layers/quantization/__init__.py +174 -0
  680. vllm/model_executor/layers/quantization/auto_round.py +454 -0
  681. vllm/model_executor/layers/quantization/awq.py +277 -0
  682. vllm/model_executor/layers/quantization/awq_marlin.py +659 -0
  683. vllm/model_executor/layers/quantization/awq_triton.py +337 -0
  684. vllm/model_executor/layers/quantization/base_config.py +170 -0
  685. vllm/model_executor/layers/quantization/bitblas.py +502 -0
  686. vllm/model_executor/layers/quantization/bitsandbytes.py +658 -0
  687. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +3 -0
  688. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +914 -0
  689. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2284 -0
  690. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +35 -0
  691. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +392 -0
  692. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  693. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +176 -0
  694. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +124 -0
  695. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +218 -0
  696. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +183 -0
  697. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +153 -0
  698. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +138 -0
  699. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +200 -0
  700. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +125 -0
  701. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +219 -0
  702. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  703. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +260 -0
  704. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +173 -0
  705. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  706. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +64 -0
  707. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  708. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +224 -0
  709. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  710. vllm/model_executor/layers/quantization/deepspeedfp.py +218 -0
  711. vllm/model_executor/layers/quantization/experts_int8.py +240 -0
  712. vllm/model_executor/layers/quantization/fbgemm_fp8.py +195 -0
  713. vllm/model_executor/layers/quantization/fp8.py +1333 -0
  714. vllm/model_executor/layers/quantization/fp_quant.py +420 -0
  715. vllm/model_executor/layers/quantization/gguf.py +643 -0
  716. vllm/model_executor/layers/quantization/gptq.py +393 -0
  717. vllm/model_executor/layers/quantization/gptq_bitblas.py +482 -0
  718. vllm/model_executor/layers/quantization/gptq_marlin.py +789 -0
  719. vllm/model_executor/layers/quantization/gptq_marlin_24.py +320 -0
  720. vllm/model_executor/layers/quantization/hqq_marlin.py +371 -0
  721. vllm/model_executor/layers/quantization/inc.py +65 -0
  722. vllm/model_executor/layers/quantization/input_quant_fp8.py +171 -0
  723. vllm/model_executor/layers/quantization/ipex_quant.py +467 -0
  724. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  725. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +94 -0
  726. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +105 -0
  727. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  728. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +323 -0
  729. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +98 -0
  730. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +119 -0
  731. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +111 -0
  732. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +161 -0
  733. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +159 -0
  734. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +166 -0
  735. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +73 -0
  736. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +97 -0
  737. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +120 -0
  738. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +219 -0
  739. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +140 -0
  740. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +42 -0
  741. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +105 -0
  742. vllm/model_executor/layers/quantization/kv_cache.py +146 -0
  743. vllm/model_executor/layers/quantization/modelopt.py +1788 -0
  744. vllm/model_executor/layers/quantization/moe_wna16.py +541 -0
  745. vllm/model_executor/layers/quantization/mxfp4.py +1162 -0
  746. vllm/model_executor/layers/quantization/petit.py +320 -0
  747. vllm/model_executor/layers/quantization/ptpc_fp8.py +137 -0
  748. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  749. vllm/model_executor/layers/quantization/quark/quark.py +528 -0
  750. vllm/model_executor/layers/quantization/quark/quark_moe.py +683 -0
  751. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  752. vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py +306 -0
  753. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  754. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +179 -0
  755. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +139 -0
  756. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  757. vllm/model_executor/layers/quantization/qutlass_utils.py +185 -0
  758. vllm/model_executor/layers/quantization/rtn.py +652 -0
  759. vllm/model_executor/layers/quantization/schema.py +90 -0
  760. vllm/model_executor/layers/quantization/torchao.py +380 -0
  761. vllm/model_executor/layers/quantization/tpu_int8.py +139 -0
  762. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  763. vllm/model_executor/layers/quantization/utils/allspark_utils.py +67 -0
  764. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +229 -0
  765. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  766. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  767. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  768. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  769. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  770. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  771. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  772. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  773. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  774. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  775. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  776. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  777. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  778. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  779. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  780. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  781. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  786. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  902. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  903. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  904. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  905. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  906. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  907. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  908. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  909. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  910. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  911. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  912. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  913. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  914. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  915. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  916. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  917. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  918. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  919. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  920. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  921. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  922. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  923. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  924. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  925. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  926. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  927. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  928. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  929. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  930. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  931. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  932. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  933. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  934. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  935. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  936. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  937. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  938. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  939. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  940. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  941. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  942. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  943. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  944. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  945. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  946. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  947. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  948. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  949. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  950. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  951. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  952. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  953. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  954. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  955. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  956. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  957. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  958. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  959. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  960. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  961. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  962. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  963. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  964. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  965. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  966. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  967. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  968. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  969. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  970. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  971. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  972. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  973. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  974. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  975. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  976. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +89 -0
  977. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +298 -0
  978. vllm/model_executor/layers/quantization/utils/fp8_utils.py +1203 -0
  979. vllm/model_executor/layers/quantization/utils/gptq_utils.py +158 -0
  980. vllm/model_executor/layers/quantization/utils/int8_utils.py +489 -0
  981. vllm/model_executor/layers/quantization/utils/layer_utils.py +41 -0
  982. vllm/model_executor/layers/quantization/utils/machete_utils.py +56 -0
  983. vllm/model_executor/layers/quantization/utils/marlin_utils.py +575 -0
  984. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +397 -0
  985. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +351 -0
  986. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +161 -0
  987. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +467 -0
  988. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +181 -0
  989. vllm/model_executor/layers/quantization/utils/mxfp6_utils.py +142 -0
  990. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +24 -0
  991. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +142 -0
  992. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +63 -0
  993. vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py +51 -0
  994. vllm/model_executor/layers/quantization/utils/petit_utils.py +124 -0
  995. vllm/model_executor/layers/quantization/utils/quant_utils.py +687 -0
  996. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +516 -0
  997. vllm/model_executor/layers/resampler.py +283 -0
  998. vllm/model_executor/layers/rotary_embedding/__init__.py +278 -0
  999. vllm/model_executor/layers/rotary_embedding/base.py +235 -0
  1000. vllm/model_executor/layers/rotary_embedding/common.py +188 -0
  1001. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +165 -0
  1002. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +215 -0
  1003. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +43 -0
  1004. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +68 -0
  1005. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +75 -0
  1006. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  1007. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  1008. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +80 -0
  1009. vllm/model_executor/layers/rotary_embedding/mrope.py +397 -0
  1010. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +47 -0
  1011. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +159 -0
  1012. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +81 -0
  1013. vllm/model_executor/layers/utils.py +251 -0
  1014. vllm/model_executor/layers/vocab_parallel_embedding.py +558 -0
  1015. vllm/model_executor/model_loader/__init__.py +148 -0
  1016. vllm/model_executor/model_loader/base_loader.py +57 -0
  1017. vllm/model_executor/model_loader/bitsandbytes_loader.py +822 -0
  1018. vllm/model_executor/model_loader/default_loader.py +327 -0
  1019. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  1020. vllm/model_executor/model_loader/gguf_loader.py +176 -0
  1021. vllm/model_executor/model_loader/online_quantization.py +224 -0
  1022. vllm/model_executor/model_loader/runai_streamer_loader.py +116 -0
  1023. vllm/model_executor/model_loader/sharded_state_loader.py +206 -0
  1024. vllm/model_executor/model_loader/tensorizer.py +790 -0
  1025. vllm/model_executor/model_loader/tensorizer_loader.py +151 -0
  1026. vllm/model_executor/model_loader/tpu.py +118 -0
  1027. vllm/model_executor/model_loader/utils.py +288 -0
  1028. vllm/model_executor/model_loader/weight_utils.py +1084 -0
  1029. vllm/model_executor/models/__init__.py +44 -0
  1030. vllm/model_executor/models/adapters.py +543 -0
  1031. vllm/model_executor/models/afmoe.py +711 -0
  1032. vllm/model_executor/models/aimv2.py +247 -0
  1033. vllm/model_executor/models/apertus.py +587 -0
  1034. vllm/model_executor/models/arcee.py +439 -0
  1035. vllm/model_executor/models/arctic.py +635 -0
  1036. vllm/model_executor/models/aria.py +655 -0
  1037. vllm/model_executor/models/aya_vision.py +450 -0
  1038. vllm/model_executor/models/baichuan.py +496 -0
  1039. vllm/model_executor/models/bailing_moe.py +646 -0
  1040. vllm/model_executor/models/bamba.py +522 -0
  1041. vllm/model_executor/models/bee.py +157 -0
  1042. vllm/model_executor/models/bert.py +925 -0
  1043. vllm/model_executor/models/bert_with_rope.py +732 -0
  1044. vllm/model_executor/models/blip.py +349 -0
  1045. vllm/model_executor/models/blip2.py +695 -0
  1046. vllm/model_executor/models/bloom.py +390 -0
  1047. vllm/model_executor/models/chameleon.py +1120 -0
  1048. vllm/model_executor/models/chatglm.py +498 -0
  1049. vllm/model_executor/models/clip.py +965 -0
  1050. vllm/model_executor/models/cohere2_vision.py +472 -0
  1051. vllm/model_executor/models/commandr.py +473 -0
  1052. vllm/model_executor/models/config.py +503 -0
  1053. vllm/model_executor/models/dbrx.py +482 -0
  1054. vllm/model_executor/models/deepencoder.py +673 -0
  1055. vllm/model_executor/models/deepseek_eagle.py +260 -0
  1056. vllm/model_executor/models/deepseek_mtp.py +360 -0
  1057. vllm/model_executor/models/deepseek_ocr.py +593 -0
  1058. vllm/model_executor/models/deepseek_v2.py +1649 -0
  1059. vllm/model_executor/models/deepseek_vl2.py +655 -0
  1060. vllm/model_executor/models/dots1.py +574 -0
  1061. vllm/model_executor/models/dots_ocr.py +900 -0
  1062. vllm/model_executor/models/ernie45.py +53 -0
  1063. vllm/model_executor/models/ernie45_moe.py +759 -0
  1064. vllm/model_executor/models/ernie45_vl.py +1742 -0
  1065. vllm/model_executor/models/ernie45_vl_moe.py +803 -0
  1066. vllm/model_executor/models/ernie_mtp.py +279 -0
  1067. vllm/model_executor/models/exaone.py +545 -0
  1068. vllm/model_executor/models/exaone4.py +531 -0
  1069. vllm/model_executor/models/fairseq2_llama.py +154 -0
  1070. vllm/model_executor/models/falcon.py +545 -0
  1071. vllm/model_executor/models/falcon_h1.py +685 -0
  1072. vllm/model_executor/models/flex_olmo.py +155 -0
  1073. vllm/model_executor/models/fuyu.py +373 -0
  1074. vllm/model_executor/models/gemma.py +426 -0
  1075. vllm/model_executor/models/gemma2.py +439 -0
  1076. vllm/model_executor/models/gemma3.py +571 -0
  1077. vllm/model_executor/models/gemma3_mm.py +741 -0
  1078. vllm/model_executor/models/gemma3n.py +1165 -0
  1079. vllm/model_executor/models/gemma3n_mm.py +811 -0
  1080. vllm/model_executor/models/glm.py +23 -0
  1081. vllm/model_executor/models/glm4.py +305 -0
  1082. vllm/model_executor/models/glm4_1v.py +1821 -0
  1083. vllm/model_executor/models/glm4_moe.py +747 -0
  1084. vllm/model_executor/models/glm4_moe_mtp.py +359 -0
  1085. vllm/model_executor/models/glm4v.py +784 -0
  1086. vllm/model_executor/models/gpt2.py +397 -0
  1087. vllm/model_executor/models/gpt_bigcode.py +339 -0
  1088. vllm/model_executor/models/gpt_j.py +346 -0
  1089. vllm/model_executor/models/gpt_neox.py +344 -0
  1090. vllm/model_executor/models/gpt_oss.py +738 -0
  1091. vllm/model_executor/models/granite.py +516 -0
  1092. vllm/model_executor/models/granite_speech.py +913 -0
  1093. vllm/model_executor/models/granitemoe.py +569 -0
  1094. vllm/model_executor/models/granitemoehybrid.py +709 -0
  1095. vllm/model_executor/models/granitemoeshared.py +333 -0
  1096. vllm/model_executor/models/gritlm.py +245 -0
  1097. vllm/model_executor/models/grok1.py +558 -0
  1098. vllm/model_executor/models/h2ovl.py +554 -0
  1099. vllm/model_executor/models/hunyuan_v1.py +1053 -0
  1100. vllm/model_executor/models/hyperclovax_vision.py +1166 -0
  1101. vllm/model_executor/models/idefics2_vision_model.py +426 -0
  1102. vllm/model_executor/models/idefics3.py +717 -0
  1103. vllm/model_executor/models/interfaces.py +1092 -0
  1104. vllm/model_executor/models/interfaces_base.py +214 -0
  1105. vllm/model_executor/models/intern_vit.py +453 -0
  1106. vllm/model_executor/models/internlm2.py +460 -0
  1107. vllm/model_executor/models/internlm2_ve.py +142 -0
  1108. vllm/model_executor/models/interns1.py +830 -0
  1109. vllm/model_executor/models/interns1_vit.py +432 -0
  1110. vllm/model_executor/models/internvl.py +1452 -0
  1111. vllm/model_executor/models/jais.py +397 -0
  1112. vllm/model_executor/models/jamba.py +610 -0
  1113. vllm/model_executor/models/jina_vl.py +147 -0
  1114. vllm/model_executor/models/keye.py +1761 -0
  1115. vllm/model_executor/models/keye_vl1_5.py +726 -0
  1116. vllm/model_executor/models/kimi_linear.py +663 -0
  1117. vllm/model_executor/models/kimi_vl.py +578 -0
  1118. vllm/model_executor/models/lfm2.py +532 -0
  1119. vllm/model_executor/models/lfm2_moe.py +762 -0
  1120. vllm/model_executor/models/lightonocr.py +195 -0
  1121. vllm/model_executor/models/llama.py +732 -0
  1122. vllm/model_executor/models/llama4.py +859 -0
  1123. vllm/model_executor/models/llama4_eagle.py +223 -0
  1124. vllm/model_executor/models/llama_eagle.py +218 -0
  1125. vllm/model_executor/models/llama_eagle3.py +367 -0
  1126. vllm/model_executor/models/llava.py +842 -0
  1127. vllm/model_executor/models/llava_next.py +583 -0
  1128. vllm/model_executor/models/llava_next_video.py +467 -0
  1129. vllm/model_executor/models/llava_onevision.py +923 -0
  1130. vllm/model_executor/models/longcat_flash.py +749 -0
  1131. vllm/model_executor/models/longcat_flash_mtp.py +349 -0
  1132. vllm/model_executor/models/mamba.py +276 -0
  1133. vllm/model_executor/models/mamba2.py +289 -0
  1134. vllm/model_executor/models/medusa.py +179 -0
  1135. vllm/model_executor/models/midashenglm.py +827 -0
  1136. vllm/model_executor/models/mimo.py +188 -0
  1137. vllm/model_executor/models/mimo_mtp.py +294 -0
  1138. vllm/model_executor/models/minicpm.py +664 -0
  1139. vllm/model_executor/models/minicpm3.py +242 -0
  1140. vllm/model_executor/models/minicpm_eagle.py +389 -0
  1141. vllm/model_executor/models/minicpmo.py +768 -0
  1142. vllm/model_executor/models/minicpmv.py +1745 -0
  1143. vllm/model_executor/models/minimax_m2.py +552 -0
  1144. vllm/model_executor/models/minimax_text_01.py +1012 -0
  1145. vllm/model_executor/models/minimax_vl_01.py +396 -0
  1146. vllm/model_executor/models/mistral3.py +637 -0
  1147. vllm/model_executor/models/mixtral.py +621 -0
  1148. vllm/model_executor/models/mllama4.py +1147 -0
  1149. vllm/model_executor/models/mlp_speculator.py +235 -0
  1150. vllm/model_executor/models/modernbert.py +450 -0
  1151. vllm/model_executor/models/module_mapping.py +74 -0
  1152. vllm/model_executor/models/molmo.py +1555 -0
  1153. vllm/model_executor/models/moonvit.py +677 -0
  1154. vllm/model_executor/models/mpt.py +335 -0
  1155. vllm/model_executor/models/nano_nemotron_vl.py +1740 -0
  1156. vllm/model_executor/models/nemotron.py +518 -0
  1157. vllm/model_executor/models/nemotron_h.py +852 -0
  1158. vllm/model_executor/models/nemotron_nas.py +491 -0
  1159. vllm/model_executor/models/nemotron_vl.py +653 -0
  1160. vllm/model_executor/models/nvlm_d.py +216 -0
  1161. vllm/model_executor/models/olmo.py +414 -0
  1162. vllm/model_executor/models/olmo2.py +454 -0
  1163. vllm/model_executor/models/olmoe.py +498 -0
  1164. vllm/model_executor/models/openpangu.py +1062 -0
  1165. vllm/model_executor/models/openpangu_mtp.py +265 -0
  1166. vllm/model_executor/models/opt.py +426 -0
  1167. vllm/model_executor/models/orion.py +372 -0
  1168. vllm/model_executor/models/ouro.py +516 -0
  1169. vllm/model_executor/models/ovis.py +559 -0
  1170. vllm/model_executor/models/ovis2_5.py +673 -0
  1171. vllm/model_executor/models/paddleocr_vl.py +1407 -0
  1172. vllm/model_executor/models/paligemma.py +412 -0
  1173. vllm/model_executor/models/persimmon.py +377 -0
  1174. vllm/model_executor/models/phi.py +374 -0
  1175. vllm/model_executor/models/phi3.py +18 -0
  1176. vllm/model_executor/models/phi3v.py +737 -0
  1177. vllm/model_executor/models/phi4_multimodal.py +1447 -0
  1178. vllm/model_executor/models/phi4mm.py +1253 -0
  1179. vllm/model_executor/models/phi4mm_audio.py +1296 -0
  1180. vllm/model_executor/models/phi4mm_utils.py +1907 -0
  1181. vllm/model_executor/models/phimoe.py +675 -0
  1182. vllm/model_executor/models/pixtral.py +1352 -0
  1183. vllm/model_executor/models/plamo2.py +981 -0
  1184. vllm/model_executor/models/qwen.py +368 -0
  1185. vllm/model_executor/models/qwen2.py +541 -0
  1186. vllm/model_executor/models/qwen2_5_omni_thinker.py +1246 -0
  1187. vllm/model_executor/models/qwen2_5_vl.py +1613 -0
  1188. vllm/model_executor/models/qwen2_audio.py +473 -0
  1189. vllm/model_executor/models/qwen2_moe.py +596 -0
  1190. vllm/model_executor/models/qwen2_rm.py +123 -0
  1191. vllm/model_executor/models/qwen2_vl.py +1670 -0
  1192. vllm/model_executor/models/qwen3.py +336 -0
  1193. vllm/model_executor/models/qwen3_moe.py +744 -0
  1194. vllm/model_executor/models/qwen3_next.py +1395 -0
  1195. vllm/model_executor/models/qwen3_next_mtp.py +296 -0
  1196. vllm/model_executor/models/qwen3_omni_moe_thinker.py +1721 -0
  1197. vllm/model_executor/models/qwen3_vl.py +1673 -0
  1198. vllm/model_executor/models/qwen3_vl_moe.py +415 -0
  1199. vllm/model_executor/models/qwen_vl.py +802 -0
  1200. vllm/model_executor/models/radio.py +555 -0
  1201. vllm/model_executor/models/registry.py +1155 -0
  1202. vllm/model_executor/models/roberta.py +259 -0
  1203. vllm/model_executor/models/rvl.py +107 -0
  1204. vllm/model_executor/models/seed_oss.py +497 -0
  1205. vllm/model_executor/models/siglip.py +1174 -0
  1206. vllm/model_executor/models/siglip2navit.py +724 -0
  1207. vllm/model_executor/models/skyworkr1v.py +953 -0
  1208. vllm/model_executor/models/smolvlm.py +38 -0
  1209. vllm/model_executor/models/solar.py +502 -0
  1210. vllm/model_executor/models/stablelm.py +359 -0
  1211. vllm/model_executor/models/starcoder2.py +367 -0
  1212. vllm/model_executor/models/step3_text.py +559 -0
  1213. vllm/model_executor/models/step3_vl.py +1148 -0
  1214. vllm/model_executor/models/swin.py +514 -0
  1215. vllm/model_executor/models/tarsier.py +619 -0
  1216. vllm/model_executor/models/telechat2.py +153 -0
  1217. vllm/model_executor/models/teleflm.py +78 -0
  1218. vllm/model_executor/models/terratorch.py +319 -0
  1219. vllm/model_executor/models/transformers/__init__.py +127 -0
  1220. vllm/model_executor/models/transformers/base.py +464 -0
  1221. vllm/model_executor/models/transformers/causal.py +65 -0
  1222. vllm/model_executor/models/transformers/legacy.py +90 -0
  1223. vllm/model_executor/models/transformers/moe.py +318 -0
  1224. vllm/model_executor/models/transformers/multimodal.py +411 -0
  1225. vllm/model_executor/models/transformers/pooling.py +119 -0
  1226. vllm/model_executor/models/transformers/utils.py +207 -0
  1227. vllm/model_executor/models/ultravox.py +681 -0
  1228. vllm/model_executor/models/utils.py +877 -0
  1229. vllm/model_executor/models/vision.py +552 -0
  1230. vllm/model_executor/models/voxtral.py +845 -0
  1231. vllm/model_executor/models/whisper.py +959 -0
  1232. vllm/model_executor/models/zamba2.py +986 -0
  1233. vllm/model_executor/parameter.py +642 -0
  1234. vllm/model_executor/utils.py +94 -0
  1235. vllm/model_executor/warmup/__init__.py +0 -0
  1236. vllm/model_executor/warmup/deep_gemm_warmup.py +314 -0
  1237. vllm/model_executor/warmup/kernel_warmup.py +98 -0
  1238. vllm/multimodal/__init__.py +40 -0
  1239. vllm/multimodal/audio.py +118 -0
  1240. vllm/multimodal/base.py +26 -0
  1241. vllm/multimodal/cache.py +755 -0
  1242. vllm/multimodal/evs.py +294 -0
  1243. vllm/multimodal/hasher.py +106 -0
  1244. vllm/multimodal/image.py +130 -0
  1245. vllm/multimodal/inputs.py +1036 -0
  1246. vllm/multimodal/parse.py +544 -0
  1247. vllm/multimodal/processing.py +2186 -0
  1248. vllm/multimodal/profiling.py +369 -0
  1249. vllm/multimodal/registry.py +360 -0
  1250. vllm/multimodal/utils.py +512 -0
  1251. vllm/multimodal/video.py +306 -0
  1252. vllm/outputs.py +345 -0
  1253. vllm/platforms/__init__.py +277 -0
  1254. vllm/platforms/cpu.py +414 -0
  1255. vllm/platforms/cuda.py +657 -0
  1256. vllm/platforms/interface.py +639 -0
  1257. vllm/platforms/rocm.py +466 -0
  1258. vllm/platforms/tpu.py +276 -0
  1259. vllm/platforms/xpu.py +274 -0
  1260. vllm/plugins/__init__.py +78 -0
  1261. vllm/plugins/io_processors/__init__.py +68 -0
  1262. vllm/plugins/io_processors/interface.py +77 -0
  1263. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1264. vllm/plugins/lora_resolvers/filesystem_resolver.py +52 -0
  1265. vllm/pooling_params.py +228 -0
  1266. vllm/profiler/__init__.py +0 -0
  1267. vllm/profiler/gpu_profiler.py +37 -0
  1268. vllm/profiler/layerwise_profile.py +392 -0
  1269. vllm/profiler/utils.py +151 -0
  1270. vllm/py.typed +2 -0
  1271. vllm/ray/__init__.py +0 -0
  1272. vllm/ray/lazy_utils.py +26 -0
  1273. vllm/ray/ray_env.py +79 -0
  1274. vllm/reasoning/__init__.py +92 -0
  1275. vllm/reasoning/abs_reasoning_parsers.py +290 -0
  1276. vllm/reasoning/basic_parsers.py +162 -0
  1277. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1278. vllm/reasoning/deepseek_v3_reasoning_parser.py +62 -0
  1279. vllm/reasoning/ernie45_reasoning_parser.py +165 -0
  1280. vllm/reasoning/glm4_moe_reasoning_parser.py +171 -0
  1281. vllm/reasoning/gptoss_reasoning_parser.py +173 -0
  1282. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1283. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +237 -0
  1284. vllm/reasoning/identity_reasoning_parser.py +58 -0
  1285. vllm/reasoning/minimax_m2_reasoning_parser.py +67 -0
  1286. vllm/reasoning/mistral_reasoning_parser.py +55 -0
  1287. vllm/reasoning/olmo3_reasoning_parser.py +302 -0
  1288. vllm/reasoning/qwen3_reasoning_parser.py +67 -0
  1289. vllm/reasoning/seedoss_reasoning_parser.py +27 -0
  1290. vllm/reasoning/step3_reasoning_parser.py +107 -0
  1291. vllm/sampling_params.py +669 -0
  1292. vllm/scalar_type.py +355 -0
  1293. vllm/scripts.py +17 -0
  1294. vllm/sequence.py +98 -0
  1295. vllm/tasks.py +13 -0
  1296. vllm/third_party/__init__.py +0 -0
  1297. vllm/third_party/pynvml.py +6140 -0
  1298. vllm/tracing.py +135 -0
  1299. vllm/transformers_utils/__init__.py +26 -0
  1300. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1301. vllm/transformers_utils/chat_templates/registry.py +73 -0
  1302. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1303. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1304. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1305. vllm/transformers_utils/chat_templates/template_deepseek_ocr.jinja +14 -0
  1306. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1307. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1308. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1309. vllm/transformers_utils/config.py +1203 -0
  1310. vllm/transformers_utils/config_parser_base.py +20 -0
  1311. vllm/transformers_utils/configs/__init__.py +70 -0
  1312. vllm/transformers_utils/configs/afmoe.py +84 -0
  1313. vllm/transformers_utils/configs/arctic.py +206 -0
  1314. vllm/transformers_utils/configs/chatglm.py +75 -0
  1315. vllm/transformers_utils/configs/deepseek_vl2.py +126 -0
  1316. vllm/transformers_utils/configs/dotsocr.py +71 -0
  1317. vllm/transformers_utils/configs/eagle.py +84 -0
  1318. vllm/transformers_utils/configs/falcon.py +89 -0
  1319. vllm/transformers_utils/configs/flex_olmo.py +77 -0
  1320. vllm/transformers_utils/configs/jais.py +243 -0
  1321. vllm/transformers_utils/configs/kimi_linear.py +144 -0
  1322. vllm/transformers_utils/configs/kimi_vl.py +38 -0
  1323. vllm/transformers_utils/configs/lfm2_moe.py +159 -0
  1324. vllm/transformers_utils/configs/medusa.py +65 -0
  1325. vllm/transformers_utils/configs/midashenglm.py +103 -0
  1326. vllm/transformers_utils/configs/mistral.py +174 -0
  1327. vllm/transformers_utils/configs/mlp_speculator.py +69 -0
  1328. vllm/transformers_utils/configs/moonvit.py +33 -0
  1329. vllm/transformers_utils/configs/nemotron.py +212 -0
  1330. vllm/transformers_utils/configs/nemotron_h.py +282 -0
  1331. vllm/transformers_utils/configs/olmo3.py +79 -0
  1332. vllm/transformers_utils/configs/ovis.py +182 -0
  1333. vllm/transformers_utils/configs/qwen3_next.py +274 -0
  1334. vllm/transformers_utils/configs/radio.py +89 -0
  1335. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1336. vllm/transformers_utils/configs/speculators/algos.py +38 -0
  1337. vllm/transformers_utils/configs/speculators/base.py +114 -0
  1338. vllm/transformers_utils/configs/step3_vl.py +174 -0
  1339. vllm/transformers_utils/configs/ultravox.py +118 -0
  1340. vllm/transformers_utils/detokenizer_utils.py +198 -0
  1341. vllm/transformers_utils/dynamic_module.py +59 -0
  1342. vllm/transformers_utils/processor.py +402 -0
  1343. vllm/transformers_utils/processors/__init__.py +15 -0
  1344. vllm/transformers_utils/processors/deepseek_ocr.py +438 -0
  1345. vllm/transformers_utils/processors/deepseek_vl2.py +406 -0
  1346. vllm/transformers_utils/processors/ovis.py +453 -0
  1347. vllm/transformers_utils/processors/ovis2_5.py +468 -0
  1348. vllm/transformers_utils/runai_utils.py +104 -0
  1349. vllm/transformers_utils/s3_utils.py +95 -0
  1350. vllm/transformers_utils/tokenizer.py +293 -0
  1351. vllm/transformers_utils/tokenizer_base.py +155 -0
  1352. vllm/transformers_utils/tokenizers/__init__.py +16 -0
  1353. vllm/transformers_utils/tokenizers/mistral.py +502 -0
  1354. vllm/transformers_utils/utils.py +130 -0
  1355. vllm/triton_utils/__init__.py +19 -0
  1356. vllm/triton_utils/importing.py +103 -0
  1357. vllm/usage/__init__.py +0 -0
  1358. vllm/usage/usage_lib.py +294 -0
  1359. vllm/utils/__init__.py +82 -0
  1360. vllm/utils/argparse_utils.py +487 -0
  1361. vllm/utils/async_utils.py +303 -0
  1362. vllm/utils/cache.py +214 -0
  1363. vllm/utils/collection_utils.py +139 -0
  1364. vllm/utils/counter.py +45 -0
  1365. vllm/utils/deep_gemm.py +391 -0
  1366. vllm/utils/flashinfer.py +490 -0
  1367. vllm/utils/func_utils.py +236 -0
  1368. vllm/utils/gc_utils.py +147 -0
  1369. vllm/utils/hashing.py +63 -0
  1370. vllm/utils/import_utils.py +411 -0
  1371. vllm/utils/jsontree.py +165 -0
  1372. vllm/utils/math_utils.py +32 -0
  1373. vllm/utils/mem_constants.py +13 -0
  1374. vllm/utils/mem_utils.py +232 -0
  1375. vllm/utils/nccl.py +64 -0
  1376. vllm/utils/network_utils.py +331 -0
  1377. vllm/utils/platform_utils.py +59 -0
  1378. vllm/utils/profiling.py +56 -0
  1379. vllm/utils/registry.py +49 -0
  1380. vllm/utils/serial_utils.py +169 -0
  1381. vllm/utils/system_utils.py +229 -0
  1382. vllm/utils/tensor_schema.py +255 -0
  1383. vllm/utils/torch_utils.py +657 -0
  1384. vllm/v1/__init__.py +0 -0
  1385. vllm/v1/attention/__init__.py +0 -0
  1386. vllm/v1/attention/backends/__init__.py +0 -0
  1387. vllm/v1/attention/backends/cpu_attn.py +496 -0
  1388. vllm/v1/attention/backends/flash_attn.py +1028 -0
  1389. vllm/v1/attention/backends/flashinfer.py +1572 -0
  1390. vllm/v1/attention/backends/flex_attention.py +926 -0
  1391. vllm/v1/attention/backends/gdn_attn.py +387 -0
  1392. vllm/v1/attention/backends/linear_attn.py +74 -0
  1393. vllm/v1/attention/backends/mamba1_attn.py +165 -0
  1394. vllm/v1/attention/backends/mamba2_attn.py +354 -0
  1395. vllm/v1/attention/backends/mamba_attn.py +115 -0
  1396. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1397. vllm/v1/attention/backends/mla/common.py +2031 -0
  1398. vllm/v1/attention/backends/mla/cutlass_mla.py +275 -0
  1399. vllm/v1/attention/backends/mla/flashattn_mla.py +337 -0
  1400. vllm/v1/attention/backends/mla/flashinfer_mla.py +171 -0
  1401. vllm/v1/attention/backends/mla/flashmla.py +314 -0
  1402. vllm/v1/attention/backends/mla/flashmla_sparse.py +548 -0
  1403. vllm/v1/attention/backends/mla/indexer.py +362 -0
  1404. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +294 -0
  1405. vllm/v1/attention/backends/mla/triton_mla.py +171 -0
  1406. vllm/v1/attention/backends/pallas.py +436 -0
  1407. vllm/v1/attention/backends/rocm_aiter_fa.py +816 -0
  1408. vllm/v1/attention/backends/rocm_aiter_unified_attn.py +196 -0
  1409. vllm/v1/attention/backends/rocm_attn.py +362 -0
  1410. vllm/v1/attention/backends/short_conv_attn.py +105 -0
  1411. vllm/v1/attention/backends/tree_attn.py +425 -0
  1412. vllm/v1/attention/backends/triton_attn.py +373 -0
  1413. vllm/v1/attention/backends/utils.py +1116 -0
  1414. vllm/v1/attention/backends/xformers.py +417 -0
  1415. vllm/v1/core/__init__.py +0 -0
  1416. vllm/v1/core/block_pool.py +428 -0
  1417. vllm/v1/core/encoder_cache_manager.py +343 -0
  1418. vllm/v1/core/kv_cache_coordinator.py +480 -0
  1419. vllm/v1/core/kv_cache_manager.py +420 -0
  1420. vllm/v1/core/kv_cache_utils.py +1340 -0
  1421. vllm/v1/core/sched/__init__.py +0 -0
  1422. vllm/v1/core/sched/async_scheduler.py +62 -0
  1423. vllm/v1/core/sched/interface.py +181 -0
  1424. vllm/v1/core/sched/output.py +202 -0
  1425. vllm/v1/core/sched/request_queue.py +221 -0
  1426. vllm/v1/core/sched/scheduler.py +1617 -0
  1427. vllm/v1/core/sched/utils.py +72 -0
  1428. vllm/v1/core/single_type_kv_cache_manager.py +736 -0
  1429. vllm/v1/cudagraph_dispatcher.py +148 -0
  1430. vllm/v1/engine/__init__.py +206 -0
  1431. vllm/v1/engine/async_llm.py +797 -0
  1432. vllm/v1/engine/coordinator.py +377 -0
  1433. vllm/v1/engine/core.py +1420 -0
  1434. vllm/v1/engine/core_client.py +1400 -0
  1435. vllm/v1/engine/detokenizer.py +351 -0
  1436. vllm/v1/engine/exceptions.py +18 -0
  1437. vllm/v1/engine/llm_engine.py +408 -0
  1438. vllm/v1/engine/logprobs.py +182 -0
  1439. vllm/v1/engine/output_processor.py +642 -0
  1440. vllm/v1/engine/parallel_sampling.py +145 -0
  1441. vllm/v1/engine/processor.py +621 -0
  1442. vllm/v1/engine/utils.py +1072 -0
  1443. vllm/v1/executor/__init__.py +6 -0
  1444. vllm/v1/executor/abstract.py +352 -0
  1445. vllm/v1/executor/multiproc_executor.py +877 -0
  1446. vllm/v1/executor/ray_distributed_executor.py +8 -0
  1447. vllm/v1/executor/ray_executor.py +626 -0
  1448. vllm/v1/executor/ray_utils.py +465 -0
  1449. vllm/v1/executor/uniproc_executor.py +183 -0
  1450. vllm/v1/kv_cache_interface.py +403 -0
  1451. vllm/v1/kv_offload/__init__.py +0 -0
  1452. vllm/v1/kv_offload/abstract.py +161 -0
  1453. vllm/v1/kv_offload/arc_manager.py +237 -0
  1454. vllm/v1/kv_offload/backend.py +97 -0
  1455. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1456. vllm/v1/kv_offload/backends/cpu.py +62 -0
  1457. vllm/v1/kv_offload/cpu.py +93 -0
  1458. vllm/v1/kv_offload/factory.py +56 -0
  1459. vllm/v1/kv_offload/lru_manager.py +139 -0
  1460. vllm/v1/kv_offload/mediums.py +39 -0
  1461. vllm/v1/kv_offload/spec.py +62 -0
  1462. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1463. vllm/v1/kv_offload/worker/cpu_gpu.py +185 -0
  1464. vllm/v1/kv_offload/worker/worker.py +144 -0
  1465. vllm/v1/metrics/__init__.py +0 -0
  1466. vllm/v1/metrics/loggers.py +1238 -0
  1467. vllm/v1/metrics/prometheus.py +82 -0
  1468. vllm/v1/metrics/ray_wrappers.py +169 -0
  1469. vllm/v1/metrics/reader.py +257 -0
  1470. vllm/v1/metrics/stats.py +420 -0
  1471. vllm/v1/outputs.py +249 -0
  1472. vllm/v1/pool/__init__.py +0 -0
  1473. vllm/v1/pool/metadata.py +82 -0
  1474. vllm/v1/request.py +259 -0
  1475. vllm/v1/sample/__init__.py +0 -0
  1476. vllm/v1/sample/logits_processor/__init__.py +352 -0
  1477. vllm/v1/sample/logits_processor/builtin.py +274 -0
  1478. vllm/v1/sample/logits_processor/interface.py +106 -0
  1479. vllm/v1/sample/logits_processor/state.py +165 -0
  1480. vllm/v1/sample/metadata.py +44 -0
  1481. vllm/v1/sample/ops/__init__.py +0 -0
  1482. vllm/v1/sample/ops/bad_words.py +52 -0
  1483. vllm/v1/sample/ops/logprobs.py +25 -0
  1484. vllm/v1/sample/ops/penalties.py +57 -0
  1485. vllm/v1/sample/ops/topk_topp_sampler.py +290 -0
  1486. vllm/v1/sample/rejection_sampler.py +793 -0
  1487. vllm/v1/sample/sampler.py +316 -0
  1488. vllm/v1/sample/tpu/__init__.py +0 -0
  1489. vllm/v1/sample/tpu/metadata.py +120 -0
  1490. vllm/v1/sample/tpu/sampler.py +215 -0
  1491. vllm/v1/serial_utils.py +532 -0
  1492. vllm/v1/spec_decode/__init__.py +0 -0
  1493. vllm/v1/spec_decode/eagle.py +1225 -0
  1494. vllm/v1/spec_decode/medusa.py +73 -0
  1495. vllm/v1/spec_decode/metadata.py +66 -0
  1496. vllm/v1/spec_decode/metrics.py +224 -0
  1497. vllm/v1/spec_decode/ngram_proposer.py +291 -0
  1498. vllm/v1/spec_decode/suffix_decoding.py +103 -0
  1499. vllm/v1/spec_decode/utils.py +16 -0
  1500. vllm/v1/structured_output/__init__.py +338 -0
  1501. vllm/v1/structured_output/backend_guidance.py +265 -0
  1502. vllm/v1/structured_output/backend_lm_format_enforcer.py +177 -0
  1503. vllm/v1/structured_output/backend_outlines.py +324 -0
  1504. vllm/v1/structured_output/backend_types.py +136 -0
  1505. vllm/v1/structured_output/backend_xgrammar.py +362 -0
  1506. vllm/v1/structured_output/request.py +94 -0
  1507. vllm/v1/structured_output/utils.py +469 -0
  1508. vllm/v1/utils.py +414 -0
  1509. vllm/v1/worker/__init__.py +0 -0
  1510. vllm/v1/worker/block_table.py +327 -0
  1511. vllm/v1/worker/cpu_model_runner.py +122 -0
  1512. vllm/v1/worker/cpu_worker.py +206 -0
  1513. vllm/v1/worker/dp_utils.py +230 -0
  1514. vllm/v1/worker/ec_connector_model_runner_mixin.py +87 -0
  1515. vllm/v1/worker/gpu_input_batch.py +975 -0
  1516. vllm/v1/worker/gpu_model_runner.py +5102 -0
  1517. vllm/v1/worker/gpu_ubatch_wrapper.py +466 -0
  1518. vllm/v1/worker/gpu_worker.py +894 -0
  1519. vllm/v1/worker/kv_connector_model_runner_mixin.py +144 -0
  1520. vllm/v1/worker/lora_model_runner_mixin.py +213 -0
  1521. vllm/v1/worker/tpu_input_batch.py +593 -0
  1522. vllm/v1/worker/tpu_model_runner.py +2173 -0
  1523. vllm/v1/worker/tpu_worker.py +355 -0
  1524. vllm/v1/worker/ubatch_utils.py +73 -0
  1525. vllm/v1/worker/ubatching.py +231 -0
  1526. vllm/v1/worker/utils.py +366 -0
  1527. vllm/v1/worker/worker_base.py +375 -0
  1528. vllm/v1/worker/xpu_model_runner.py +55 -0
  1529. vllm/v1/worker/xpu_worker.py +189 -0
  1530. vllm/version.py +39 -0
  1531. vllm/vllm_flash_attn/.gitkeep +0 -0
  1532. vllm_cpu_amxbf16-0.11.2.post2.dist-info/METADATA +345 -0
  1533. vllm_cpu_amxbf16-0.11.2.post2.dist-info/RECORD +1536 -0
  1534. vllm_cpu_amxbf16-0.11.2.post2.dist-info/WHEEL +5 -0
  1535. vllm_cpu_amxbf16-0.11.2.post2.dist-info/entry_points.txt +5 -0
  1536. vllm_cpu_amxbf16-0.11.2.post2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,3222 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+ """
4
+ This module defines a framework for sampling benchmark requests from various
5
+ datasets. Each dataset subclass of BenchmarkDataset must implement sample
6
+ generation. Supported dataset types include:
7
+ - ShareGPT
8
+ - Random (synthetic)
9
+ - Sonnet
10
+ - BurstGPT
11
+ - HuggingFace
12
+ - VisionArena
13
+ """
14
+
15
+ import argparse
16
+ import ast
17
+ import base64
18
+ import io
19
+ import json
20
+ import logging
21
+ import math
22
+ import random
23
+ from abc import ABC, abstractmethod
24
+ from collections.abc import Callable, Iterator, Mapping
25
+ from contextlib import suppress
26
+ from copy import deepcopy
27
+ from dataclasses import dataclass
28
+ from functools import cache
29
+ from io import BytesIO
30
+ from tempfile import NamedTemporaryFile
31
+ from typing import Any, cast
32
+
33
+ import numpy as np
34
+ from PIL import Image
35
+ from transformers import PreTrainedTokenizerBase
36
+ from typing_extensions import deprecated
37
+
38
+ from vllm.lora.request import LoRARequest
39
+ from vllm.lora.utils import get_adapter_absolute_path
40
+ from vllm.multimodal import MultiModalDataDict
41
+ from vllm.multimodal.image import convert_image_mode
42
+ from vllm.transformers_utils.tokenizer import AnyTokenizer
43
+ from vllm.utils.import_utils import PlaceholderModule
44
+
45
+ try:
46
+ from datasets import load_dataset
47
+ except ImportError:
48
+ datasets = PlaceholderModule("datasets")
49
+ load_dataset = datasets.placeholder_attr("load_dataset")
50
+
51
+ try:
52
+ import pandas as pd
53
+ except ImportError:
54
+ pd = PlaceholderModule("pandas")
55
+
56
+ try:
57
+ import librosa
58
+ except ImportError:
59
+ librosa = PlaceholderModule("librosa")
60
+
61
+ try:
62
+ from vllm.utils.argparse_utils import FlexibleArgumentParser
63
+ except ImportError:
64
+ from argparse import ArgumentParser as FlexibleArgumentParser
65
+
66
+ logger = logging.getLogger(__name__)
67
+
68
+ # -----------------------------------------------------------------------------
69
+ # Data Classes
70
+ # -----------------------------------------------------------------------------
71
+
72
+
73
@dataclass
class SampleRequest:
    """
    One inference request handed to a benchmark.

    Bundles the prompt (a single string, or a list of strings when several
    prompts are batched into one request), its length, the number of output
    tokens the benchmark expects, and optional extras.
    """

    # Prompt text; a list when several prompts are grouped into one request.
    prompt: str | list[str]
    # Length of the prompt (token count as computed by the dataset sampler).
    prompt_len: int
    # Number of output tokens expected for this request.
    expected_output_len: int
    # Optional multimodal payload accompanying the prompt.
    multi_modal_data: MultiModalDataDict | dict | list[dict] | None = None
    # Optional LoRA request to attach.
    lora_request: LoRARequest | None = None
    # Optional identifier of the request.
    request_id: str | None = None
85
+
86
+
87
+ # -----------------------------------------------------------------------------
88
+ # Benchmark Dataset Base Class
89
+ # -----------------------------------------------------------------------------
90
+
91
+
92
class BenchmarkDataset(ABC):
    """
    Abstract base class for benchmark datasets.

    Subclasses implement `sample` (and, where data must be read from
    `dataset_path`, `load_data`). The base class stores common configuration
    and offers helpers for chat formatting, LoRA request selection and
    oversampling.
    """

    DEFAULT_SEED = 0
    IS_MULTIMODAL = False

    def __init__(
        self,
        dataset_path: str | None = None,
        random_seed: int = DEFAULT_SEED,
        disable_shuffle: bool = False,
        **kwargs,
    ) -> None:
        """
        Initialize the BenchmarkDataset with an optional dataset path and random
        seed.

        Args:
            dataset_path (Optional[str]): Path to the dataset. If None, it
                indicates that a default or random dataset might be used.
            random_seed (int): Seed value for reproducible shuffling or
                sampling. Defaults to DEFAULT_SEED.
            disable_shuffle (bool): Stored on the instance as
                `self.disable_shuffle`; not otherwise used by this base class.
            **kwargs: Accepted and ignored by this base class.
        """
        self.dataset_path = dataset_path
        # Set the random seed, ensuring that a None value is replaced with the
        # default seed.
        self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
        self.disable_shuffle = disable_shuffle
        self.data = None

    def apply_multimodal_chat_transformation(
        self,
        prompt: str,
        mm_content: MultiModalDataDict | dict | list[dict] | None = None,
    ) -> list[dict]:
        """
        Transform a prompt and optional multimodal content into a chat format.
        This method is used for chat models that expect a specific conversation
        format.

        Args:
            prompt (str): Text placed first in the message content.
            mm_content: A dict (appended as one content item), a list of dicts
                (all appended), or None (text only).

        Returns:
            list[dict]: A single-element list holding one "user" message.

        Raises:
            TypeError: If `mm_content` is neither a list, a dict, nor None.
        """
        content = [{"text": prompt, "type": "text"}]
        if mm_content is not None:
            if isinstance(mm_content, list):
                content.extend(cast(list[dict[str, Any]], mm_content))
            elif isinstance(mm_content, dict):
                content.append(mm_content)
            else:
                raise TypeError(
                    "Could not process multimodal content of type: "
                    + f"{type(mm_content)}"
                )
        return [{"role": "user", "content": content}]

    def load_data(self) -> None:
        """
        Load data from the dataset path into self.data.

        This method must be overridden by subclasses since the method to load
        data will vary depending on the dataset format and source.

        Raises:
            NotImplementedError: If a subclass does not implement this method.
        """
        # TODO (jenniferzhao): add support for downloading data
        raise NotImplementedError("load_data must be implemented in subclasses.")

    def get_random_lora_request(
        self,
        max_loras: int | None = None,
        lora_path: str | None = None,
    ) -> LoRARequest | None:
        """
        Optionally select a random LoRA request.

        This method is used when LoRA parameters are provided. It randomly
        selects a LoRA based on max_loras.

        Args:
            max_loras (Optional[int]): The maximum number of LoRAs available.
                If `None`, LoRA is not used.
            lora_path (Optional[str]): Path to the LoRA parameters on disk.
                If `None`, LoRA is not used.

        Returns:
            A new [`LoRARequest`][vllm.lora.request.LoRARequest]
            (or `None` if not applicable).
        """
        if max_loras is None or lora_path is None:
            return None

        # Generate a random LoRA ID in the range [1, max_loras].
        lora_id = random.randint(1, max_loras)
        return LoRARequest(
            lora_name=str(lora_id),
            lora_int_id=lora_id,
            lora_path=lora_path_on_disk(lora_path),
        )

    @abstractmethod
    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        request_id_prefix: str = "",
        no_oversample: bool = False,
    ) -> list[SampleRequest]:
        """
        Abstract method to generate sample requests from the dataset.

        Subclasses must override this method to implement dataset-specific logic
        for generating a list of SampleRequest objects.

        Args:
            tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
                for processing the dataset's text.
            num_requests (int): The number of sample requests to generate.
            request_id_prefix (str): The prefix of request_id.
            no_oversample (bool): Flag that `maybe_oversample_requests` accepts
                to skip oversampling; how a subclass uses it is up to the
                subclass.

        Returns:
            list[SampleRequest]: A list of sample requests generated from the
            dataset.
        """
        raise NotImplementedError("sample must be implemented in subclasses.")

    def maybe_oversample_requests(
        self,
        requests: list[SampleRequest],
        num_requests: int,
        request_id_prefix: str = "",
        no_oversample: bool = False,
    ) -> None:
        """
        Oversamples the list of requests if its size is less than the desired
        number. The list is extended in place.

        Args:
            requests (List[SampleRequest]): The current list of sampled
                requests.
            num_requests (int): The target number of requests.
            request_id_prefix (str): The prefix applied to generated request
                identifiers.
            no_oversample (bool): If True, log and return immediately without
                oversampling (the duplicate-id check is skipped as well).

        Raises:
            ValueError: If oversampling is required but `requests` is empty, or
                if the resulting list contains duplicate request ids.
        """
        if no_oversample:
            logger.info("Skipping oversampling. Total samples: %d.", len(requests))
            return

        if len(requests) < num_requests:
            if not requests:
                # random.choice([]) would fail with an opaque IndexError;
                # there is nothing to duplicate, so fail with a clear message.
                raise ValueError(
                    "Cannot oversample an empty list of requests to reach "
                    f"{num_requests} samples."
                )
            # NOTE: this reseeds the global `random` module state.
            random.seed(self.random_seed)
            base_count = len(requests)
            additional = []
            for i in range(num_requests - base_count):
                req = deepcopy(random.choice(requests))
                req.request_id = request_id_prefix + str(base_count + i)
                additional.append(req)
            requests.extend(additional)
            logger.info("Oversampled requests to reach %d total samples.", num_requests)

        ids = [req.request_id for req in requests]
        if len(ids) != len(set(ids)):
            raise ValueError(
                "Duplicate request_id found in the sampled "
                "requests. Please ensure that each request_id "
                "is unique."
            )
256
+
257
+
258
+ # -----------------------------------------------------------------------------
259
+ # Utility Functions and Global Caches
260
+ # -----------------------------------------------------------------------------
261
+
262
+
263
def is_valid_sequence(
    prompt_len: int,
    output_len: int,
    min_len: int = 4,
    max_prompt_len: int = 1024,
    max_total_len: int = 2048,
    skip_min_output_len_check: bool = False,
) -> bool:
    """
    Decide whether a (prompt, output) length pair passes the pruning criteria.

    A pair is rejected when the prompt is shorter than `min_len` or longer
    than `max_prompt_len`, when the output is shorter than `min_len` (unless
    `skip_min_output_len_check` is set), or when prompt and output together
    exceed `max_total_len`.

    Default pruning criteria are copied from the original `sample_hf_requests`
    and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
    from `sample_requests` in benchmark_throughput.py.
    """
    # Guard clauses: each one rejects a single invalid condition.
    if prompt_len < min_len:
        return False
    if not skip_min_output_len_check and output_len < min_len:
        return False
    if prompt_len > max_prompt_len:
        return False
    return not (prompt_len + output_len > max_total_len)
288
+
289
+
290
@cache
def lora_path_on_disk(lora_path: str) -> str:
    """
    Return `get_adapter_absolute_path(lora_path)`.

    Results are memoised per `lora_path` by `functools.cache`, so the helper is
    invoked at most once for each distinct path.
    """
    resolved_path = get_adapter_absolute_path(lora_path)
    return resolved_path
293
+
294
+
295
+ # Global cache for LoRA tokenizers.
296
+ lora_tokenizer_cache: dict[int, AnyTokenizer] = {}
297
+
298
+
299
def process_image(image: Any) -> Mapping[str, Any]:
    """
    Process a single image input and return a multimedia content dictionary.

    Supports the following input types:

    1. String input: treated as a URL or local file path. "file://" is
       prepended unless the string already starts with "http://", "https://"
       or "file://". The URL is returned as-is in the result dictionary.

    2. Dictionary with raw image bytes: a dict with a 'bytes' key whose value
       is opened as a PIL.Image.Image and then handled as in case 3.

    3. PIL.Image.Image input: converted to RGB, saved as an in-memory JPEG,
       base64-encoded and returned as a base64 data URL.

    Raises:
        ValueError: If the input is not a supported type.
    """
    # Strings never need decoding, so handle them first.
    if isinstance(image, str):
        already_qualified = image.startswith(("http://", "https://", "file://"))
        url = image if already_qualified else f"file://{image}"
        return {"type": "image_url", "image_url": {"url": url}}

    # Raw bytes are first turned into a PIL image, then share the PIL path.
    if isinstance(image, dict) and "bytes" in image:
        image = Image.open(BytesIO(image["bytes"]))

    if not isinstance(image, Image.Image):
        raise ValueError(
            f"Invalid image input {image}. Must be a PIL.Image.Image"
            " or str or dictionary with raw image bytes."
        )

    rgb_image = convert_image_mode(image, "RGB")
    with io.BytesIO() as jpeg_buffer:
        rgb_image.save(jpeg_buffer, format="JPEG")
        image_base64 = base64.b64encode(jpeg_buffer.getvalue()).decode("utf-8")
    return {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
    }
343
+
344
+
345
def process_video(video: Any) -> Mapping[str, Any]:
    """
    Process a single video input and return a multimedia content dictionary.

    Supports the following input types:

    1. String input: treated as a URL or local file path. "file://" is
       prepended unless the string already starts with "http://", "https://"
       or "file://". Returns a dictionary with the video URL.

    2. Dictionary with raw video bytes: a dict with a 'bytes' key whose value
       is base64-encoded and returned as a "data:video/mp4;base64,..." URL.

    Raises:
        ValueError: If the input is not a supported type.
    """
    if isinstance(video, str):
        known_prefixes = ("http://", "https://", "file://")
        video_url = video if video.startswith(known_prefixes) else f"file://{video}"
        return {"type": "video_url", "video_url": {"url": video_url}}

    if isinstance(video, dict) and "bytes" in video:
        encoded = base64.b64encode(video["bytes"]).decode("utf-8")
        return {
            "type": "video_url",
            "video_url": {"url": f"data:video/mp4;base64,{encoded}"},
        }

    raise ValueError(
        f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`."  # noqa: E501
    )
380
+
381
+
382
def gen_prompt_decode_to_target_len(
    tokenizer: PreTrainedTokenizerBase,
    token_sequence: list[int],
    target_token_len: int,
    max_retry: int = 10,
    add_special_tokens: bool = False,
    rng: np.random.Generator | None = None,
) -> tuple[str, list[int], int]:
    """
    Ensure decoded-then-encoded prompt length matches the target token length.

    This function decodes an initial token sequence to text and re-encodes it,
    iteratively adjusting the token sequence length to match a target.
    This is necessary because some tokenizers do not guarantee a 1:1 mapping
    between consecutive tokens and the decoded-then-encoded sequence length.
    For example, for GPT2Tokenizer:
    [6880, 6881] -> ['Ġcalls', 'here'] ->
    [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']

    Args:
        tokenizer: Object providing `decode`, `encode` and `vocab_size`.
        token_sequence: Initial token ids.
        target_token_len: Desired length of the re-encoded sequence.
        max_retry: Number of adjustment rounds allowed before giving up.
        add_special_tokens: Forwarded to `tokenizer.encode`.
        rng: Generator used to draw padding tokens; when None the global
            `np.random` state is used instead.

    Returns:
        A 3-tuple `(prompt, token_sequence, token_mismatch)`: the final prompt
        string, the adjusted token sequence, and the difference between the
        final sequence length and `target_token_len` (0 when the target was
        reached, positive when too long, negative when too short).
    """
    remain_num_try = max_retry
    token_mismatch = 0
    while True:
        prompt = tokenizer.decode(token_sequence)
        token_sequence = tokenizer.encode(prompt, add_special_tokens=add_special_tokens)
        current_len = len(token_sequence)

        if remain_num_try <= 0:
            # Out of retries: report whatever difference is left (0 if none).
            token_mismatch = current_len - target_token_len
            break

        if current_len == target_token_len:
            break

        if current_len < target_token_len:
            # Too short: pad with random token ids drawn from the vocabulary.
            missing = target_token_len - current_len
            if rng is not None:
                extra_tokens = rng.integers(
                    0, tokenizer.vocab_size, size=missing
                ).tolist()
            else:
                extra_tokens = np.random.randint(
                    0, tokenizer.vocab_size, size=missing
                ).tolist()
            token_sequence.extend(extra_tokens)
        else:
            # Too long: truncate to the target length.
            token_sequence = token_sequence[:target_token_len]

        remain_num_try -= 1

    return prompt, token_sequence, token_mismatch
435
+
436
+
437
+ # -----------------------------------------------------------------------------
438
+ # Random Dataset Implementation (Synthetic Data)
439
+ # -----------------------------------------------------------------------------
440
+
441
+
442
class RandomDataset(BenchmarkDataset):
    """
    Synthetic text-only dataset for serving/throughput benchmarks.

    Strategy:
    - Sample input/output token lengths per request from integer-uniform ranges
      around configured means (controlled by range_ratio).
    - Prepend a fixed random prefix of length prefix_len.
    - Generate the remaining tokens as a reproducible sequence:
      allowed_tokens[(offset + index + arange(input_len)) % len(allowed_tokens)],
      where allowed_tokens is the vocabulary minus the special token ids.
    - Decode then re-encode/truncate to ensure prompt token counts match.
    - Uses numpy.default_rng seeded with random_seed for reproducible sampling.
    """

    # Default values copied from benchmark_serving.py for the random dataset.
    DEFAULT_PREFIX_LEN = 0
    DEFAULT_RANGE_RATIO = 0.0
    DEFAULT_INPUT_LEN = 1024
    DEFAULT_OUTPUT_LEN = 128

    def __init__(self, **kwargs) -> None:
        """Initialize the base dataset and create the per-instance RNG."""
        super().__init__(**kwargs)
        # Use numpy's default_rng for deterministic sampling
        # Do not use random.seed() or np.random.seed() elsewhere in this class.
        # This ensures that the RNG is isolated from global RNG state.
        self._rng = np.random.default_rng(self.random_seed)

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        prefix_len: int = DEFAULT_PREFIX_LEN,
        range_ratio: float = DEFAULT_RANGE_RATIO,
        input_len: int = DEFAULT_INPUT_LEN,
        output_len: int = DEFAULT_OUTPUT_LEN,
        batchsize: int = 1,
        **kwargs,
    ) -> list[SampleRequest]:
        """
        Generate `num_requests` synthetic requests.

        Args:
            tokenizer: Tokenizer used to decode/encode the generated tokens.
            num_requests: Number of prompts to generate.
            request_id_prefix: Prefix of each generated request_id.
            no_oversample: Accepted for interface compatibility; unused here.
            prefix_len: Length of the random prefix shared by all prompts.
            range_ratio: Relative half-width of the length sampling interval.
            input_len: Nominal number of input tokens (special tokens
                included).
            output_len: Nominal number of output tokens.
            batchsize: When > 1, consecutive prompts are grouped into batched
                requests with `expected_output_len=0`.

        Returns:
            list[SampleRequest]: One request per prompt, or one per batch when
            `batchsize > 1`.

        Raises:
            ValueError: If the minimum possible total input length is < 1, or
                if `range_ratio` is outside [0, 1).
        """
        # validate total input tokens (prefix + sampled) is at least 1.
        num_special = int(tokenizer.num_special_tokens_to_add())
        real_input_len = max(0, int(input_len) - num_special)
        min_sampled_input = math.floor(real_input_len * (1.0 - float(range_ratio)))
        min_total_input = int(prefix_len) + min_sampled_input
        if min_total_input < 1:
            raise ValueError(
                "--random-input-len is too small: with tokenizer special "
                f"tokens {num_special} and --random-range-ratio {range_ratio}, "
                "the minimum possible total input tokens (prefix + sampled) is "
                f"{min_total_input}. Increase --random-input-len and/or "
                "--random-prefix-len, or decrease --random-range-ratio so that "
                "prefix_len + floor(max(0, random_input_len - num_special)) "
                "* (1 - range_ratio) >= 1."
            )

        input_lens, output_lens, offsets = self.get_sampling_params(
            num_requests, range_ratio, input_len, output_len, tokenizer
        )

        # Special token ids are excluded from the pool of sampled tokens.
        vocab_size = tokenizer.vocab_size
        prohibited_tokens = tokenizer.all_special_ids
        all_tokens = np.arange(vocab_size)
        allowed_tokens = np.array(list(set(all_tokens) - set(prohibited_tokens)))

        # Generate prefix once
        prefix_token_ids = self.get_prefix(allowed_tokens, prefix_len)

        requests = []
        token_mismatch_total = 0
        for i in range(num_requests):
            prompt, total_input_len, token_mismatch = self.generate_token_sequence(  # noqa: E501
                tokenizer=tokenizer,
                prefix_token_ids=prefix_token_ids,
                prefix_len=prefix_len,
                vocab_size=vocab_size,
                input_len=int(input_lens[i]),
                offset=int(offsets[i]),
                index=i,
                allowed_tokens=allowed_tokens,
            )
            token_mismatch_total += token_mismatch
            requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=total_input_len,
                    expected_output_len=int(output_lens[i]),
                    request_id=request_id_prefix + str(i),
                )
            )
        # only used for embeddings benchmark.
        if batchsize > 1:
            batch_requests = []
            # Create batched requests
            for i in range(0, num_requests, batchsize):
                batch = requests[i : i + batchsize]
                batch_requests.append(
                    SampleRequest(
                        prompt=[req.prompt for req in batch],
                        prompt_len=sum(req.prompt_len for req in batch),
                        expected_output_len=0,
                        request_id=request_id_prefix + str(i // batchsize),
                    )
                )
            requests = batch_requests

        if token_mismatch_total != 0:
            sign = "more" if token_mismatch_total > 0 else "fewer"
            logger.warning(
                "Across all generated prompts, there were %d %s tokens "
                "than expected after decoding and re-encoding. This is "
                "expected due to the imperfect nature of the sampling "
                "procedure.",
                abs(token_mismatch_total),
                sign,
            )

        return requests

    def get_prefix(
        self,
        allowed_tokens: np.ndarray,
        prefix_len: int,
    ) -> list[int]:
        """
        Get the prefix for the dataset.

        Draws `prefix_len` token ids uniformly (with replacement) from
        `allowed_tokens`; returns an empty list when `prefix_len <= 0`.
        """
        if prefix_len <= 0:
            return []
        picks = self._rng.integers(0, len(allowed_tokens), size=prefix_len)
        return allowed_tokens[picks].tolist()

    def get_sampling_params(
        self,
        num_requests: int,
        range_ratio: float,
        input_len: int,
        output_len: int,
        tokenizer: PreTrainedTokenizerBase,
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Get the sampling parameters for the dataset.

        Returns:
            `(input_lens, output_lens, offsets)`, each an array of
            `num_requests` integers: input lengths (after subtracting the
            tokenizer's special tokens), output lengths (at least 1), and
            offsets in `[0, tokenizer.vocab_size)`.

        Raises:
            ValueError: If `range_ratio` is not in [0, 1) or a sampling
                interval is empty.
        """
        # Enforce range_ratio < 1
        if not (0.0 <= range_ratio < 1.0):
            raise ValueError("range_ratio must be in [0, 1).")
        num_special_tokens = int(tokenizer.num_special_tokens_to_add())
        real_input_len = max(0, int(input_len) - num_special_tokens)
        # Bounds use floor for low and ceil for high
        input_low = math.floor(real_input_len * (1 - range_ratio))
        input_high = math.ceil(real_input_len * (1 + range_ratio))
        output_low = math.floor(output_len * (1 - range_ratio))
        output_high = math.ceil(output_len * (1 + range_ratio))
        # Ensure the lower bound for output length is at least 1 to
        # prevent sampling 0 tokens.
        output_low = max(output_low, 1)
        output_high = max(output_high, 1)

        if input_low > input_high:
            raise ValueError(
                f"Invalid input sampling interval: low={input_low} > high={input_high}"
            )
        if output_low > output_high:
            raise ValueError(
                "Invalid output sampling interval: "
                f"low={output_low} > high={output_high}"
            )

        logger.info(
            "Sampling input_len from [%s, %s] and output_len from [%s, %s]",
            input_low,
            input_high,
            output_low,
            output_high,
        )

        # Draw order matters for reproducibility: inputs, outputs, offsets.
        input_lens = self._rng.integers(input_low, input_high + 1, size=num_requests)
        output_lens = self._rng.integers(output_low, output_high + 1, size=num_requests)
        offsets = self._rng.integers(0, tokenizer.vocab_size, size=num_requests)
        return input_lens, output_lens, offsets

    def generate_token_sequence(
        self,
        *,
        tokenizer: PreTrainedTokenizerBase,
        prefix_token_ids: list[int],
        prefix_len: int,
        vocab_size: int,
        input_len: int,
        offset: int,
        index: int,
        allowed_tokens: np.ndarray | None = None,
    ) -> tuple[str, int, int]:
        """
        Returns (prompt, total_input_len, token_mismatch).

        `allowed_tokens` is the pool of token ids to draw from. When omitted,
        every id in `range(vocab_size)` is allowed, so callers that do not
        filter special tokens keep working.

        NOTE: After decoding the prompt we have to encode and decode it again.
        This is done because in some cases N consecutive tokens
        give a string tokenized into != N number of tokens.
        For example for GPT2Tokenizer:
        [6880, 6881] -> ['Ġcalls', 'here'] ->
        [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
        To avoid uncontrolled change of the prompt length,
        the encoded sequence is truncated before being decoded again.
        """
        if allowed_tokens is None:
            allowed_tokens = np.arange(vocab_size)

        # Build the inner sequence by sampling
        # sequentially from the allowed tokens
        positions = (offset + index + np.arange(input_len)) % len(allowed_tokens)
        inner_seq = allowed_tokens[positions].tolist()
        token_sequence = prefix_token_ids + inner_seq

        # Decode, then re-encode and truncate to preserve token count invariants
        total_input_len = prefix_len + int(input_len)
        prompt, adjusted_token_sequence, token_mismatch = (
            gen_prompt_decode_to_target_len(
                tokenizer=tokenizer,
                token_sequence=token_sequence,
                target_token_len=total_input_len,
                add_special_tokens=False,
                rng=self._rng,
            )
        )
        # Report the length actually achieved, which may differ from the target.
        total_input_len = len(adjusted_token_sequence)
        return prompt, total_input_len, token_mismatch
670
+
671
+
672
+ # -----------------------------------------------------------------------------
673
+ # Random Dataset for Reranking Implementation (Synthetic Data)
674
+ # -----------------------------------------------------------------------------
675
+
676
+
677
class RandomDatasetForReranking(RandomDataset):
    """
    Random dataset specialized for the needs of scoring:
    - Batches of inputs
    - Inputs composed of pairs
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        request_id_prefix: str = "",
        range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO,
        input_len: int = RandomDataset.DEFAULT_INPUT_LEN,
        batchsize: int = 1,
        is_reranker: bool = True,
        **kwargs,
    ) -> list[SampleRequest]:
        """
        Generate batched requests made of one shared query plus documents.

        A single query prompt is generated once; `num_requests` document
        prompts are then generated and grouped into batches of `batchsize`.
        Each returned request's `prompt` is `[query, doc_1, ..., doc_k]`.

        Args:
            tokenizer: Tokenizer used to decode/encode the generated tokens.
            num_requests: Number of documents to generate (reduced by one when
                `is_reranker` is False).
            request_id_prefix: Prefix of each generated request_id.
            range_ratio: Relative half-width of the length sampling interval.
            input_len: Nominal token budget. With `is_reranker` it is split
                between query, one separator token and document; otherwise it
                is used for the query and for each document.
            batchsize: Documents per request (reduced by one when
                `is_reranker` is False).
            is_reranker: Whether one separator token per (query, document)
                pair is accounted for in the lengths.

        Returns:
            list[SampleRequest]: One request per batch, with
            `expected_output_len=0`.
        """
        n_sep_tokens = int(is_reranker)

        query_len_param = (input_len // 2) - n_sep_tokens if is_reranker else input_len

        query_lens, _, query_offsets = self.get_sampling_params(
            1, range_ratio, query_len_param, 0, tokenizer
        )

        query_len = int(query_lens[0])

        if not is_reranker:
            assert num_requests > 1 and batchsize > 1
            num_requests -= 1
            batchsize -= 1
            doc_len_param = input_len
        else:
            doc_len_param = input_len - query_len - n_sep_tokens

        doc_lens, _, doc_offsets = self.get_sampling_params(
            num_requests, range_ratio, doc_len_param, 0, tokenizer
        )

        # generate_token_sequence needs the pool of sampleable token ids;
        # build it the same way RandomDataset.sample does (vocabulary minus
        # special token ids). Omitting it raised a TypeError.
        vocab_size = tokenizer.vocab_size
        prohibited_tokens = tokenizer.all_special_ids
        all_tokens = np.arange(vocab_size)
        allowed_tokens = np.array(list(set(all_tokens) - set(prohibited_tokens)))

        query_prompt, query_input_len, token_mismatch_total = (
            self.generate_token_sequence(
                tokenizer=tokenizer,
                prefix_token_ids=[],
                prefix_len=0,
                vocab_size=vocab_size,
                input_len=query_len,
                offset=int(query_offsets[0]),
                index=0,
                allowed_tokens=allowed_tokens,
            )
        )

        requests = []
        for i in range(num_requests):
            prompt, total_input_len, token_mismatch = self.generate_token_sequence(  # noqa: E501
                tokenizer=tokenizer,
                prefix_token_ids=[],
                prefix_len=0,
                vocab_size=vocab_size,
                input_len=int(doc_lens[i]),
                offset=int(doc_offsets[i]),
                index=i + 1,
                allowed_tokens=allowed_tokens,
            )
            token_mismatch_total += token_mismatch
            requests.append((prompt, total_input_len))

        batch_requests = []
        # Create batched requests
        for i in range(0, num_requests, batchsize):
            batch = requests[i : i + batchsize]
            # For rerankers the query (plus separator) is counted once per
            # document; otherwise the query is counted once per batch.
            query_contrib = (
                (query_input_len + n_sep_tokens) * len(batch)
                if is_reranker
                else query_input_len
            )
            batch_requests.append(
                SampleRequest(
                    prompt=[query_prompt] + [req[0] for req in batch],
                    prompt_len=query_contrib + sum(req[1] for req in batch),
                    expected_output_len=0,
                    request_id=request_id_prefix + str(i // batchsize),
                )
            )

        if token_mismatch_total != 0:
            logger.warning(
                "Across all generated prompts, there were %d %s tokens "
                "than expected after decoding and re-encoding. This is "
                "expected due to the imperfect nature of the sampling "
                "procedure.",
                abs(token_mismatch_total),
                "more" if token_mismatch_total > 0 else "fewer",
            )

        return batch_requests
776
+
777
+
778
+ # -----------------------------------------------------------------------------
779
+ # MultiModalDataset Implementation
780
+ # -----------------------------------------------------------------------------
781
+
782
+
783
+ class RandomMultiModalDataset(RandomDataset):
784
+ """
785
+ Synthetic multimodal dataset (text + images) that extends RandomDataset.
786
+
787
+ Status:
788
+ - Images: supported via synthetic RGB data.
789
+ - Video: supported via synthetic RGB data.
790
+ - Audio: not yet supported.
791
+
792
+ Sampling overview:
793
+ 1) Number of items per request is sampled uniformly from the integer range
794
+ [floor(n·(1−r)), ceil(n·(1+r))], where n is the base count and r is
795
+ `num_mm_items_range_ratio` in [0, 1]. r=0 keeps it fixed; r=1 allows 0.
796
+ The maximum is further clamped to the sum of per-modality limits.
797
+ 2) Each item’s modality and shape is sampled from `bucket_config`, a dict
798
+ mapping (height, width, num_frames) → probability. We treat
799
+ `num_frames`=1 as image and `num_frames` > 1 as video.
800
+ Entries with zero probability are removed and the rest are renormalized
801
+ to sum to 1.
802
+ 3) Per-modality hard caps are enforced via `limit_mm_per_prompt`.
803
+ When a modality reaches its cap, all of its buckets are excluded and the
804
+ remaining probabilities are renormalized.
805
+
806
+ Example bucket configuration:
807
+ {(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.1}
808
+ - Two image buckets (`num_frames`=1) and one video bucket
809
+ (`num_frames`=16).
810
+ OBS.: Only image sampling is supported for now.
811
+ """
812
+
813
+ IS_MULTIMODAL = True
814
+ DEFAULT_LIMIT_MM_PER_PROMPT = {"image": 255, "video": 1}
815
+
816
+ DEFAULT_BASE_ITEMS_PER_REQUEST = 1
817
+ DEFAULT_NUM_MM_ITEMS_RANGE_RATIO = 0.0
818
+ DEFAULT_MM_ITEM_BUCKET_CONFIG = {
819
+ (256, 256, 1): 0.5,
820
+ (720, 1280, 1): 0.5,
821
+ (720, 1280, 16): 0.0,
822
+ }
823
+ DEFAULT_ENABLE_MULTIMODAL_CHAT = False
824
+
825
def __init__(self, **kwargs) -> None:
    """Forward all keyword arguments unchanged to the parent initializer."""
    super().__init__(**kwargs)
827
+
828
def generate_synthetic_image(self, width: int, height: int) -> Image.Image:
    """Build a PIL image of the given size filled with random RGB pixels.

    Pixel values are drawn independently from [0, 256) with the dataset's
    own RNG (`self._rng`).

    NOTE: iid pixel sampling results in worst-case compression
    (good for stressing I/O), but very unlike real photos.
    We could consider a “low-freq” mode (e.g., noise blur)
    to emulate network realism instead of max stress.
    """
    # Array layout is (rows, columns, channels), hence height before width.
    pixel_shape = (height, width, 3)
    pixel_values = self._rng.integers(0, 256, pixel_shape, dtype=np.uint8)
    return Image.fromarray(pixel_values)
843
+
844
def generate_synthetic_video(
    self, width: int, height: int, num_frames: int
) -> dict:
    """Generate synthetic video with random values.

    Creates a video with random pixel values, encodes it to MP4 format
    through a temporary file, and returns the content as bytes.

    Args:
        width: Frame width in pixels.
        height: Frame height in pixels.
        num_frames: Number of frames to write.

    Returns:
        dict: `{"bytes": <encoded file content>}`.

    Raises:
        RuntimeError: If the OpenCV video writer cannot be opened.
    """
    import os

    import cv2

    random_pixels = self._rng.integers(
        0,
        256,
        (num_frames, height, width, 3),
        dtype=np.uint8,
    )

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    fps = 30  # frames per second

    # `delete_on_close` only exists on Python 3.12+, so reserve a path with
    # delete=False (available on every supported version) and remove the
    # file ourselves. The handle is closed before OpenCV opens the path.
    with NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
        temp_path = temp_file.name

    try:
        # Create video writer
        video_writer = cv2.VideoWriter(
            temp_path, fourcc=fourcc, fps=fps, frameSize=(width, height)
        )

        if not video_writer.isOpened():
            raise RuntimeError("Failed to create video writer")

        try:
            for frame in random_pixels:
                video_writer.write(frame)
        finally:
            # Always release so the file is finalized even if a write fails.
            video_writer.release()

        # Read the video file content
        with open(temp_path, "rb") as f:
            video_content = f.read()
    finally:
        # Best-effort cleanup of the temporary file.
        with suppress(OSError):
            os.unlink(temp_path)

    return {"bytes": video_content}
887
+
888
def map_config_to_modality(self, config: tuple[int, int, int]) -> str:
    """Return the modality implied by the last entry of `config`.

    A last entry of exactly 1 means "image", anything greater than 1 means
    "video"; any other value raises ValueError.
    """
    frame_count = config[-1]
    if frame_count == 1:
        return "image"
    if frame_count > 1:
        return "video"
    raise ValueError(f"Invalid multimodal item configuration: {config}")
896
+
897
def normalize_bucket_config(
    self, bucket_config: dict[tuple[int, int, int], float]
) -> dict[tuple[int, int, int], float]:
    """
    Drop zero-weight buckets and rescale the rest so they sum to 1.

    Raises:
        ValueError: If any weight is negative, or if no bucket has a
            positive weight.
    """
    # Negative weights are rejected outright.
    for weight in bucket_config.values():
        if weight < 0:
            raise ValueError("Bucket config values must be non-negative.")

    # Keep only buckets that can actually be sampled.
    positive_buckets = {
        bucket: weight for bucket, weight in bucket_config.items() if weight > 0
    }
    if not positive_buckets:
        raise ValueError(
            "Got invalid bucket config. Bucket config values must be non-zero."
        )

    weight_sum = sum(positive_buckets.values())
    normalized = {}
    for bucket, weight in positive_buckets.items():
        normalized[bucket] = weight / weight_sum
    return normalized
917
+
918
def generate_mm_item(
    self,
    mm_item_config: tuple[int, int, int],
) -> Mapping[str, Any]:
    """
    Build one synthetic multimodal item for `mm_item_config`.

    The config is read as (height, width, num_frames). Depending on the
    modality reported by `map_config_to_modality`, a synthetic image or video
    is generated and passed through process_image/process_video respectively.
    This follows the OpenAI API chat completions
    https://github.com/openai/openai-python
    """
    height, width, num_frames = (
        mm_item_config[0],
        mm_item_config[1],
        mm_item_config[2],
    )
    modality = self.map_config_to_modality(mm_item_config)

    if modality == "image":
        synthetic_image = self.generate_synthetic_image(width, height)
        return process_image(synthetic_image)

    if modality == "video":
        synthetic_video = self.generate_synthetic_video(width, height, num_frames)
        return process_video(synthetic_video)

    raise ValueError(f"Invalid multimodal item configuration: {mm_item_config}")
941
+
942
def get_mm_item_sampling_params(
    self,
    base_items_per_request: int,
    num_mm_items_range_ratio: float,
    limit_mm_per_prompt: dict[str, int],
    bucket_config: dict[tuple[int, int, int], float],
) -> tuple[int, int, dict[str, int], dict[tuple[int, int, int], float]]:
    """
    Validate the multimodal sampling inputs and derive the per-request bounds.

    Returns:
        A tuple ``(min_num_mm_items, max_num_mm_items, limit_mm_per_prompt,
        bucket_config)`` where the bucket config has been normalized and
        the limits have been restricted to modalities that still appear
        in the normalized bucket config.

    Raises:
        ValueError: If the range ratio is outside [0, 1], if a bucket's
            modality has no entry in ``limit_mm_per_prompt``, if no limits
            remain after filtering, or if the computed minimum exceeds
            the computed maximum.
    """
    # The ratio must describe a symmetric window around the base count.
    if not (0.0 <= num_mm_items_range_ratio <= 1.0):
        raise ValueError("num_mm_items_range_ratio must be in [0, 1].")

    # Every bucket (including zero-probability ones, since this runs before
    # normalization) must map to a modality that has a configured limit.
    for item_config in bucket_config:
        modality = self.map_config_to_modality(item_config)
        if modality not in limit_mm_per_prompt:
            raise ValueError(
                f"Modality {modality} is not in "
                f"limit_mm_per_prompt: "
                f"{limit_mm_per_prompt.keys()}"
            )

    # Drop zero-probability buckets and rescale the rest to sum to 1.
    bucket_config = self.normalize_bucket_config(bucket_config)
    logger.info(
        "Normalized bucket config: %s",
        bucket_config,
    )

    # Keep only the limits for modalities that can actually be sampled.
    sampled_modalities = set()
    for item_config in bucket_config:
        sampled_modalities.add(self.map_config_to_modality(item_config))
    limit_mm_per_prompt = {
        name: cap
        for name, cap in limit_mm_per_prompt.items()
        if name in sampled_modalities
    }
    if not limit_mm_per_prompt:
        raise ValueError("No valid limits for modalities present in bucket_config.")

    logger.info(
        "Updated mm-limit-per-prompt: %s",
        limit_mm_per_prompt,
    )

    # Upper bound: the ratio-derived ceiling, clamped to the total number
    # of items the per-modality limits allow.
    total_allowed = sum(limit_mm_per_prompt.values())
    ratio_ceiling = math.ceil(base_items_per_request * (1 + num_mm_items_range_ratio))
    max_num_mm_items = min(total_allowed, ratio_ceiling)

    # Lower bound: the ratio-derived floor, never below zero.
    ratio_floor = math.floor(base_items_per_request * (1 - num_mm_items_range_ratio))
    min_num_mm_items = max(0, ratio_floor)

    if min_num_mm_items > max_num_mm_items:
        raise ValueError(
            f"Min num mm items is greater than max mm items: "
            f"{min_num_mm_items} > {max_num_mm_items}"
        )

    logger.info(
        "Sampling number of multimodal items from [%s, %s]",
        min_num_mm_items,
        max_num_mm_items,
    )

    return (
        min_num_mm_items,
        max_num_mm_items,
        limit_mm_per_prompt,
        bucket_config,
    )
1016
+
1017
def get_mm_item_iterator(
    self,
    min_num_mm_items: int,
    max_num_mm_items: int,
    bucket_config: dict[tuple[int, int, int], float],
    limit_mm_per_prompt: dict[str, int],
) -> Iterator[tuple[int, int, int]]:
    """
    Iterator over the multimodal items for each request
    whose size is between min_num_mm_items and max_num_mm_items.

    The number of items for the request is drawn from ``self._rng`` in the
    closed range [min_num_mm_items, max_num_mm_items]. Bucket keys are then
    sampled according to their probabilities until that many items have
    been yielded, or until every modality has reached its limit in
    ``limit_mm_per_prompt``. When a modality hits its limit, its buckets
    are zeroed out and the remaining probabilities are renormalized.

    Yields:
        Bucket keys as plain ``tuple[int, int, int]`` values (hashable and
        usable as dict keys), one per sampled item.

    Note:
        - This function operates on a per-request shallow copy of
          `bucket_config` (tuple->float). The original dict passed to
          `sample` is not mutated. If this ever changes, a test
          is implemented and will fail.
        - ``bucket_config`` probabilities are passed straight to
          ``self._rng.choice``; presumably they must already sum to 1
          (as produced by `normalize_bucket_config`).
    """
    # Get the number of multimodal items to sample
    request_num_mm_items = int(
        self._rng.integers(min_num_mm_items, max_num_mm_items + 1)
    )
    # If request_num_mm_items is 0, yield an empty iterator
    if request_num_mm_items == 0:
        return

    # Initialize modality counters
    modality_counter = {self.map_config_to_modality(k): 0 for k in bucket_config}
    # Copy the bucket config to avoid modifying the original
    bucket_config_copy = bucket_config.copy()

    while sum(modality_counter.values()) < request_num_mm_items:
        # Sample a multimodal item config
        sampled = self._rng.choice(
            list(bucket_config_copy.keys()), p=list(bucket_config_copy.values())
        )
        # Choosing from a list of tuples hands back an array-like row
        # (assuming self._rng is a NumPy Generator); convert it to the
        # declared tuple of Python ints so consumers get a hashable key.
        mm_item_config = (int(sampled[0]), int(sampled[1]), int(sampled[2]))
        modality = self.map_config_to_modality(mm_item_config)

        # Check that modality count is less than limit per prompt
        if modality_counter[modality] < limit_mm_per_prompt[modality]:
            modality_counter[modality] += 1
            yield mm_item_config
            continue

        # The modality reached its limit: stop sampling any of its buckets.
        # Only values of existing keys change, so iterating is safe.
        for k in bucket_config_copy:
            if self.map_config_to_modality(k) == modality:
                bucket_config_copy[k] = 0
        # If all configs are 0, break the loop
        # This should not happen as request_num_mm_items is at most
        # the sum of limit_mm_per_prompt for all modalities
        if all(v == 0 for v in bucket_config_copy.values()):
            logger.warning(
                "Exhausted all multimodal items of modality %s", modality
            )
            break
        # Renormalize the bucket config
        bucket_config_copy = self.normalize_bucket_config(bucket_config_copy)
1077
+
1078
def sample(
    self,
    tokenizer: PreTrainedTokenizerBase,
    num_requests: int,
    request_id_prefix: str = "",
    no_oversample: bool = False,
    prefix_len: int = RandomDataset.DEFAULT_PREFIX_LEN,
    range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO,
    input_len: int = RandomDataset.DEFAULT_INPUT_LEN,
    output_len: int = RandomDataset.DEFAULT_OUTPUT_LEN,
    limit_mm_per_prompt: dict[str, int] = DEFAULT_LIMIT_MM_PER_PROMPT,
    base_items_per_request: int = DEFAULT_BASE_ITEMS_PER_REQUEST,
    num_mm_items_range_ratio: float = DEFAULT_NUM_MM_ITEMS_RANGE_RATIO,
    bucket_config: dict[
        tuple[int, int, int], float
    ] = DEFAULT_MM_ITEM_BUCKET_CONFIG,
    enable_multimodal_chat: bool = DEFAULT_ENABLE_MULTIMODAL_CHAT,
    **kwargs,
) -> list[SampleRequest]:
    """
    Build ``num_requests`` requests, each with a generated token prompt and
    a freshly sampled set of synthetic multimodal items.

    With ``enable_multimodal_chat`` the items are folded into the prompt via
    `apply_multimodal_chat_transformation` and ``multi_modal_data`` is left
    as ``None``; otherwise the prompt is kept as generated and the items
    are attached as ``multi_modal_data``.

    ``no_oversample`` and ``**kwargs`` are accepted but not used here.
    """
    # Text-side sampling parameters (lengths and offsets per request).
    input_lens, output_lens, offsets = self.get_sampling_params(
        num_requests, range_ratio, input_len, output_len, tokenizer
    )

    # Multimodal-side sampling parameters; the limits and bucket config
    # come back filtered / normalized and replace the raw arguments.
    mm_sampling_params = self.get_mm_item_sampling_params(
        base_items_per_request,
        num_mm_items_range_ratio,
        limit_mm_per_prompt,
        bucket_config,
    )
    (
        min_num_mm_items,
        max_num_mm_items,
        limit_mm_per_prompt,
        bucket_config,
    ) = mm_sampling_params

    vocab_size = tokenizer.vocab_size
    # Can't use tokenizer.all_special_ids since
    # it returns ONLY ids from special_tokens_map.json
    # We want to exclude placeholder tokens and all
    # tokens that indicate start/end of image as it
    # may break prompt replacement logic.
    prohibited_tokens = [
        tok_id
        for tok_id, token in tokenizer.added_tokens_decoder.items()
        if token.special
    ]
    all_tokens = np.arange(vocab_size)
    allowed_tokens = np.array(list(set(all_tokens) - set(prohibited_tokens)))
    logger.debug(
        "Sampling from %d out of %d (vocab size)", len(allowed_tokens), vocab_size
    )

    # The shared prefix is generated a single time and reused per request.
    prefix_token_ids = self.get_prefix(allowed_tokens, prefix_len)

    mm_requests = []
    token_mismatch_total = 0
    for i in range(num_requests):
        prompt, total_input_len, token_mismatch = self.generate_token_sequence(  # noqa: E501
            tokenizer=tokenizer,
            prefix_token_ids=prefix_token_ids,
            prefix_len=prefix_len,
            vocab_size=vocab_size,
            input_len=int(input_lens[i]),
            offset=int(offsets[i]),
            index=i,
            allowed_tokens=allowed_tokens,
        )
        token_mismatch_total += token_mismatch

        # Sample this request's item configs and materialize each item.
        generated_items = []
        for mm_item_config in self.get_mm_item_iterator(
            min_num_mm_items,
            max_num_mm_items,
            bucket_config,
            limit_mm_per_prompt,
        ):
            generated_items.append(self.generate_mm_item(mm_item_config))
        mm_content = cast(list[dict[str, Any]], generated_items)

        if enable_multimodal_chat:
            # NOTE: For now this option is only provided for completeness
            # given that the serve.py benchmark currently does not use it.
            request_prompt: Any = self.apply_multimodal_chat_transformation(
                prompt, mm_content
            )
            request_mm_data = None
        else:
            request_prompt = prompt
            request_mm_data = mm_content

        mm_requests.append(
            SampleRequest(
                prompt=request_prompt,
                prompt_len=total_input_len,
                expected_output_len=int(output_lens[i]),
                multi_modal_data=request_mm_data,
                request_id=request_id_prefix + str(i),
            )
        )

    if token_mismatch_total != 0:
        sign = "more" if token_mismatch_total > 0 else "fewer"
        logger.warning(
            "Across all generated prompts, there were %d %s tokens "
            "than expected after decoding and re-encoding. This is "
            "expected due to the imperfect nature of the sampling "
            "procedure.",
            abs(token_mismatch_total),
            sign,
        )

    return mm_requests
1199
+
1200
+
1201
+ # -----------------------------------------------------------------------------
1202
+ # ShareGPT Dataset Implementation
1203
+ # -----------------------------------------------------------------------------
1204
+
1205
+
1206
class ShareGPTDataset(BenchmarkDataset):
    """
    ShareGPT-backed benchmark dataset.

    Reads conversations from a JSON file and turns the first two turns of
    each usable conversation into a (prompt, completion) sample request.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        """Load the JSON file, keep entries with >= 2 turns, then shuffle.

        Raises:
            ValueError: If ``dataset_path`` is not set.
        """
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        with open(self.dataset_path, encoding="utf-8") as dataset_file:
            self.data = json.load(dataset_file)

        def has_two_turns(record) -> bool:
            # A sample needs both a prompt turn and a completion turn.
            return "conversations" in record and len(record["conversations"]) >= 2

        self.data = [record for record in self.data if has_two_turns(record)]

        # The global `random` module is seeded regardless of shuffling.
        random.seed(self.random_seed)
        shuffle_disabled = getattr(self, "disable_shuffle", False)
        if not shuffle_disabled:
            random.shuffle(self.data)

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        lora_path: str | None = None,
        max_loras: int | None = None,
        output_len: int | None = None,
        enable_multimodal_chat: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """Collect up to ``num_requests`` valid samples from the loaded data.

        Entries whose prompt/output lengths fail `is_valid_sequence` are
        skipped. When ``output_len`` is given it overrides the tokenized
        completion length and the minimum-output-length check is skipped.
        The result is finally passed to `maybe_oversample_requests`.
        """
        samples: list = []
        for entry in self.data:
            if len(samples) >= num_requests:
                break

            turns = entry["conversations"]
            prompt = turns[0]["value"]
            completion = turns[1]["value"]

            lora_request = self.get_random_lora_request(
                max_loras=max_loras, lora_path=lora_path
            )

            prompt_ids = tokenizer(prompt).input_ids
            completion_ids = tokenizer(completion).input_ids
            prompt_len = len(prompt_ids)
            if output_len is None:
                new_output_len = len(completion_ids)
            else:
                new_output_len = output_len

            sequence_ok = is_valid_sequence(
                prompt_len,
                new_output_len,
                skip_min_output_len_check=output_len is not None,
            )
            if not sequence_ok:
                continue

            # An image takes precedence over a video when both are present.
            image_path = entry.get("image")
            if image_path:
                mm_content = process_image(image_path)
            else:
                video_path = entry.get("video")
                mm_content = process_video(video_path) if video_path else None

            if enable_multimodal_chat:
                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)

            # Request ids count accepted samples only, so the running
            # index is the number of samples collected so far.
            samples.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=new_output_len,
                    lora_request=lora_request,
                    multi_modal_data=mm_content,
                    request_id=request_id_prefix + str(len(samples)),
                )
            )

        self.maybe_oversample_requests(
            samples, num_requests, request_id_prefix, no_oversample
        )
        return samples
1290
+
1291
+
1292
+ class _ValidateDatasetArgs(argparse.Action):
1293
+ """Argparse action to validate dataset name and path compatibility."""
1294
+
1295
+ def __call__(self, parser, namespace, values, option_string=None):
1296
+ setattr(namespace, self.dest, values)
1297
+
1298
+ # Get current values of both dataset_name and dataset_path
1299
+ dataset_name = getattr(namespace, "dataset_name", "random")
1300
+ dataset_path = getattr(namespace, "dataset_path", None)
1301
+
1302
+ # Validate the combination
1303
+ if dataset_name == "random" and dataset_path is not None:
1304
+ parser.error(
1305
+ "Cannot use 'random' dataset with --dataset-path. "
1306
+ "Please specify the appropriate --dataset-name (e.g., "
1307
+ "'sharegpt', 'custom', 'sonnet') for your dataset file: "
1308
+ f"{dataset_path}"
1309
+ )
1310
+
1311
+
1312
def add_dataset_parser(parser: FlexibleArgumentParser):
    """Register the dataset-selection and per-dataset options on ``parser``.

    Adds the general options (seed, prompt count, dataset name/path,
    oversampling / chat-template / shuffle switches) followed by one
    argument group per dataset family: custom, spec bench, sonnet,
    sharegpt, blazedit, random, random multimodal, hf and prefix
    repetition. ``parser`` is modified in place; nothing is returned.
    """
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument(
        "--num-prompts",
        type=int,
        default=1000,
        help="Number of prompts to process.",
    )
    parser.add_argument(
        "--dataset-name",
        type=str,
        default="random",
        action=_ValidateDatasetArgs,
        choices=[
            "sharegpt",
            "burstgpt",
            "sonnet",
            "random",
            "random-mm",
            "random-rerank",
            "hf",
            "custom",
            "prefix_repetition",
            "spec_bench",
        ],
        help="Name of the dataset to benchmark on.",
    )
    parser.add_argument(
        "--no-stream",
        action="store_true",
        help="Do not load the dataset in streaming mode.",
    )
    parser.add_argument(
        "--dataset-path",
        type=str,
        default=None,
        action=_ValidateDatasetArgs,
        help="Path to the sharegpt/sonnet dataset. "
        "Or the huggingface dataset ID if using HF dataset.",
    )
    parser.add_argument(
        "--no-oversample",
        action="store_true",
        help="Do not oversample if the dataset has fewer samples than num-prompts.",
    )
    parser.add_argument(
        "--skip-chat-template",
        action="store_true",
        help="Skip applying chat template to prompt for datasets that support it.",
    )
    parser.add_argument(
        "--disable-shuffle",
        action="store_true",
        help="Disable shuffling of dataset samples for deterministic ordering.",
    )

    # group for dataset specific arguments
    custom_group = parser.add_argument_group("custom dataset options")
    custom_group.add_argument(
        "--custom-output-len",
        type=int,
        default=256,
        help="Number of output tokens per request, used only for custom dataset.",
    )

    spec_bench_group = parser.add_argument_group("spec bench dataset options")
    spec_bench_group.add_argument(
        "--spec-bench-output-len",
        type=int,
        default=256,
        help="Num of output tokens per request, used only for spec bench dataset.",
    )
    spec_bench_group.add_argument(
        "--spec-bench-category",
        type=str,
        default=None,
        help="Category for spec bench dataset. If None, use all categories.",
    )

    sonnet_group = parser.add_argument_group("sonnet dataset options")
    sonnet_group.add_argument(
        "--sonnet-input-len",
        type=int,
        default=550,
        help="Number of input tokens per request, used only for sonnet dataset.",
    )
    sonnet_group.add_argument(
        "--sonnet-output-len",
        type=int,
        default=150,
        help="Number of output tokens per request, used only for sonnet dataset.",
    )
    sonnet_group.add_argument(
        "--sonnet-prefix-len",
        type=int,
        default=200,
        help="Number of prefix tokens per request, used only for sonnet dataset.",
    )

    sharegpt_group = parser.add_argument_group("sharegpt dataset options")
    sharegpt_group.add_argument(
        "--sharegpt-output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the output length "
        "from the ShareGPT dataset.",
    )

    blazedit_group = parser.add_argument_group("blazedit dataset options")
    blazedit_group.add_argument(
        "--blazedit-min-distance",
        type=float,
        default=0.0,
        help="Minimum distance for blazedit dataset. Min: 0, Max: 1.0",
    )
    blazedit_group.add_argument(
        "--blazedit-max-distance",
        type=float,
        default=1.0,
        help="Maximum distance for blazedit dataset. Min: 0, Max: 1.0",
    )

    random_group = parser.add_argument_group("random dataset options")
    random_group.add_argument(
        "--random-input-len",
        type=int,
        default=1024,
        help="Number of input tokens per request, used only for random sampling.",
    )
    random_group.add_argument(
        "--random-output-len",
        type=int,
        default=128,
        help="Number of output tokens per request, used only for random sampling.",
    )
    random_group.add_argument(
        "--random-range-ratio",
        type=float,
        default=0.0,
        help="Range ratio for sampling input/output length, "
        "used only for random sampling. Must be in the range [0, 1) to define "
        "a symmetric sampling range "
        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
    )
    random_group.add_argument(
        "--random-prefix-len",
        type=int,
        default=0,
        help=(
            "Number of fixed prefix tokens before the random context "
            "in a request. "
            "The total input length is the sum of `random-prefix-len` and "
            "a random "
            "context length sampled from [input_len * (1 - range_ratio), "
            "input_len * (1 + range_ratio)]."
        ),
    )
    random_group.add_argument(
        "--random-batch-size",
        type=int,
        default=1,
        help=("Batch size for random sampling. Only used for embeddings benchmark."),
    )
    random_group.add_argument(
        "--no-reranker",
        action="store_true",
        help=(
            "Whether the model supports reranking natively."
            " Only used for reranker benchmark."
        ),
    )

    # random multimodal dataset options
    random_mm_group = parser.add_argument_group(
        "random multimodal dataset options extended from random dataset"
    )
    random_mm_group.add_argument(
        "--random-mm-base-items-per-request",
        type=int,
        default=RandomMultiModalDataset.DEFAULT_BASE_ITEMS_PER_REQUEST,
        help=(
            "Base number of multimodal items per request for random-mm. "
            "Actual per-request count is sampled around this base using "
            "--random-mm-num-mm-items-range-ratio."
        ),
    )
    random_mm_group.add_argument(
        "--random-mm-num-mm-items-range-ratio",
        type=float,
        default=RandomMultiModalDataset.DEFAULT_NUM_MM_ITEMS_RANGE_RATIO,
        help=(
            "Range ratio r in [0, 1] for sampling items per request. "
            "We sample uniformly from the closed integer range "
            "[floor(n*(1-r)), ceil(n*(1+r))] "
            "where n is the base items per request. "
            "r=0 keeps it fixed; r=1 allows 0 items. The maximum is clamped "
            "to the sum of per-modality limits from "
            "--random-mm-limit-mm-per-prompt. "
            "An error is raised if the computed min exceeds the max."
        ),
    )
    random_mm_group.add_argument(
        "--random-mm-limit-mm-per-prompt",
        type=json.loads,
        default=RandomMultiModalDataset.DEFAULT_LIMIT_MM_PER_PROMPT,
        help=(
            "Per-modality hard caps for items attached per request, e.g. "
            '\'{"image": 3, "video": 0}\'. The sampled per-request item '
            "count is clamped to the sum of these limits. When a modality "
            "reaches its cap, its buckets are excluded and probabilities are "
            "renormalized. "
            "OBS.: Only image sampling is supported for now."
        ),
    )

    def _parse_mm_bucket_config(v: object) -> dict[tuple[int, int, int], float]:
        """Convert a CLI string or dict into a ``{(H, W, T): prob}`` mapping.

        Raises:
            ValueError: If the value cannot be parsed, is not a dict, or
                contains a key that is not a 3-tuple of ints.
        """

        # If already a dict (e.g., programmatic call), normalize keys
        def normalize(d: dict) -> dict[tuple[int, int, int], float]:
            out: dict[tuple[int, int, int], float] = {}
            for k, val in d.items():
                key = k
                if isinstance(key, str):
                    # Best effort: a key that fails to parse stays a str
                    # and is rejected by the shape check just below.
                    with suppress(Exception):
                        key = ast.literal_eval(key)
                if not (
                    isinstance(key, tuple)
                    and len(key) == 3
                    and all(isinstance(x, int) for x in key)
                ):
                    raise ValueError(
                        f"Invalid bucket key {k!r}. Expected tuple (H, W, T)."
                    )
                out[(int(key[0]), int(key[1]), int(key[2]))] = float(val)
            return out

        if isinstance(v, dict):
            return normalize(v)
        if isinstance(v, str):
            # Python literal (supports tuple keys)
            try:
                parsed = ast.literal_eval(v)
            except SyntaxError as err:
                # argparse turns ValueError/TypeError from a `type=` callable
                # into a usage error; a raw SyntaxError would instead escape
                # as a traceback, so re-raise it as ValueError.
                raise ValueError(
                    f"Could not parse bucket config {v!r}: {err}"
                ) from err
            if not isinstance(parsed, dict):
                raise ValueError("Bucket config must parse to a dict.")
            return normalize(parsed)
        raise ValueError("Unsupported value for --random-mm-bucket-config.")

    random_mm_group.add_argument(
        "--random-mm-bucket-config",
        type=_parse_mm_bucket_config,
        default=RandomMultiModalDataset.DEFAULT_MM_ITEM_BUCKET_CONFIG,
        help=(
            "The bucket config is a dictionary mapping a multimodal item "
            "sampling configuration to a probability. "
            "Currently allows for 2 modalities: images and videos. "
            "A bucket key is a tuple of (height, width, num_frames). "
            "The value is the probability of sampling that specific item. "
            "Example: "
            "--random-mm-bucket-config "
            "{(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.10} "
            "First item: images with resolution 256x256 w.p. 0.5. "
            "Second item: images with resolution 720x1280 w.p. 0.4. "
            "Third item: videos with resolution 720x1280 and 16 frames "
            "w.p. 0.1. "
            "OBS.: If the probabilities do not sum to 1, they are normalized. "
            "OBS bis.: Only image sampling is supported for now."
        ),
    )

    hf_group = parser.add_argument_group("hf dataset options")
    hf_group.add_argument(
        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
    )
    hf_group.add_argument(
        "--hf-split", type=str, default=None, help="Split of the HF dataset."
    )
    hf_group.add_argument(
        "--hf-name",
        type=str,
        default=None,
        help=(
            "Name of the dataset on HuggingFace "
            "(e.g., 'lmarena-ai/VisionArena-Chat'). "
            "Specify this if your dataset-path is a local path."
        ),
    )
    hf_group.add_argument(
        "--hf-output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the output lengths "
        "from the sampled HF dataset.",
    )

    prefix_repetition_group = parser.add_argument_group(
        "prefix repetition dataset options"
    )
    prefix_repetition_group.add_argument(
        "--prefix-repetition-prefix-len",
        type=int,
        default=256,
        help="Number of prefix tokens per request, used only for prefix "
        "repetition dataset.",
    )
    prefix_repetition_group.add_argument(
        "--prefix-repetition-suffix-len",
        type=int,
        default=256,
        help="Number of suffix tokens per request, used only for prefix "
        "repetition dataset. Total input length is prefix_len + suffix_len.",
    )
    prefix_repetition_group.add_argument(
        "--prefix-repetition-num-prefixes",
        type=int,
        default=10,
        help="Number of prefixes to generate, used only for prefix repetition "
        "dataset. Prompts per prefix is num_requests // num_prefixes.",
    )
    prefix_repetition_group.add_argument(
        "--prefix-repetition-output-len",
        type=int,
        default=128,
        help="Number of output tokens per request, used only for prefix "
        "repetition dataset.",
    )
1634
+
1635
+
1636
+ def get_samples(args, tokenizer) -> list[SampleRequest]:
1637
+ if not hasattr(args, "request_id_prefix"):
1638
+ args.request_id_prefix = ""
1639
+
1640
+ if args.dataset_name == "custom":
1641
+ dataset = CustomDataset(
1642
+ dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
1643
+ )
1644
+ input_requests = dataset.sample(
1645
+ num_requests=args.num_prompts,
1646
+ tokenizer=tokenizer,
1647
+ output_len=args.custom_output_len,
1648
+ skip_chat_template=args.skip_chat_template,
1649
+ request_id_prefix=args.request_id_prefix,
1650
+ no_oversample=args.no_oversample,
1651
+ )
1652
+
1653
+ elif args.dataset_name == "sonnet":
1654
+ dataset = SonnetDataset(
1655
+ dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
1656
+ )
1657
+ # For the "sonnet" dataset, formatting depends on the backend.
1658
+ if args.backend == "openai-chat":
1659
+ input_requests = dataset.sample(
1660
+ num_requests=args.num_prompts,
1661
+ input_len=args.sonnet_input_len,
1662
+ output_len=args.sonnet_output_len,
1663
+ prefix_len=args.sonnet_prefix_len,
1664
+ tokenizer=tokenizer,
1665
+ return_prompt_formatted=False,
1666
+ request_id_prefix=args.request_id_prefix,
1667
+ no_oversample=args.no_oversample,
1668
+ )
1669
+ else:
1670
+ assert tokenizer.chat_template or tokenizer.default_chat_template, (
1671
+ "Tokenizer/model must have chat template for sonnet dataset."
1672
+ )
1673
+ input_requests = dataset.sample(
1674
+ num_requests=args.num_prompts,
1675
+ input_len=args.sonnet_input_len,
1676
+ output_len=args.sonnet_output_len,
1677
+ prefix_len=args.sonnet_prefix_len,
1678
+ tokenizer=tokenizer,
1679
+ return_prompt_formatted=True,
1680
+ request_id_prefix=args.request_id_prefix,
1681
+ no_oversample=args.no_oversample,
1682
+ )
1683
+
1684
+ elif args.dataset_name == "hf":
1685
+ # all following datasets are implemented from the
1686
+ # HuggingFaceDataset base class
1687
+ hf_kwargs = {}
1688
+ if (
1689
+ args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS
1690
+ or args.hf_name in VisionArenaDataset.SUPPORTED_DATASET_PATHS
1691
+ ):
1692
+ dataset_class = VisionArenaDataset
1693
+ args.hf_split = "train"
1694
+ args.hf_subset = None
1695
+ elif (
1696
+ args.dataset_path in MMVUDataset.SUPPORTED_DATASET_PATHS
1697
+ or args.hf_name in MMVUDataset.SUPPORTED_DATASET_PATHS
1698
+ ):
1699
+ dataset_class = MMVUDataset
1700
+ args.hf_split = "validation"
1701
+ args.hf_subset = None
1702
+ elif (
1703
+ args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS
1704
+ or args.hf_name in InstructCoderDataset.SUPPORTED_DATASET_PATHS
1705
+ ):
1706
+ dataset_class = InstructCoderDataset
1707
+ args.hf_split = "train"
1708
+ elif (
1709
+ args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS
1710
+ or args.hf_name in MTBenchDataset.SUPPORTED_DATASET_PATHS
1711
+ ):
1712
+ dataset_class = MTBenchDataset
1713
+ args.hf_split = "train"
1714
+ elif (
1715
+ args.dataset_path in MultiModalConversationDataset.SUPPORTED_DATASET_PATHS
1716
+ or args.hf_name in MultiModalConversationDataset.SUPPORTED_DATASET_PATHS
1717
+ ):
1718
+ dataset_class = MultiModalConversationDataset
1719
+ elif (
1720
+ args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS
1721
+ or args.hf_name in ConversationDataset.SUPPORTED_DATASET_PATHS
1722
+ ):
1723
+ dataset_class = ConversationDataset
1724
+ elif (
1725
+ args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS
1726
+ or args.hf_name in AIMODataset.SUPPORTED_DATASET_PATHS
1727
+ ):
1728
+ dataset_class = AIMODataset
1729
+ args.hf_split = "train"
1730
+ elif (
1731
+ args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS # noqa: E501
1732
+ or args.hf_name in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS
1733
+ ):
1734
+ dataset_class = NextEditPredictionDataset
1735
+ args.hf_split = "train"
1736
+ elif (
1737
+ args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS
1738
+ or args.hf_name in ASRDataset.SUPPORTED_DATASET_PATHS
1739
+ ):
1740
+ dataset_class = ASRDataset
1741
+ args.hf_split = "train"
1742
+ elif args.dataset_path in BlazeditDataset.SUPPORTED_DATASET_PATHS:
1743
+ dataset_class = BlazeditDataset
1744
+ args.hf_split = "train"
1745
+ hf_kwargs = {
1746
+ "min_distance": args.blazedit_min_distance,
1747
+ "max_distance": args.blazedit_max_distance,
1748
+ }
1749
+ elif (
1750
+ args.dataset_path in MLPerfDataset.SUPPORTED_DATASET_PATHS
1751
+ or args.hf_name in MLPerfDataset.SUPPORTED_DATASET_PATHS
1752
+ ):
1753
+ dataset_class = MLPerfDataset
1754
+ args.hf_split = "train"
1755
+ elif (
1756
+ args.dataset_path in MMStarDataset.SUPPORTED_DATASET_PATHS
1757
+ or args.hf_name in MMStarDataset.SUPPORTED_DATASET_PATHS
1758
+ ):
1759
+ dataset_class = MMStarDataset
1760
+ args.hf_split = "val"
1761
+ args.hf_subset = None
1762
+ else:
1763
+ supported_datasets = set(
1764
+ [
1765
+ dataset_name
1766
+ for cls in HuggingFaceDataset.__subclasses__()
1767
+ for dataset_name in cls.SUPPORTED_DATASET_PATHS
1768
+ ]
1769
+ )
1770
+ raise ValueError(
1771
+ f"Unsupported dataset path: {args.dataset_path}. "
1772
+ "Huggingface dataset only supports dataset_path"
1773
+ f" from one of following: {supported_datasets}. "
1774
+ "Please consider contributing if you would "
1775
+ "like to add support for additional dataset formats."
1776
+ )
1777
+
1778
+ if dataset_class.IS_MULTIMODAL and not (
1779
+ args.backend in ("openai-chat", "openai-audio")
1780
+ or "embeddings-" in args.backend
1781
+ ):
1782
+ # multi-modal benchmark is only available on OpenAI Chat
1783
+ # endpoint-type.
1784
+ raise ValueError(
1785
+ "Multi-modal content is only supported on 'openai-chat' and "
1786
+ "'openai-audio' backends."
1787
+ )
1788
+ input_requests = dataset_class(
1789
+ dataset_path=args.dataset_path,
1790
+ dataset_subset=args.hf_subset,
1791
+ dataset_split=args.hf_split,
1792
+ random_seed=args.seed,
1793
+ no_stream=args.no_stream,
1794
+ hf_name=args.hf_name,
1795
+ disable_shuffle=args.disable_shuffle,
1796
+ ).sample(
1797
+ num_requests=args.num_prompts,
1798
+ tokenizer=tokenizer,
1799
+ output_len=args.hf_output_len,
1800
+ request_id_prefix=args.request_id_prefix,
1801
+ no_oversample=args.no_oversample,
1802
+ skip_chat_template=args.skip_chat_template,
1803
+ **hf_kwargs,
1804
+ )
1805
+
1806
+ else:
1807
+ # For datasets that follow a similar structure, use a mapping.
1808
+ dataset_mapping = {
1809
+ "spec_bench": lambda: SpecBench(
1810
+ dataset_path=args.dataset_path,
1811
+ category=args.spec_bench_category,
1812
+ disable_shuffle=args.disable_shuffle,
1813
+ ).sample(
1814
+ num_requests=args.num_prompts,
1815
+ tokenizer=tokenizer,
1816
+ output_len=args.spec_bench_output_len,
1817
+ request_id_prefix=args.request_id_prefix,
1818
+ no_oversample=args.no_oversample,
1819
+ ),
1820
+ "sharegpt": lambda: ShareGPTDataset(
1821
+ random_seed=args.seed,
1822
+ dataset_path=args.dataset_path,
1823
+ disable_shuffle=args.disable_shuffle,
1824
+ ).sample(
1825
+ tokenizer=tokenizer,
1826
+ num_requests=args.num_prompts,
1827
+ output_len=args.sharegpt_output_len,
1828
+ request_id_prefix=args.request_id_prefix,
1829
+ no_oversample=args.no_oversample,
1830
+ ),
1831
+ "burstgpt": lambda: BurstGPTDataset(
1832
+ random_seed=args.seed,
1833
+ dataset_path=args.dataset_path,
1834
+ disable_shuffle=args.disable_shuffle,
1835
+ ).sample(
1836
+ tokenizer=tokenizer,
1837
+ num_requests=args.num_prompts,
1838
+ request_id_prefix=args.request_id_prefix,
1839
+ no_oversample=args.no_oversample,
1840
+ ),
1841
+ "random": lambda: RandomDataset(
1842
+ random_seed=args.seed,
1843
+ dataset_path=args.dataset_path,
1844
+ disable_shuffle=args.disable_shuffle,
1845
+ ).sample(
1846
+ tokenizer=tokenizer,
1847
+ num_requests=args.num_prompts,
1848
+ prefix_len=args.random_prefix_len,
1849
+ input_len=args.random_input_len,
1850
+ output_len=args.random_output_len,
1851
+ range_ratio=args.random_range_ratio,
1852
+ request_id_prefix=args.request_id_prefix,
1853
+ batchsize=args.random_batch_size,
1854
+ no_oversample=args.no_oversample,
1855
+ ),
1856
+ "random-mm": lambda: RandomMultiModalDataset(
1857
+ random_seed=args.seed,
1858
+ dataset_path=args.dataset_path,
1859
+ disable_shuffle=args.disable_shuffle,
1860
+ ).sample(
1861
+ tokenizer=tokenizer,
1862
+ num_requests=args.num_prompts,
1863
+ prefix_len=args.random_prefix_len,
1864
+ range_ratio=args.random_range_ratio,
1865
+ input_len=args.random_input_len,
1866
+ output_len=args.random_output_len,
1867
+ base_items_per_request=args.random_mm_base_items_per_request,
1868
+ limit_mm_per_prompt=args.random_mm_limit_mm_per_prompt,
1869
+ num_mm_items_range_ratio=args.random_mm_num_mm_items_range_ratio,
1870
+ bucket_config=args.random_mm_bucket_config,
1871
+ request_id_prefix=args.request_id_prefix,
1872
+ no_oversample=args.no_oversample,
1873
+ ),
1874
+ "random-rerank": lambda: RandomDatasetForReranking(
1875
+ random_seed=args.seed,
1876
+ dataset_path=args.dataset_path,
1877
+ disable_shuffle=args.disable_shuffle,
1878
+ ).sample(
1879
+ tokenizer=tokenizer,
1880
+ num_requests=args.num_prompts,
1881
+ input_len=args.random_input_len,
1882
+ range_ratio=args.random_range_ratio,
1883
+ request_id_prefix=args.request_id_prefix,
1884
+ batchsize=args.random_batch_size,
1885
+ is_reranker=not args.no_reranker,
1886
+ ),
1887
+ "prefix_repetition": lambda: PrefixRepetitionRandomDataset(
1888
+ random_seed=args.seed,
1889
+ dataset_path=args.dataset_path,
1890
+ disable_shuffle=args.disable_shuffle,
1891
+ ).sample(
1892
+ tokenizer=tokenizer,
1893
+ num_requests=args.num_prompts,
1894
+ prefix_len=args.prefix_repetition_prefix_len,
1895
+ suffix_len=args.prefix_repetition_suffix_len,
1896
+ num_prefixes=args.prefix_repetition_num_prefixes,
1897
+ output_len=args.prefix_repetition_output_len,
1898
+ request_id_prefix=args.request_id_prefix,
1899
+ no_oversample=args.no_oversample,
1900
+ ),
1901
+ }
1902
+
1903
+ try:
1904
+ # Enforce endpoint compatibility for multimodal datasets.
1905
+ if args.dataset_name == "random-mm" and args.backend not in ["openai-chat"]:
1906
+ raise ValueError(
1907
+ "Multi-modal content (images) is only supported on "
1908
+ "'openai-chat' backend."
1909
+ )
1910
+ input_requests = dataset_mapping[args.dataset_name]()
1911
+ except KeyError as err:
1912
+ raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
1913
+
1914
+ return input_requests
1915
+
1916
+
1917
+ # -----------------------------------------------------------------------------
1918
+ # Custom Dataset Implementation
1919
+ # -----------------------------------------------------------------------------
1920
+
1921
+
1922
class CustomDataset(BenchmarkDataset):
    """
    Benchmark dataset backed by a user-supplied JSONL file.

    Each line of the file is one JSON object that must carry a ``prompt``
    field; every object is turned into one sample request. E.g.,
    ```
    {"prompt": "What is the capital of India?"}
    {"prompt": "What is the capital of Iran?"}
    {"prompt": "What is the capital of China?"}
    ```
    """

    def __init__(self, **kwargs) -> None:
        """Forward all keyword arguments to the base class, then load the file."""
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        """
        Read ``self.dataset_path`` into ``self.data`` and optionally shuffle it.

        ``self.data`` ends up as a list of dictionaries, one per JSONL row
        (e.g. ``[{"prompt": "What is the capital of India?"}, ...]``). This is
        the standardized format that ``sample()`` relies on.

        Raises:
            ValueError: if ``dataset_path`` is ``None`` or the file has no
                ``prompt`` column.
            NotImplementedError: if the path does not end with ``.jsonl``.
        """
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        # Reset first so a failed load never leaves stale rows behind.
        self.data = []

        if not self.dataset_path.endswith(".jsonl"):
            raise NotImplementedError(
                "Only JSONL format is supported for CustomDataset."
            )

        frame = pd.read_json(path_or_buf=self.dataset_path, lines=True)
        if "prompt" not in frame.columns:
            raise ValueError("JSONL file must contain a 'prompt' column.")

        # One dictionary per DataFrame row, in file order.
        self.data.extend(record.to_dict() for _, record in frame.iterrows())

        # The module-level RNG is seeded even when shuffling is disabled.
        random.seed(self.random_seed)
        shuffle_disabled = getattr(self, "disable_shuffle", False)
        if not shuffle_disabled:
            random.shuffle(self.data)

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        lora_path: str | None = None,
        max_loras: int | None = None,
        output_len: int | None = None,
        enable_multimodal_chat: bool = False,
        skip_chat_template: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """
        Build up to ``num_requests`` sample requests from the loaded rows.

        Args:
            tokenizer: used to apply the chat template and to count prompt
                tokens.
            num_requests: number of requests wanted; zero or a negative value
                means "use every loaded row".
            output_len: stored as ``expected_output_len`` on every request.
            skip_chat_template: when true, the raw ``prompt`` text is used
                without applying the tokenizer's chat template.
            request_id_prefix: prepended to the row index to form the id.
            no_oversample: forwarded to ``maybe_oversample_requests``.

        Returns:
            The list of requests after ``maybe_oversample_requests`` has been
            given the chance to extend it.
        """
        total_rows = len(self.data)
        self.num_available_samples = total_rows
        if num_requests <= 0:
            num_requests = total_rows
            logger.info(
                "num_requests is set to 0 or negative, "
                "so using all available samples: %d",
                num_requests,
            )

        sampled_requests = []
        for idx, record in enumerate(self.data):
            if len(sampled_requests) >= num_requests:
                break

            text = record["prompt"]
            if skip_chat_template:
                rendered = text
            else:
                rendered = tokenizer.apply_chat_template(
                    [{"role": "user", "content": text}],
                    add_generation_prompt=True,
                    tokenize=False,
                )

            token_count = len(tokenizer(rendered).input_ids)
            request = SampleRequest(
                prompt=rendered,
                prompt_len=token_count,
                expected_output_len=output_len,
                request_id=request_id_prefix + str(idx),
            )
            sampled_requests.append(request)

        self.maybe_oversample_requests(
            sampled_requests, num_requests, request_id_prefix, no_oversample
        )

        return sampled_requests
2022
+
2023
+
2024
+ # -----------------------------------------------------------------------------
2025
+ # Spec Bench Dataset Implementation
2026
+ # -----------------------------------------------------------------------------
2027
+
2028
+
2029
class SpecBench(CustomDataset):
    """
    Implements the SpecBench dataset: https://github.com/hemingkx/Spec-Bench
    Download the dataset using:
    wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl

    Only the first turn of every row is used as the prompt. An optional
    ``category`` keyword argument restricts the rows to a single category.
    """  # noqa: E501

    def __init__(self, **kwargs) -> None:
        """
        Args:
            category: optional category name; when falsy, all rows are kept.
            **kwargs: forwarded to ``CustomDataset``.
        """
        # `category` must be set before delegating: CustomDataset.__init__
        # calls self.load_data(), which dispatches to the override below and
        # reads self.category. No second load_data() call is needed here;
        # repeating it would parse and shuffle the whole file again.
        self.category = kwargs.pop("category", None)
        super().__init__(**kwargs)

    def load_data(self) -> None:
        """
        Load the JSONL file into ``self.data`` as ``[{"prompt": ...}, ...]``.

        Raises:
            ValueError: if ``dataset_path`` is ``None`` or the file has no
                ``turns`` column.
        """
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        self.data = []

        # Load the JSONL file
        jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)

        # check if the JSONL file has a 'turns' column
        if "turns" not in jsonl_data.columns:
            raise ValueError("JSONL file must contain a 'turns' column.")

        for _, row in jsonl_data.iterrows():
            # sample only from a specific category if specified
            if (not self.category) or (self.category == row["category"]):
                prompt = row["turns"][0]
                self.data.append({"prompt": prompt})

        # The module-level RNG is seeded even when shuffling is disabled.
        random.seed(self.random_seed)
        if not getattr(self, "disable_shuffle", False):
            random.shuffle(self.data)

    def sample(self, **kwargs) -> list:
        """Delegate to ``CustomDataset.sample``; arguments are keyword-only."""
        return super().sample(**kwargs)
2067
+
2068
+
2069
+ # -----------------------------------------------------------------------------
2070
+ # Sonnet Dataset Implementation
2071
+ # -----------------------------------------------------------------------------
2072
+
2073
+
2074
@deprecated(
    "SonnetDataset is deprecated and will be removed in a future version.",
)
class SonnetDataset(BenchmarkDataset):
    """
    Simplified Sonnet dataset: reads poem lines from a text file and builds
    prompts out of randomly chosen lines. The default lengths below were
    copied from `benchmark_serving.py` for the sonnet dataset.
    """

    DEFAULT_PREFIX_LEN = 200
    DEFAULT_INPUT_LEN = 550
    DEFAULT_OUTPUT_LEN = 150

    def __init__(
        self,
        **kwargs,
    ) -> None:
        """Forward all keyword arguments to the base class, then load the file."""
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        """
        Read every line of the UTF-8 text file at ``dataset_path``.

        Raises:
            ValueError: if ``dataset_path`` is empty or ``None``.
        """
        if not self.dataset_path:
            raise ValueError("dataset_path must be provided.")
        with open(self.dataset_path, encoding="utf-8") as poem_file:
            self.data = poem_file.readlines()

    def sample(
        self,
        tokenizer,
        num_requests: int,
        prefix_len: int = DEFAULT_PREFIX_LEN,
        input_len: int = DEFAULT_INPUT_LEN,
        output_len: int = DEFAULT_OUTPUT_LEN,
        return_prompt_formatted: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """
        Generate ``num_requests`` prompts made of a fixed instruction, a shared
        prefix of poem lines, and randomly drawn extra lines.

        Candidate prompts whose chat-formatted token count exceeds
        ``input_len`` are discarded and redrawn.

        Args:
            tokenizer: provides token counts and the chat template.
            num_requests: number of requests to produce.
            prefix_len: target token budget for the shared prefix.
            input_len: upper bound on the formatted prompt length.
            output_len: stored as ``expected_output_len``.
            return_prompt_formatted: store the chat-formatted prompt instead
                of the raw one.
            request_id_prefix: prepended to the running index to form the id.

        Raises:
            ValueError: if ``input_len`` does not exceed the token length of
                the formatted base instruction.
        """
        # Average number of tokens per poem line.
        line_token_counts = [len(tokenizer(line).input_ids) for line in self.data]
        avg_len = sum(line_token_counts) / len(line_token_counts)

        # Token cost of the bare instruction once chat-formatted.
        base_prompt = "Pick as many lines as you can from these poem lines:\n"
        formatted_base = tokenizer.apply_chat_template(
            [{"role": "user", "content": base_prompt}],
            add_generation_prompt=True,
            tokenize=False,
        )
        base_offset = len(tokenizer(formatted_base).input_ids)
        if input_len <= base_offset:
            raise ValueError(
                f"'input_len' must be higher than the base prompt length "
                f"({base_offset})."
            )

        # Translate the token budgets into line counts.
        num_input_lines = round((input_len - base_offset) / avg_len)
        num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
        prefix_lines = self.data[:num_prefix_lines]
        num_extra_lines = num_input_lines - num_prefix_lines

        samples = []
        ind = 0
        while len(samples) < num_requests:
            extra_lines = random.choices(self.data, k=num_extra_lines)
            poem_body = "".join(prefix_lines + extra_lines)
            prompt = f"{base_prompt}{poem_body}"
            prompt_formatted = tokenizer.apply_chat_template(
                [{"role": "user", "content": prompt}],
                add_generation_prompt=True,
                tokenize=False,
            )
            prompt_len = len(tokenizer(prompt_formatted).input_ids)
            if prompt_len > input_len:
                # Too long for the requested budget: draw again.
                continue

            chosen_prompt = prompt_formatted if return_prompt_formatted else prompt
            samples.append(
                SampleRequest(
                    prompt=chosen_prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    request_id=request_id_prefix + str(ind),
                )
            )
            ind += 1
        return samples
2158
+
2159
+
2160
+ # -----------------------------------------------------------------------------
2161
+ # BurstGPT Dataset Implementation
2162
+ # -----------------------------------------------------------------------------
2163
+
2164
+
2165
class BurstGPTDataset(BenchmarkDataset):
    """
    BurstGPT trace dataset. Rows are read from a CSV file; only rows whose
    ``Model`` is "GPT-4" and whose ``Response tokens`` is positive are kept.
    Prompts are synthetic token sequences sized from the trace.
    """

    def __init__(self, **kwargs) -> None:
        """Forward all keyword arguments to the base class, then load the CSV."""
        super().__init__(**kwargs)
        self.load_data()

    def load_data(
        self,
    ):
        """
        Read the CSV at ``dataset_path`` and keep the usable rows.

        Raises:
            ValueError: if ``dataset_path`` is ``None``.
        """
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        trace = pd.read_csv(self.dataset_path)
        # Keep GPT-4 rows only ...
        gpt4_rows = trace[trace["Model"] == "GPT-4"]
        # ... and drop failed requests (zero or negative response tokens).
        successful_rows = gpt4_rows[gpt4_rows["Response tokens"] > 0]
        self.data = successful_rows

    def _sample_loaded_data(self, num_requests: int) -> list:
        """
        Draw ``num_requests`` rows, with replacement only when more rows are
        requested than are available. Returns the rows as a list of lists.
        """
        needs_replacement = num_requests > len(self.data)
        if needs_replacement:
            drawn = self.data.sample(
                n=num_requests,
                random_state=self.random_seed,
                replace=True,
            )
        else:
            drawn = self.data.sample(n=num_requests, random_state=self.random_seed)
        return drawn.values.tolist()

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        max_loras: int | None = None,
        lora_path: str | None = None,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list[SampleRequest]:
        """
        Build ``num_requests`` synthetic requests from sampled trace rows.

        The input and output lengths are read from columns 2 and 3 of each
        sampled row (positional access).

        Args:
            tokenizer: supplies ``vocab_size`` and decodes the synthetic ids.
            num_requests: number of requests to produce.
            max_loras: forwarded to ``get_random_lora_request``.
            lora_path: forwarded to ``get_random_lora_request``.
            request_id_prefix: prepended to the request index to form the id.
        """
        samples = []
        rows = self._sample_loaded_data(num_requests=num_requests)
        for i in range(num_requests):
            row = rows[i]
            input_len = int(row[2])
            output_len = int(row[3])
            lora_req = self.get_random_lora_request(
                max_loras=max_loras, lora_path=lora_path
            )
            vocab_size = tokenizer.vocab_size
            # Synthetic prompt: token ids (i + j) modulo vocab_size.
            token_ids = [(i + offset) % vocab_size for offset in range(input_len)]
            request = SampleRequest(
                prompt=tokenizer.decode(token_ids),
                prompt_len=input_len,
                expected_output_len=output_len,
                lora_request=lora_req,
                request_id=request_id_prefix + str(i),
            )
            samples.append(request)
        return samples
2235
+
2236
+
2237
+ # -----------------------------------------------------------------------------
2238
+ # HuggingFace Dataset Base Implementation
2239
+ # -----------------------------------------------------------------------------
2240
class HuggingFaceDataset(BenchmarkDataset):
    """Common base for benchmark datasets fetched through ``load_dataset``."""

    # Subclasses list the dataset identifiers they support, either as a plain
    # set or as a mapping from identifier to a callable.
    SUPPORTED_DATASET_PATHS: set[str] | dict[str, Callable] = set()

    def __init__(
        self,
        dataset_path: str,
        dataset_split: str,
        no_stream: bool = False,
        dataset_subset: str | None = None,
        hf_name: str | None = None,
        **kwargs,
    ) -> None:
        """
        Args:
            dataset_path: passed to ``load_dataset`` and to the base class.
            dataset_split: split to load.
            no_stream: when true, streaming mode is turned off.
            dataset_subset: passed to ``load_dataset`` as ``name``.
            hf_name: stored as ``self.hf_name``; falls back to
                ``dataset_path`` when empty or ``None``.
            **kwargs: forwarded to the base class.
        """
        super().__init__(dataset_path=dataset_path, **kwargs)

        self.dataset_split = dataset_split
        self.dataset_subset = dataset_subset
        self.load_stream = not no_stream
        self.hf_name = hf_name if hf_name else dataset_path
        self.load_data()

    def load_data(self) -> None:
        """Load the configured split, shuffling it unless shuffling is disabled."""
        loaded = load_dataset(
            self.dataset_path,
            name=self.dataset_subset,
            split=self.dataset_split,
            streaming=self.load_stream,
        )
        self.data = loaded
        shuffle_disabled = getattr(self, "disable_shuffle", False)
        if not shuffle_disabled:
            self.data = loaded.shuffle(seed=self.random_seed)
2272
+
2273
+
2274
+ # -----------------------------------------------------------------------------
2275
+ # Conversation Dataset Implementation
2276
+ # -----------------------------------------------------------------------------
2277
+
2278
+
2279
class ConversationDataset(HuggingFaceDataset):
    """Dataset for text-only conversation data."""

    SUPPORTED_DATASET_PATHS = {
        "Aeala/ShareGPT_Vicuna_unfiltered",
    }
    IS_MULTIMODAL = False

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: int | None = None,
        enable_multimodal_chat: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """
        Build up to ``num_requests`` requests from the first two turns of
        each conversation.

        Args:
            tokenizer: used to count prompt and completion tokens.
            num_requests: number of requests wanted.
            output_len: fixed expected output length. When ``None``, the
                token length of the second turn is used for each sample, and
                samples rejected by ``is_valid_sequence`` are skipped.
            enable_multimodal_chat: when true, the prompt is replaced by the
                result of ``apply_multimodal_chat_transformation``.
            request_id_prefix: prepended to the running index to form the id.
            no_oversample: forwarded to ``maybe_oversample_requests``.

        Returns:
            The list of requests after ``maybe_oversample_requests`` has been
            given the chance to extend it.
        """
        # Filter examples with at least 2 conversations
        filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
        sampled_requests = []
        ind = 0
        dynamic_output = output_len is None

        for item in filtered_data:
            if len(sampled_requests) >= num_requests:
                break
            conv = item["conversations"]
            prompt, completion = conv[0]["value"], conv[1]["value"]

            prompt_len = len(tokenizer(prompt).input_ids)
            completion_len = len(tokenizer(completion).input_ids)
            if dynamic_output:
                # Run the validity filter before the sanity assert below, so
                # that an unusable sample (e.g. an empty completion) is
                # skipped instead of aborting the whole run.
                if not is_valid_sequence(prompt_len, completion_len):
                    continue
                output_len = completion_len
            assert isinstance(output_len, int) and output_len > 0
            mm_content = process_image(item["image"]) if "image" in item else None
            if enable_multimodal_chat:
                # Note: when chat is enabled the request prompt_len is no longer
                # accurate and we will be using request output to count the
                # actual prompt len and output len
                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                    request_id=request_id_prefix + str(ind),
                )
            )
            ind += 1
        self.maybe_oversample_requests(
            sampled_requests, num_requests, request_id_prefix, no_oversample
        )
        return sampled_requests
2337
+
2338
+
2339
class MultiModalConversationDataset(HuggingFaceDataset):
    """Dataset for multimodal conversation data."""

    SUPPORTED_DATASET_PATHS = {
        "lmms-lab/LLaVA-OneVision-Data",
    }
    IS_MULTIMODAL = True

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: int | None = None,
        enable_multimodal_chat: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """
        Build up to ``num_requests`` requests from the first two turns of
        each conversation, attaching the item's image when one is present.

        Args:
            tokenizer: used to count prompt and completion tokens.
            num_requests: number of requests wanted.
            output_len: fixed expected output length. When ``None``, the
                token length of the second turn is used for each sample, and
                samples rejected by ``is_valid_sequence`` are skipped.
            enable_multimodal_chat: when true, the prompt is replaced by the
                result of ``apply_multimodal_chat_transformation``.
            request_id_prefix: prepended to the running index to form the id.
            no_oversample: forwarded to ``maybe_oversample_requests``.

        Returns:
            The list of requests after ``maybe_oversample_requests`` has been
            given the chance to extend it.
        """
        # Filter examples with at least 2 conversations
        filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
        sampled_requests = []
        ind = 0
        dynamic_output = output_len is None

        for item in filtered_data:
            if len(sampled_requests) >= num_requests:
                break
            conv = item["conversations"]
            prompt, completion = conv[0]["value"], conv[1]["value"]

            prompt_len = len(tokenizer(prompt).input_ids)
            completion_len = len(tokenizer(completion).input_ids)
            if dynamic_output:
                # Run the validity filter before the sanity assert below, so
                # that an unusable sample (e.g. an empty completion) is
                # skipped instead of aborting the whole run.
                if not is_valid_sequence(prompt_len, completion_len):
                    continue
                output_len = completion_len
            assert isinstance(output_len, int) and output_len > 0
            mm_content = process_image(item["image"]) if "image" in item else None
            if enable_multimodal_chat:
                # Note: when chat is enabled the request prompt_len is no longer
                # accurate and we will be using request output to count the
                # actual prompt len and output len
                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                    request_id=request_id_prefix + str(ind),
                )
            )
            ind += 1
        self.maybe_oversample_requests(
            sampled_requests, num_requests, request_id_prefix, no_oversample
        )
        return sampled_requests
2397
+
2398
+
2399
+ # -----------------------------------------------------------------------------
2400
+ # Vision Arena Dataset Implementation
2401
+ # -----------------------------------------------------------------------------
2402
+
2403
+
2404
class VisionArenaDataset(HuggingFaceDataset):
    """
    Vision Arena dataset: one image plus the first message of the first turn
    of every item.
    """

    DEFAULT_OUTPUT_LEN = 128
    # Maps each supported dataset name to the function that extracts the
    # prompt text from one item of that dataset.
    SUPPORTED_DATASET_PATHS = {
        "lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"],
        "lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"],
    }
    IS_MULTIMODAL = True

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: int | None = None,
        enable_multimodal_chat: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """
        Build up to ``num_requests`` image requests.

        Args:
            tokenizer: used to count prompt tokens.
            num_requests: number of requests wanted.
            output_len: expected output length; ``DEFAULT_OUTPUT_LEN`` when
                ``None``.
            enable_multimodal_chat: when true, the prompt is replaced by the
                result of ``apply_multimodal_chat_transformation``.
            request_id_prefix: prepended to the item index to form the id.
            no_oversample: forwarded to ``maybe_oversample_requests``.

        Raises:
            ValueError: if ``self.hf_name`` has no registered prompt parser.
        """
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN

        sampled_requests = []
        for idx, record in enumerate(self.data):
            if len(sampled_requests) >= num_requests:
                break

            extract_prompt = self.SUPPORTED_DATASET_PATHS.get(self.hf_name)
            if extract_prompt is None:
                raise ValueError(f"Unsupported dataset path: {self.hf_name}")

            prompt = extract_prompt(record)
            mm_content = process_image(record["images"][0])
            prompt_len = len(tokenizer(prompt).input_ids)
            if enable_multimodal_chat:
                # With chat enabled, prompt_len is no longer accurate; the
                # actual prompt length is taken from the request output.
                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)

            request = SampleRequest(
                prompt=prompt,
                prompt_len=prompt_len,
                expected_output_len=output_len,
                multi_modal_data=mm_content,
                request_id=request_id_prefix + str(idx),
            )
            sampled_requests.append(request)

        self.maybe_oversample_requests(
            sampled_requests, num_requests, request_id_prefix, no_oversample
        )
        return sampled_requests
2455
+
2456
+
2457
class MMVUDataset(HuggingFaceDataset):
    """
    MMVU video question dataset.
    https://huggingface.co/datasets/yale-nlp/MMVU
    """

    DEFAULT_OUTPUT_LEN = 128
    # The prompt is the question followed by every "<key>.<value>" choice,
    # all separated by single spaces.
    SUPPORTED_DATASET_PATHS = {
        "yale-nlp/MMVU": lambda x: x["question"]
        + " "
        + (" ".join(f"{k}.{v}" for k, v in x["choices"].items())),
    }

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: int | None = None,
        enable_multimodal_chat: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """
        Build up to ``num_requests`` video requests.

        Args:
            tokenizer: used to count prompt tokens.
            num_requests: number of requests wanted.
            output_len: expected output length; ``DEFAULT_OUTPUT_LEN`` when
                ``None``.
            enable_multimodal_chat: when true, the prompt is replaced by the
                result of ``apply_multimodal_chat_transformation``.
            request_id_prefix: prepended to the item index to form the id.
            no_oversample: forwarded to ``maybe_oversample_requests``.

        Raises:
            ValueError: if ``self.hf_name`` has no registered prompt parser.
        """
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN

        sampled_requests = []
        for idx, record in enumerate(self.data):
            if len(sampled_requests) >= num_requests:
                break

            extract_prompt = self.SUPPORTED_DATASET_PATHS.get(self.hf_name)
            if extract_prompt is None:
                raise ValueError(f"Unsupported dataset path: {self.hf_name}")

            prompt = extract_prompt(record)
            mm_content = process_video(record["video"])
            prompt_len = len(tokenizer(prompt).input_ids)
            if enable_multimodal_chat:
                # With chat enabled, prompt_len is no longer accurate; the
                # actual prompt length is taken from the request output.
                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)

            request = SampleRequest(
                prompt=prompt,
                prompt_len=prompt_len,
                expected_output_len=output_len,
                multi_modal_data=mm_content,
                request_id=request_id_prefix + str(idx),
            )
            sampled_requests.append(request)

        self.maybe_oversample_requests(
            sampled_requests, num_requests, request_id_prefix, no_oversample
        )
        return sampled_requests
2509
+
2510
+
2511
+ # -----------------------------------------------------------------------------
2512
+ # Instruct Coder Dataset Implementation
2513
+ # -----------------------------------------------------------------------------
2514
+
2515
+
2516
class InstructCoderDataset(HuggingFaceDataset):
    """
    InstructCoder Dataset.
    https://huggingface.co/datasets/likaixin/InstructCoder

    InstructCoder is a dataset designed for general code editing. It consists
    of 114,239 instruction-input-output triplets and covers multiple distinct
    code editing scenarios.
    """

    DEFAULT_OUTPUT_LEN = 200  # this is the average default output length
    SUPPORTED_DATASET_PATHS = {
        "likaixin/InstructCoder",
    }

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: int | None = None,
        enable_multimodal_chat: bool = False,
        skip_chat_template: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """
        Build up to ``num_requests`` code-editing requests.

        Each prompt is the item's ``input`` followed by its ``instruction``
        and a fixed sentence asking for code only.

        Args:
            tokenizer: applies the chat template and counts prompt tokens.
            num_requests: number of requests wanted.
            output_len: expected output length; ``DEFAULT_OUTPUT_LEN`` when
                ``None``.
            skip_chat_template: when true, the raw prompt is used as is.
            request_id_prefix: prepended to the item index to form the id.
            no_oversample: forwarded to ``maybe_oversample_requests``.
        """
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN

        sampled_requests = []
        for idx, record in enumerate(self.data):
            if len(sampled_requests) >= num_requests:
                break

            source_code = record["input"]
            instruction = record["instruction"]
            prompt = (
                f"{source_code}\n\n{instruction} Just output "
                "the code, do not include any explanation."
            )

            if skip_chat_template:
                rendered = prompt
            else:
                rendered = tokenizer.apply_chat_template(
                    [{"role": "user", "content": prompt}],
                    add_generation_prompt=True,
                    tokenize=False,
                )

            token_count = len(tokenizer(rendered).input_ids)
            request = SampleRequest(
                prompt=rendered,
                prompt_len=token_count,
                expected_output_len=output_len,
                request_id=request_id_prefix + str(idx),
            )
            sampled_requests.append(request)

        self.maybe_oversample_requests(
            sampled_requests, num_requests, request_id_prefix, no_oversample
        )
        return sampled_requests
2573
+
2574
+
2575
+ # -----------------------------------------------------------------------------
2576
+ # MT-Bench Dataset Implementation
2577
+ # -----------------------------------------------------------------------------
2578
+
2579
+
2580
class MTBenchDataset(HuggingFaceDataset):
    """
    MT-Bench Dataset.
    https://huggingface.co/datasets/philschmid/mt-bench

    Only the first turn of every item is used, giving a single-turn dataset.
    This is similar to the Spec decoding benchmark setup in vLLM
    https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
    """  # noqa: E501

    DEFAULT_OUTPUT_LEN = 256  # avg len used in SD bench in vLLM
    SUPPORTED_DATASET_PATHS = {
        "philschmid/mt-bench",
    }

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: int | None = None,
        enable_multimodal_chat: bool = False,
        skip_chat_template: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """
        Build up to ``num_requests`` single-turn requests.

        Args:
            tokenizer: applies the chat template and counts prompt tokens.
            num_requests: number of requests wanted.
            output_len: expected output length; ``DEFAULT_OUTPUT_LEN`` when
                ``None``.
            skip_chat_template: when true, the raw first turn is used as is.
            request_id_prefix: prepended to the item index to form the id.
            no_oversample: forwarded to ``maybe_oversample_requests``.
        """
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN

        sampled_requests = []
        for idx, record in enumerate(self.data):
            if len(sampled_requests) >= num_requests:
                break

            first_turn = record["turns"][0]
            if skip_chat_template:
                rendered = first_turn
            else:
                rendered = tokenizer.apply_chat_template(
                    [{"role": "user", "content": first_turn}],
                    add_generation_prompt=True,
                    tokenize=False,
                )

            token_count = len(tokenizer(rendered).input_ids)
            request = SampleRequest(
                prompt=rendered,
                prompt_len=token_count,
                expected_output_len=output_len,
                request_id=request_id_prefix + str(idx),
            )
            sampled_requests.append(request)

        self.maybe_oversample_requests(
            sampled_requests, num_requests, request_id_prefix, no_oversample
        )
        return sampled_requests
2635
+
2636
+
2637
+ # -----------------------------------------------------------------------------
2638
+ # Blazedit Dataset Implementation
2639
+ # -----------------------------------------------------------------------------
2640
+
2641
+
2642
class BlazeditDataset(HuggingFaceDataset):
    """
    Blazedit Dataset.
    https://github.com/ise-uiuc/blazedit

    5k char version: vdaita/edit_5k_char
    10k char version: vdaita/edit_10k_char
    """  # noqa: E501

    # 5k char version will have output as ~5k chars
    # 10k char version will have output as ~10k chars
    # Assuming 3 char per token, 10k chars will be 3333 tokens
    # We set default to 4000 to be safe
    DEFAULT_OUTPUT_LEN = 4000
    SUPPORTED_DATASET_PATHS = {
        "vdaita/edit_5k_char",
        "vdaita/edit_10k_char",
    }

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: int | None = None,
        skip_chat_template: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        min_distance: float = 0.0,
        max_distance: float = 1.0,
        **kwargs,
    ) -> list:
        """
        Build up to ``num_requests`` code-editing requests.

        Items whose ``norm_distance`` lies outside
        ``[min_distance, max_distance]`` are skipped.

        Args:
            tokenizer: applies the chat template and counts prompt tokens.
            num_requests: number of requests wanted.
            output_len: expected output length; ``DEFAULT_OUTPUT_LEN`` when
                ``None``.
            skip_chat_template: when true, the raw prompt is used as is.
            request_id_prefix: prepended to the request index to form the id.
            no_oversample: forwarded to ``maybe_oversample_requests``.
            min_distance: lowest accepted ``norm_distance`` (inclusive).
            max_distance: highest accepted ``norm_distance`` (inclusive).

        Returns:
            The list of requests after ``maybe_oversample_requests`` has been
            given the chance to extend it.
        """
        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
        sampled_requests = []

        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            code = item["code"]
            change_request = item["change_request"]
            norm_distance = item["norm_distance"]

            # compare the levenshtein distance normalized by code length
            if norm_distance < min_distance or norm_distance > max_distance:
                continue

            # template copied from
            # https://github.com/ise-uiuc/blazedit/blob/7765137e656fd62de877422d2e4cf8de51228054/dataset/create_refined_dataset.py#L94-L105 # noqa: E501
            prompt = f"""Given a code file, please apply the change requests and generate the new file.

Original file:
```python
{code}
```

Change request:
{change_request}

Please generate the new code file in the "New file" section below."""  # noqa: E501

            # apply template
            if not skip_chat_template:
                prompt = tokenizer.apply_chat_template(
                    [{"role": "user", "content": prompt}],
                    add_generation_prompt=True,
                    tokenize=False,
                )

            prompt_len = len(tokenizer(prompt).input_ids)

            # Number requests by how many were accepted so far, not by the
            # position in the dataset: items skipped by the distance filter
            # would otherwise leave gaps in the id sequence.
            request_index = len(sampled_requests)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    request_id=request_id_prefix + str(request_index),
                )
            )
        self.maybe_oversample_requests(
            sampled_requests, num_requests, request_id_prefix, no_oversample
        )

        return sampled_requests
2724
+
2725
+
2726
+ # -----------------------------------------------------------------------------
2727
+ # AIMO Dataset Implementation
2728
+ # -----------------------------------------------------------------------------
2729
+
2730
+
2731
class AIMODataset(HuggingFaceDataset):
    """
    Benchmark dataset built from AIMO reasoning questions.

    Each record's "problem" field is used as the prompt and its "solution"
    field as the reference completion.
    """

    SUPPORTED_DATASET_PATHS = {
        "AI-MO/aimo-validation-aime",
        "AI-MO/NuminaMath-1.5",
        "AI-MO/NuminaMath-CoT",
    }

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: int | None = None,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """Collect up to ``num_requests`` requests from ``self.data``.

        When ``output_len`` is ``None`` the expected output length of each
        request is the token count of its reference solution, and records
        rejected by ``is_valid_sequence`` are skipped. When ``output_len``
        is given it is used for every request and no record is filtered.
        """
        use_reference_len = output_len is None
        requests = []

        for record in self.data:
            if len(requests) >= num_requests:
                break
            problem = record["problem"]
            solution = record["solution"]

            prompt_len = len(tokenizer(problem).input_ids)
            completion_len = len(tokenizer(solution).input_ids)
            if use_reference_len:
                output_len = completion_len
            assert isinstance(output_len, int) and output_len > 0

            if use_reference_len and not is_valid_sequence(
                prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000
            ):
                continue

            # One id per accepted request: the list length before the
            # append is exactly the running count of accepted requests.
            request = SampleRequest(
                prompt=problem,
                prompt_len=prompt_len,
                expected_output_len=output_len,
                multi_modal_data=None,
                request_id=request_id_prefix + str(len(requests)),
            )
            requests.append(request)

        self.maybe_oversample_requests(
            requests, num_requests, request_id_prefix, no_oversample
        )
        return requests
2784
+
2785
+
2786
+ # -----------------------------------------------------------------------------
2787
+ # Next Edit Prediction Dataset Implementation
2788
+ # -----------------------------------------------------------------------------
2789
+
2790
+
2791
+ zeta_prompt = """### Instruction:
2792
+ You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.
2793
+
2794
+ ### User Edits:
2795
+
2796
+ {}
2797
+
2798
+ ### User Excerpt:
2799
+
2800
+ {}
2801
+
2802
+ ### Response:
2803
+
2804
+ """ # noqa: E501
2805
+
2806
+
2807
+ def _format_zeta_prompt(
2808
+ sample: dict, original_start_marker: str = "<|editable_region_start|>"
2809
+ ) -> dict:
2810
+ """Format the zeta prompt for the Next Edit Prediction (NEP) dataset.
2811
+
2812
+ This function formats examples from the NEP dataset
2813
+ into prompts and expected outputs. It could be
2814
+ further extended to support more NEP datasets.
2815
+
2816
+ Args:
2817
+ sample: The dataset sample containing events,
2818
+ inputs, and outputs.
2819
+ original_start_marker: The marker indicating the
2820
+ start of the editable region. Defaults to
2821
+ "<|editable_region_start|>".
2822
+
2823
+ Returns:
2824
+ A dictionary with the formatted prompts and expected outputs.
2825
+ """
2826
+ events = sample["events"]
2827
+ input = sample["input"]
2828
+ output = sample["output"]
2829
+ prompt = zeta_prompt.format(events, input)
2830
+
2831
+ # following the original implementation, extract the focused region
2832
+ # from the raw output
2833
+ output_start_index = output.find(original_start_marker)
2834
+ output_focused_region = output[output_start_index:]
2835
+ expected_output = output_focused_region
2836
+
2837
+ return {"prompt": prompt, "expected_output": expected_output}
2838
+
2839
+
2840
class NextEditPredictionDataset(HuggingFaceDataset):
    """
    Dataset class for processing a Next Edit Prediction dataset.
    """

    SUPPORTED_DATASET_PATHS = {
        "zed-industries/zeta",
    }
    # Maps a dataset path to the function turning one raw record into a
    # dict with the keys "prompt" and "expected_output".
    MAPPING_PROMPT_FUNCS = {
        "zed-industries/zeta": _format_zeta_prompt,
    }

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """Build up to ``num_requests`` requests from ``self.data``.

        The expected output length of each request is the token count of
        the formatted record's "expected_output" text.

        Args:
            tokenizer: Used to count prompt and expected-output tokens.
            num_requests: Maximum number of requests collected from the data.
            request_id_prefix: Prepended to every generated request id.
            no_oversample: Forwarded to ``self.maybe_oversample_requests``.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            The list of sampled requests.

        Raises:
            ValueError: If ``self.hf_name`` has no entry in
                ``MAPPING_PROMPT_FUNCS``.
        """
        formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.hf_name)
        if formatting_prompt_func is None:
            raise ValueError(f"Unsupported dataset path: {self.hf_name}")

        samples = []
        for i, raw_record in enumerate(self.data):
            # Check the limit before doing any work so that a
            # non-positive num_requests yields no requests at all.
            if len(samples) >= num_requests:
                break
            formatted = formatting_prompt_func(raw_record)
            samples.append(
                SampleRequest(
                    prompt=formatted["prompt"],
                    prompt_len=len(tokenizer(formatted["prompt"]).input_ids),
                    expected_output_len=len(
                        tokenizer(formatted["expected_output"]).input_ids
                    ),
                    request_id=request_id_prefix + str(i),
                )
            )

        self.maybe_oversample_requests(
            samples, num_requests, request_id_prefix, no_oversample
        )
        return samples
2882
+
2883
+
2884
+ # -----------------------------------------------------------------------------
2885
+ # ASR Dataset Implementation
2886
+ # -----------------------------------------------------------------------------
2887
+
2888
+
2889
class ASRDataset(HuggingFaceDataset):
    """
    Dataset class for processing a ASR dataset for transcription.
    Tested on the following set:

    +----------------+----------------------------------------+--------------------------+-----------------------------+
    | Dataset        | Domain                                 | Speaking Style           | hf-subset                   |
    +----------------+----------------------------------------+--------------------------+-----------------------------+
    | TED-LIUM       | TED talks                              | Oratory                  | release1, release2, release3|
    |                |                                        |                          | release3-speaker-adaptation |
    | VoxPopuli      | European Parliament                    | Oratory                  | en, de, it, fr, ...         |
    | LibriSpeech    | Audiobook                              | Narrated                 | "LIUM/tedlium"              |
    | GigaSpeech     | Audiobook, podcast, YouTube            | Narrated, spontaneous    | xs, s, m, l, xl, dev, test  |
    | SPGISpeech     | Financial meetings                     | Oratory, spontaneous     | S, M, L, dev, test          |
    | AMI            | Meetings                               | Spontaneous              | ihm, sdm                    |
    +----------------+----------------------------------------+--------------------------+-----------------------------+

    NOTE(review): the hf-subset cell of the LibriSpeech row names the
    TED-LIUM dataset path and looks like a copy/paste slip - confirm.
    """  # noqa: E501

    SUPPORTED_DATASET_PATHS = {
        "openslr/librispeech_asr",
        "facebook/voxpopuli",
        "LIUM/tedlium",
        "edinburghcstr/ami",
        "speechcolab/gigaspeech",
        "kensho/spgispeech",
    }

    DEFAULT_OUTPUT_LEN = 128
    IS_MULTIMODAL = True

    # TODO Whisper-specific. Abstract interface when more models are supported.
    TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
    # When true, clips longer than 30 seconds are dropped during sampling.
    skip_long_audios: bool = True

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: int | None = None,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """Collect up to ``num_requests`` transcription requests.

        Every request shares the same text prompt (the transcription
        preamble) and carries its audio as ``{"audio": (array, rate)}``.
        ``output_len`` falls back to ``DEFAULT_OUTPUT_LEN`` when ``None``.
        """
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN

        # The prompt is identical for all requests, so tokenize it once.
        prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
        prompt_len = len(tokenizer(prompt).input_ids)

        requests = []
        num_skipped = 0
        for record in self.data:
            if len(requests) >= num_requests:
                break

            audio = record["audio"]
            waveform = audio["array"]
            sample_rate = audio["sampling_rate"]
            duration_s = librosa.get_duration(y=waveform, sr=sample_rate)

            # Whisper max supported duration
            if self.skip_long_audios and duration_s > 30:
                num_skipped += 1
                continue

            # Ids count accepted clips only, so skipped clips leave no gaps.
            request = SampleRequest(
                prompt=prompt,
                prompt_len=prompt_len,
                expected_output_len=output_len,
                multi_modal_data={"audio": (waveform, sample_rate)},
                request_id=request_id_prefix + str(len(requests)),
            )
            requests.append(request)

        if num_skipped:
            logger.warning(
                "%d samples discarded from dataset due to"
                " their length being greater than"
                " what Whisper supports.",
                num_skipped,
            )

        self.maybe_oversample_requests(
            requests, num_requests, request_id_prefix, no_oversample
        )
        return requests
2972
+
2973
+
2974
+ # -----------------------------------------------------------------------------
2975
+ # MLPerf Dataset Implementation
2976
+ # -----------------------------------------------------------------------------
2977
+
2978
+
2979
class MLPerfDataset(HuggingFaceDataset):
    """
    MLPerf Inference Dataset.

    Dataset on HF:
    https://huggingface.co/datasets/mgoin/mlperf-inference-llama2-data
    https://huggingface.co/datasets/mgoin/mlperf-inference-llama3.1-data

    Each record contains:
    - "system_prompt": system role instruction.
    - "question": user question.
    - "output": reference answer.

    The system prompt and question are combined into a chat-formatted prompt
    via the tokenizer's chat template. Unless an explicit output length is
    requested, the expected output length is the tokenized length of the
    reference answer.
    """

    SUPPORTED_DATASET_PATHS = {
        "mgoin/mlperf-inference-llama2-data",
        "mgoin/mlperf-inference-llama3.1-data",
    }

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: int | None = None,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list[SampleRequest]:
        """Collect up to ``num_requests`` chat-formatted requests.

        Records for which ``is_valid_sequence(prompt_len, expected_len)``
        is false are skipped.
        """
        # The reference answer length is used only when the caller did not
        # supply an explicit output length.
        use_reference_len = output_len is None
        requests: list[SampleRequest] = []

        for record in self.data:
            if len(requests) >= num_requests:
                break

            system_prompt = record["system_prompt"]
            question = record["question"]
            reference_answer = record["output"]

            # Render the system + user turns through the chat template.
            chat = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": question},
            ]
            prompt_text = tokenizer.apply_chat_template(
                chat, add_generation_prompt=True, tokenize=False
            )
            prompt_len = len(tokenizer(prompt_text).input_ids)

            # Token count of the reference answer, without special tokens.
            reference_ids = tokenizer(
                reference_answer, add_special_tokens=False
            ).input_ids
            reference_len = len(reference_ids)
            if use_reference_len:
                expected_len = reference_len
            else:
                expected_len = output_len

            # Drop records with unsuitable sequence lengths.
            if not is_valid_sequence(prompt_len, expected_len):
                continue

            # Ids count accepted records only.
            request = SampleRequest(
                prompt=prompt_text,
                prompt_len=prompt_len,
                expected_output_len=expected_len,
                request_id=request_id_prefix + str(len(requests)),
            )
            requests.append(request)

        self.maybe_oversample_requests(
            requests, num_requests, request_id_prefix, no_oversample
        )
        return requests
3058
+
3059
+
3060
+ # -----------------------------------------------------------------------------
3061
+ # Prefix Repetition Dataset Implementation
3062
+ # -----------------------------------------------------------------------------
3063
+
3064
+
3065
class PrefixRepetitionRandomDataset(BenchmarkDataset):
    """Synthetic dataset of random-token prompts that share common prefixes.

    ``num_prefixes`` random prefixes are generated, and each one is combined
    with ``num_requests // num_prefixes`` freshly generated random suffixes.
    """

    # Default values copied from benchmark_serving.py for the repeated prefix
    # dataset.
    DEFAULT_PREFIX_LEN = 256
    DEFAULT_SUFFIX_LEN = 256
    DEFAULT_NUM_PREFIXES = 10
    DEFAULT_OUTPUT_LEN = 128

    def __init__(
        self,
        **kwargs,
    ) -> None:
        """Initialise the base class and seed the global RNGs.

        Both ``random`` and ``numpy.random`` are seeded with
        ``self.random_seed``.
        """
        super().__init__(**kwargs)
        random.seed(self.random_seed)
        np.random.seed(self.random_seed)

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        prefix_len: int = DEFAULT_PREFIX_LEN,
        suffix_len: int = DEFAULT_SUFFIX_LEN,
        num_prefixes: int = DEFAULT_NUM_PREFIXES,
        output_len: int = DEFAULT_OUTPUT_LEN,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list[SampleRequest]:
        """Generate random prompts grouped by shared prefix.

        Args:
            tokenizer: Supplies ``vocab_size`` and decodes token ids.
            num_requests: Target number of requests. Only
                ``(num_requests // num_prefixes) * num_prefixes`` requests
                are produced; any remainder is dropped.
            prefix_len: Requested token length of each shared prefix.
            suffix_len: Requested token length of each unique suffix.
            num_prefixes: Number of distinct prefixes; must be positive.
            output_len: Expected output length stored on every request.
            request_id_prefix: Accepted for interface compatibility; the
                generated requests are created without a request id.
            no_oversample: Accepted for interface compatibility; unused.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            The generated requests, shuffled unless ``self.disable_shuffle``
            is set to a true value.

        Raises:
            ValueError: If ``num_prefixes`` is not positive, or if
                ``num_requests`` is smaller than ``num_prefixes``.
        """
        # Reject non-positive values explicitly: zero would otherwise fail
        # with a bare ZeroDivisionError below, and a negative value would
        # silently produce an empty result.
        if num_prefixes <= 0:
            raise ValueError(
                f"num_prefixes ({num_prefixes}) must be a positive integer"
            )

        vocab_size = tokenizer.vocab_size
        prompts_per_prefix = num_requests // num_prefixes
        if prompts_per_prefix == 0:
            raise ValueError(
                f"num_requests ({num_requests}) must be greater than or equal "
                f"to num_prefixes ({num_prefixes})"
            )

        def _generate_exact_length_tokens(
            target_length: int,
        ) -> tuple[list[int], int]:
            """Generate tokens that decode and re-encode to exactly
            target_length.

            Returns the adjusted token list together with the mismatch
            value reported by ``gen_prompt_decode_to_target_len``.
            """
            # Generate random tokens
            tokens = np.random.randint(0, vocab_size, size=target_length).tolist()

            _, adjusted_tokens, token_mismatch = gen_prompt_decode_to_target_len(  # noqa: E501
                tokenizer=tokenizer,
                token_sequence=tokens,
                target_token_len=target_length,
                add_special_tokens=False,
            )
            return adjusted_tokens, token_mismatch

        requests = []
        token_mismatch_total = 0
        for _ in range(num_prefixes):
            prefix_tokens, prefix_mismatch = _generate_exact_length_tokens(prefix_len)
            token_mismatch_total += prefix_mismatch

            for _ in range(prompts_per_prefix):
                suffix_tokens, suffix_mismatch = _generate_exact_length_tokens(
                    suffix_len
                )
                token_mismatch_total += suffix_mismatch
                combined_tokens = prefix_tokens + suffix_tokens
                prompt = tokenizer.decode(combined_tokens)
                prompt_len = len(combined_tokens)
                requests.append(
                    SampleRequest(
                        prompt=prompt,
                        prompt_len=prompt_len,
                        expected_output_len=output_len,
                    )
                )

        if token_mismatch_total != 0:
            sign = "more" if token_mismatch_total > 0 else "fewer"
            logger.warning(
                "Across all generated prompts, there were %d %s tokens "
                "than expected after decoding and re-encoding. This is "
                "expected due to the imperfect nature of the sampling "
                "procedure.",
                abs(token_mismatch_total),
                sign,
            )
        if not getattr(self, "disable_shuffle", False):
            random.shuffle(requests)
        return requests
3150
+
3151
+
3152
+ # -----------------------------------------------------------------------------
3153
+ # MMStar Dataset Implementation
3154
+ # -----------------------------------------------------------------------------
3155
+
3156
+
3157
class MMStarDataset(HuggingFaceDataset):
    """
    Lin-Chen/MMStar: https://huggingface.co/datasets/Lin-Chen/MMStar
    refer to: https://github.com/sgl-project/SpecForge/pull/106
    """

    DEFAULT_OUTPUT_LEN = 128
    SUPPORTED_DATASET_PATHS = {"Lin-Chen/MMStar"}
    IS_MULTIMODAL = True

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: int | None = None,
        enable_multimodal_chat: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list[SampleRequest]:
        """Collect up to ``num_requests`` image + question requests.

        The prompt text is the part of each record's "question" that comes
        before the first "Options:" marker. ``prompt_len`` always counts the
        tokens of that plain text, regardless of ``enable_multimodal_chat``.
        """
        # If --hf-output-len is not set, use the default output length.
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN
        requests: list[SampleRequest] = []

        for index, record in enumerate(self.data):
            if len(requests) >= num_requests:
                break

            # Keep only the text in front of "Options:"; a question
            # without that marker is kept whole.
            raw_question: str = record.get("question", "")
            question_text = raw_question.partition("Options:")[0].strip()

            # Multimodal image content.
            image_content = process_image(record["image"])

            # Token length of the plain question text.
            prompt_len = len(tokenizer(question_text).input_ids)

            if not enable_multimodal_chat:
                # Default: prompt is plain text,
                # image travels separately for the bench to assemble.
                prompt = question_text
                request_mm_data = image_content
            else:
                # Embed the image in the chat message itself,
                # i.e. [{"role":"user","content":[...]}]
                prompt = self.apply_multimodal_chat_transformation(
                    question_text, image_content
                )
                request_mm_data = None  # Already embedded in chat content.

            request = SampleRequest(
                prompt=prompt,
                prompt_len=prompt_len,
                expected_output_len=output_len,
                multi_modal_data=request_mm_data,
                request_id=request_id_prefix + str(index),
            )
            requests.append(request)

        self.maybe_oversample_requests(
            requests, num_requests, request_id_prefix, no_oversample
        )
        return requests