vllm-cpu-avx512vnni 0.13.0__cp313-cp313-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of vllm-cpu-avx512vnni might be problematic. Click here for more details.

Files changed (1641) hide show
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +225 -0
  3. vllm/_aiter_ops.py +1260 -0
  4. vllm/_bc_linter.py +54 -0
  5. vllm/_custom_ops.py +3080 -0
  6. vllm/_ipex_ops.py +457 -0
  7. vllm/_version.py +34 -0
  8. vllm/assets/__init__.py +0 -0
  9. vllm/assets/audio.py +43 -0
  10. vllm/assets/base.py +40 -0
  11. vllm/assets/image.py +59 -0
  12. vllm/assets/video.py +149 -0
  13. vllm/attention/__init__.py +0 -0
  14. vllm/attention/backends/__init__.py +0 -0
  15. vllm/attention/backends/abstract.py +443 -0
  16. vllm/attention/backends/registry.py +254 -0
  17. vllm/attention/backends/utils.py +33 -0
  18. vllm/attention/layer.py +969 -0
  19. vllm/attention/layers/__init__.py +0 -0
  20. vllm/attention/layers/chunked_local_attention.py +120 -0
  21. vllm/attention/layers/cross_attention.py +178 -0
  22. vllm/attention/layers/encoder_only_attention.py +103 -0
  23. vllm/attention/layers/mm_encoder_attention.py +284 -0
  24. vllm/attention/ops/__init__.py +0 -0
  25. vllm/attention/ops/chunked_prefill_paged_decode.py +401 -0
  26. vllm/attention/ops/common.py +469 -0
  27. vllm/attention/ops/flashmla.py +251 -0
  28. vllm/attention/ops/merge_attn_states.py +47 -0
  29. vllm/attention/ops/paged_attn.py +51 -0
  30. vllm/attention/ops/pallas_kv_cache_update.py +130 -0
  31. vllm/attention/ops/prefix_prefill.py +814 -0
  32. vllm/attention/ops/rocm_aiter_mla_sparse.py +210 -0
  33. vllm/attention/ops/triton_decode_attention.py +712 -0
  34. vllm/attention/ops/triton_merge_attn_states.py +116 -0
  35. vllm/attention/ops/triton_reshape_and_cache_flash.py +184 -0
  36. vllm/attention/ops/triton_unified_attention.py +1047 -0
  37. vllm/attention/ops/vit_attn_wrappers.py +139 -0
  38. vllm/attention/selector.py +145 -0
  39. vllm/attention/utils/__init__.py +0 -0
  40. vllm/attention/utils/fa_utils.py +118 -0
  41. vllm/attention/utils/kv_sharing_utils.py +33 -0
  42. vllm/attention/utils/kv_transfer_utils.py +60 -0
  43. vllm/beam_search.py +88 -0
  44. vllm/benchmarks/__init__.py +0 -0
  45. vllm/benchmarks/datasets.py +3228 -0
  46. vllm/benchmarks/latency.py +170 -0
  47. vllm/benchmarks/lib/__init__.py +3 -0
  48. vllm/benchmarks/lib/endpoint_request_func.py +777 -0
  49. vllm/benchmarks/lib/ready_checker.py +72 -0
  50. vllm/benchmarks/lib/utils.py +79 -0
  51. vllm/benchmarks/serve.py +1538 -0
  52. vllm/benchmarks/startup.py +326 -0
  53. vllm/benchmarks/sweep/__init__.py +0 -0
  54. vllm/benchmarks/sweep/cli.py +41 -0
  55. vllm/benchmarks/sweep/param_sweep.py +158 -0
  56. vllm/benchmarks/sweep/plot.py +675 -0
  57. vllm/benchmarks/sweep/plot_pareto.py +393 -0
  58. vllm/benchmarks/sweep/serve.py +450 -0
  59. vllm/benchmarks/sweep/serve_sla.py +492 -0
  60. vllm/benchmarks/sweep/server.py +114 -0
  61. vllm/benchmarks/sweep/sla_sweep.py +132 -0
  62. vllm/benchmarks/sweep/utils.py +4 -0
  63. vllm/benchmarks/throughput.py +808 -0
  64. vllm/collect_env.py +857 -0
  65. vllm/compilation/__init__.py +0 -0
  66. vllm/compilation/activation_quant_fusion.py +209 -0
  67. vllm/compilation/backends.py +839 -0
  68. vllm/compilation/base_static_graph.py +57 -0
  69. vllm/compilation/caching.py +180 -0
  70. vllm/compilation/collective_fusion.py +1215 -0
  71. vllm/compilation/compiler_interface.py +639 -0
  72. vllm/compilation/counter.py +48 -0
  73. vllm/compilation/cuda_graph.py +302 -0
  74. vllm/compilation/decorators.py +626 -0
  75. vllm/compilation/fix_functionalization.py +266 -0
  76. vllm/compilation/fusion.py +550 -0
  77. vllm/compilation/fusion_attn.py +359 -0
  78. vllm/compilation/fx_utils.py +91 -0
  79. vllm/compilation/inductor_pass.py +138 -0
  80. vllm/compilation/matcher_utils.py +361 -0
  81. vllm/compilation/monitor.py +62 -0
  82. vllm/compilation/noop_elimination.py +130 -0
  83. vllm/compilation/partition_rules.py +72 -0
  84. vllm/compilation/pass_manager.py +155 -0
  85. vllm/compilation/piecewise_backend.py +178 -0
  86. vllm/compilation/post_cleanup.py +21 -0
  87. vllm/compilation/qk_norm_rope_fusion.py +238 -0
  88. vllm/compilation/rocm_aiter_fusion.py +242 -0
  89. vllm/compilation/sequence_parallelism.py +364 -0
  90. vllm/compilation/torch25_custom_graph_pass.py +44 -0
  91. vllm/compilation/vllm_inductor_pass.py +173 -0
  92. vllm/compilation/wrapper.py +319 -0
  93. vllm/config/__init__.py +108 -0
  94. vllm/config/attention.py +114 -0
  95. vllm/config/cache.py +232 -0
  96. vllm/config/compilation.py +1140 -0
  97. vllm/config/device.py +75 -0
  98. vllm/config/ec_transfer.py +110 -0
  99. vllm/config/kv_events.py +56 -0
  100. vllm/config/kv_transfer.py +119 -0
  101. vllm/config/load.py +124 -0
  102. vllm/config/lora.py +96 -0
  103. vllm/config/model.py +2190 -0
  104. vllm/config/multimodal.py +247 -0
  105. vllm/config/observability.py +140 -0
  106. vllm/config/parallel.py +660 -0
  107. vllm/config/pooler.py +126 -0
  108. vllm/config/profiler.py +199 -0
  109. vllm/config/scheduler.py +299 -0
  110. vllm/config/speculative.py +644 -0
  111. vllm/config/speech_to_text.py +38 -0
  112. vllm/config/structured_outputs.py +78 -0
  113. vllm/config/utils.py +370 -0
  114. vllm/config/vllm.py +1434 -0
  115. vllm/connections.py +189 -0
  116. vllm/device_allocator/__init__.py +0 -0
  117. vllm/device_allocator/cumem.py +327 -0
  118. vllm/distributed/__init__.py +6 -0
  119. vllm/distributed/communication_op.py +43 -0
  120. vllm/distributed/device_communicators/__init__.py +0 -0
  121. vllm/distributed/device_communicators/all2all.py +490 -0
  122. vllm/distributed/device_communicators/all_reduce_utils.py +344 -0
  123. vllm/distributed/device_communicators/base_device_communicator.py +297 -0
  124. vllm/distributed/device_communicators/cpu_communicator.py +209 -0
  125. vllm/distributed/device_communicators/cuda_communicator.py +340 -0
  126. vllm/distributed/device_communicators/cuda_wrapper.py +216 -0
  127. vllm/distributed/device_communicators/custom_all_reduce.py +326 -0
  128. vllm/distributed/device_communicators/mnnvl_compat.py +27 -0
  129. vllm/distributed/device_communicators/pynccl.py +386 -0
  130. vllm/distributed/device_communicators/pynccl_allocator.py +191 -0
  131. vllm/distributed/device_communicators/pynccl_wrapper.py +564 -0
  132. vllm/distributed/device_communicators/quick_all_reduce.py +290 -0
  133. vllm/distributed/device_communicators/ray_communicator.py +259 -0
  134. vllm/distributed/device_communicators/shm_broadcast.py +778 -0
  135. vllm/distributed/device_communicators/shm_object_storage.py +697 -0
  136. vllm/distributed/device_communicators/symm_mem.py +156 -0
  137. vllm/distributed/device_communicators/tpu_communicator.py +99 -0
  138. vllm/distributed/device_communicators/xpu_communicator.py +95 -0
  139. vllm/distributed/ec_transfer/__init__.py +14 -0
  140. vllm/distributed/ec_transfer/ec_connector/__init__.py +0 -0
  141. vllm/distributed/ec_transfer/ec_connector/base.py +247 -0
  142. vllm/distributed/ec_transfer/ec_connector/example_connector.py +201 -0
  143. vllm/distributed/ec_transfer/ec_connector/factory.py +85 -0
  144. vllm/distributed/ec_transfer/ec_transfer_state.py +42 -0
  145. vllm/distributed/eplb/__init__.py +3 -0
  146. vllm/distributed/eplb/async_worker.py +115 -0
  147. vllm/distributed/eplb/eplb_state.py +1164 -0
  148. vllm/distributed/eplb/policy/__init__.py +19 -0
  149. vllm/distributed/eplb/policy/abstract.py +40 -0
  150. vllm/distributed/eplb/policy/default.py +267 -0
  151. vllm/distributed/eplb/rebalance_execute.py +529 -0
  152. vllm/distributed/kv_events.py +499 -0
  153. vllm/distributed/kv_transfer/README.md +29 -0
  154. vllm/distributed/kv_transfer/__init__.py +20 -0
  155. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  156. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  157. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  158. vllm/distributed/kv_transfer/kv_connector/factory.py +197 -0
  159. vllm/distributed/kv_transfer/kv_connector/utils.py +322 -0
  160. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +19 -0
  161. vllm/distributed/kv_transfer/kv_connector/v1/base.py +597 -0
  162. vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py +419 -0
  163. vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py +450 -0
  164. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +327 -0
  165. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py +18 -0
  166. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +378 -0
  167. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py +221 -0
  168. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +1418 -0
  169. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +895 -0
  170. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +186 -0
  171. vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py +914 -0
  172. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +464 -0
  173. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +2526 -0
  174. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +538 -0
  175. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  176. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +531 -0
  177. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +632 -0
  178. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +273 -0
  179. vllm/distributed/kv_transfer/kv_transfer_state.py +78 -0
  180. vllm/distributed/parallel_state.py +1795 -0
  181. vllm/distributed/tpu_distributed_utils.py +188 -0
  182. vllm/distributed/utils.py +545 -0
  183. vllm/engine/__init__.py +0 -0
  184. vllm/engine/arg_utils.py +2068 -0
  185. vllm/engine/async_llm_engine.py +6 -0
  186. vllm/engine/llm_engine.py +6 -0
  187. vllm/engine/protocol.py +190 -0
  188. vllm/entrypoints/__init__.py +0 -0
  189. vllm/entrypoints/anthropic/__init__.py +0 -0
  190. vllm/entrypoints/anthropic/protocol.py +162 -0
  191. vllm/entrypoints/anthropic/serving_messages.py +468 -0
  192. vllm/entrypoints/api_server.py +185 -0
  193. vllm/entrypoints/chat_utils.py +1903 -0
  194. vllm/entrypoints/cli/__init__.py +15 -0
  195. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  196. vllm/entrypoints/cli/benchmark/base.py +25 -0
  197. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  198. vllm/entrypoints/cli/benchmark/main.py +56 -0
  199. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  200. vllm/entrypoints/cli/benchmark/startup.py +21 -0
  201. vllm/entrypoints/cli/benchmark/sweep.py +21 -0
  202. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  203. vllm/entrypoints/cli/collect_env.py +38 -0
  204. vllm/entrypoints/cli/main.py +79 -0
  205. vllm/entrypoints/cli/openai.py +260 -0
  206. vllm/entrypoints/cli/run_batch.py +68 -0
  207. vllm/entrypoints/cli/serve.py +249 -0
  208. vllm/entrypoints/cli/types.py +29 -0
  209. vllm/entrypoints/constants.py +12 -0
  210. vllm/entrypoints/context.py +835 -0
  211. vllm/entrypoints/launcher.py +175 -0
  212. vllm/entrypoints/llm.py +1790 -0
  213. vllm/entrypoints/logger.py +84 -0
  214. vllm/entrypoints/openai/__init__.py +0 -0
  215. vllm/entrypoints/openai/api_server.py +1469 -0
  216. vllm/entrypoints/openai/cli_args.py +302 -0
  217. vllm/entrypoints/openai/orca_metrics.py +120 -0
  218. vllm/entrypoints/openai/parser/__init__.py +0 -0
  219. vllm/entrypoints/openai/parser/harmony_utils.py +825 -0
  220. vllm/entrypoints/openai/parser/responses_parser.py +135 -0
  221. vllm/entrypoints/openai/protocol.py +2496 -0
  222. vllm/entrypoints/openai/run_batch.py +631 -0
  223. vllm/entrypoints/openai/serving_chat.py +1822 -0
  224. vllm/entrypoints/openai/serving_completion.py +729 -0
  225. vllm/entrypoints/openai/serving_engine.py +1542 -0
  226. vllm/entrypoints/openai/serving_models.py +304 -0
  227. vllm/entrypoints/openai/serving_responses.py +2080 -0
  228. vllm/entrypoints/openai/serving_transcription.py +168 -0
  229. vllm/entrypoints/openai/speech_to_text.py +559 -0
  230. vllm/entrypoints/openai/tool_parsers/__init__.py +33 -0
  231. vllm/entrypoints/openai/utils.py +49 -0
  232. vllm/entrypoints/pooling/__init__.py +16 -0
  233. vllm/entrypoints/pooling/classify/__init__.py +0 -0
  234. vllm/entrypoints/pooling/classify/api_router.py +50 -0
  235. vllm/entrypoints/pooling/classify/protocol.py +181 -0
  236. vllm/entrypoints/pooling/classify/serving.py +233 -0
  237. vllm/entrypoints/pooling/embed/__init__.py +0 -0
  238. vllm/entrypoints/pooling/embed/api_router.py +67 -0
  239. vllm/entrypoints/pooling/embed/protocol.py +208 -0
  240. vllm/entrypoints/pooling/embed/serving.py +684 -0
  241. vllm/entrypoints/pooling/pooling/__init__.py +0 -0
  242. vllm/entrypoints/pooling/pooling/api_router.py +63 -0
  243. vllm/entrypoints/pooling/pooling/protocol.py +148 -0
  244. vllm/entrypoints/pooling/pooling/serving.py +354 -0
  245. vllm/entrypoints/pooling/score/__init__.py +0 -0
  246. vllm/entrypoints/pooling/score/api_router.py +149 -0
  247. vllm/entrypoints/pooling/score/protocol.py +146 -0
  248. vllm/entrypoints/pooling/score/serving.py +508 -0
  249. vllm/entrypoints/renderer.py +410 -0
  250. vllm/entrypoints/responses_utils.py +249 -0
  251. vllm/entrypoints/sagemaker/__init__.py +4 -0
  252. vllm/entrypoints/sagemaker/routes.py +118 -0
  253. vllm/entrypoints/score_utils.py +237 -0
  254. vllm/entrypoints/serve/__init__.py +60 -0
  255. vllm/entrypoints/serve/disagg/__init__.py +0 -0
  256. vllm/entrypoints/serve/disagg/api_router.py +110 -0
  257. vllm/entrypoints/serve/disagg/protocol.py +90 -0
  258. vllm/entrypoints/serve/disagg/serving.py +285 -0
  259. vllm/entrypoints/serve/elastic_ep/__init__.py +0 -0
  260. vllm/entrypoints/serve/elastic_ep/api_router.py +96 -0
  261. vllm/entrypoints/serve/elastic_ep/middleware.py +49 -0
  262. vllm/entrypoints/serve/instrumentator/__init__.py +0 -0
  263. vllm/entrypoints/serve/instrumentator/health.py +33 -0
  264. vllm/entrypoints/serve/instrumentator/metrics.py +45 -0
  265. vllm/entrypoints/serve/lora/__init__.py +0 -0
  266. vllm/entrypoints/serve/lora/api_router.py +70 -0
  267. vllm/entrypoints/serve/profile/__init__.py +0 -0
  268. vllm/entrypoints/serve/profile/api_router.py +46 -0
  269. vllm/entrypoints/serve/rlhf/__init__.py +0 -0
  270. vllm/entrypoints/serve/rlhf/api_router.py +102 -0
  271. vllm/entrypoints/serve/sleep/__init__.py +0 -0
  272. vllm/entrypoints/serve/sleep/api_router.py +60 -0
  273. vllm/entrypoints/serve/tokenize/__init__.py +0 -0
  274. vllm/entrypoints/serve/tokenize/api_router.py +118 -0
  275. vllm/entrypoints/serve/tokenize/serving.py +204 -0
  276. vllm/entrypoints/ssl.py +78 -0
  277. vllm/entrypoints/tool.py +187 -0
  278. vllm/entrypoints/tool_server.py +234 -0
  279. vllm/entrypoints/utils.py +319 -0
  280. vllm/env_override.py +378 -0
  281. vllm/envs.py +1744 -0
  282. vllm/forward_context.py +358 -0
  283. vllm/inputs/__init__.py +44 -0
  284. vllm/inputs/data.py +359 -0
  285. vllm/inputs/parse.py +146 -0
  286. vllm/inputs/preprocess.py +717 -0
  287. vllm/logger.py +303 -0
  288. vllm/logging_utils/__init__.py +13 -0
  289. vllm/logging_utils/dump_input.py +83 -0
  290. vllm/logging_utils/formatter.py +127 -0
  291. vllm/logging_utils/lazy.py +20 -0
  292. vllm/logging_utils/log_time.py +34 -0
  293. vllm/logits_process.py +121 -0
  294. vllm/logprobs.py +206 -0
  295. vllm/lora/__init__.py +0 -0
  296. vllm/lora/layers/__init__.py +42 -0
  297. vllm/lora/layers/base.py +66 -0
  298. vllm/lora/layers/base_linear.py +165 -0
  299. vllm/lora/layers/column_parallel_linear.py +577 -0
  300. vllm/lora/layers/fused_moe.py +747 -0
  301. vllm/lora/layers/logits_processor.py +203 -0
  302. vllm/lora/layers/replicated_linear.py +70 -0
  303. vllm/lora/layers/row_parallel_linear.py +176 -0
  304. vllm/lora/layers/utils.py +74 -0
  305. vllm/lora/layers/vocal_parallel_embedding.py +140 -0
  306. vllm/lora/lora_model.py +246 -0
  307. vllm/lora/lora_weights.py +227 -0
  308. vllm/lora/model_manager.py +690 -0
  309. vllm/lora/ops/__init__.py +0 -0
  310. vllm/lora/ops/ipex_ops/__init__.py +6 -0
  311. vllm/lora/ops/ipex_ops/lora_ops.py +57 -0
  312. vllm/lora/ops/torch_ops/__init__.py +20 -0
  313. vllm/lora/ops/torch_ops/lora_ops.py +128 -0
  314. vllm/lora/ops/triton_ops/README_TUNING.md +60 -0
  315. vllm/lora/ops/triton_ops/__init__.py +21 -0
  316. vllm/lora/ops/triton_ops/fused_moe_lora_op.py +665 -0
  317. vllm/lora/ops/triton_ops/kernel_utils.py +340 -0
  318. vllm/lora/ops/triton_ops/lora_expand_op.py +310 -0
  319. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +154 -0
  320. vllm/lora/ops/triton_ops/lora_shrink_op.py +287 -0
  321. vllm/lora/ops/triton_ops/utils.py +295 -0
  322. vllm/lora/ops/xla_ops/__init__.py +6 -0
  323. vllm/lora/ops/xla_ops/lora_ops.py +141 -0
  324. vllm/lora/peft_helper.py +128 -0
  325. vllm/lora/punica_wrapper/__init__.py +10 -0
  326. vllm/lora/punica_wrapper/punica_base.py +493 -0
  327. vllm/lora/punica_wrapper/punica_cpu.py +351 -0
  328. vllm/lora/punica_wrapper/punica_gpu.py +412 -0
  329. vllm/lora/punica_wrapper/punica_selector.py +21 -0
  330. vllm/lora/punica_wrapper/punica_tpu.py +358 -0
  331. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  332. vllm/lora/punica_wrapper/utils.py +150 -0
  333. vllm/lora/request.py +100 -0
  334. vllm/lora/resolver.py +88 -0
  335. vllm/lora/utils.py +315 -0
  336. vllm/lora/worker_manager.py +268 -0
  337. vllm/model_executor/__init__.py +11 -0
  338. vllm/model_executor/custom_op.py +199 -0
  339. vllm/model_executor/layers/__init__.py +0 -0
  340. vllm/model_executor/layers/activation.py +595 -0
  341. vllm/model_executor/layers/attention_layer_base.py +32 -0
  342. vllm/model_executor/layers/batch_invariant.py +1067 -0
  343. vllm/model_executor/layers/conv.py +256 -0
  344. vllm/model_executor/layers/fla/__init__.py +8 -0
  345. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  346. vllm/model_executor/layers/fla/ops/chunk.py +240 -0
  347. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +344 -0
  348. vllm/model_executor/layers/fla/ops/chunk_o.py +183 -0
  349. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +154 -0
  350. vllm/model_executor/layers/fla/ops/cumsum.py +280 -0
  351. vllm/model_executor/layers/fla/ops/fused_recurrent.py +390 -0
  352. vllm/model_executor/layers/fla/ops/index.py +41 -0
  353. vllm/model_executor/layers/fla/ops/kda.py +1351 -0
  354. vllm/model_executor/layers/fla/ops/l2norm.py +146 -0
  355. vllm/model_executor/layers/fla/ops/layernorm_guard.py +396 -0
  356. vllm/model_executor/layers/fla/ops/op.py +60 -0
  357. vllm/model_executor/layers/fla/ops/solve_tril.py +556 -0
  358. vllm/model_executor/layers/fla/ops/utils.py +194 -0
  359. vllm/model_executor/layers/fla/ops/wy_fast.py +158 -0
  360. vllm/model_executor/layers/fused_moe/__init__.py +114 -0
  361. vllm/model_executor/layers/fused_moe/all2all_utils.py +171 -0
  362. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +409 -0
  363. vllm/model_executor/layers/fused_moe/config.py +1043 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json +147 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json +213 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json +147 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json +146 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json +201 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +147 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json +164 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json +147 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json +147 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json +147 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  559. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  560. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  561. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  562. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  563. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  564. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  565. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  566. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  567. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  568. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  569. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  570. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  571. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  572. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  573. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  574. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  575. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  576. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  577. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  578. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  579. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  580. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  581. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  582. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  583. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  584. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  585. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  586. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  587. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  588. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  589. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  590. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  591. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  592. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  593. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  594. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  595. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  596. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  597. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  598. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  599. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  600. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  601. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  602. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  603. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  604. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  605. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  606. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  607. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  608. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  609. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  610. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  611. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  612. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  613. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  614. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  615. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  616. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  617. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  618. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  619. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  620. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  621. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  622. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  623. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  624. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  625. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  626. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  627. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  628. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  629. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  630. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  631. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  632. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  633. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  634. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  635. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  636. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  637. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  638. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  639. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +292 -0
  640. vllm/model_executor/layers/fused_moe/cutlass_moe.py +1453 -0
  641. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +358 -0
  642. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +427 -0
  643. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +420 -0
  644. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +434 -0
  645. vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py +376 -0
  646. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +307 -0
  647. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +362 -0
  648. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +192 -0
  649. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1012 -0
  650. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +825 -0
  651. vllm/model_executor/layers/fused_moe/fused_moe.py +2223 -0
  652. vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +103 -0
  653. vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +119 -0
  654. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +524 -0
  655. vllm/model_executor/layers/fused_moe/layer.py +2133 -0
  656. vllm/model_executor/layers/fused_moe/modular_kernel.py +1302 -0
  657. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +192 -0
  658. vllm/model_executor/layers/fused_moe/moe_pallas.py +83 -0
  659. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +229 -0
  660. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  661. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +362 -0
  662. vllm/model_executor/layers/fused_moe/prepare_finalize.py +78 -0
  663. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +265 -0
  664. vllm/model_executor/layers/fused_moe/routing_simulator.py +310 -0
  665. vllm/model_executor/layers/fused_moe/shared_fused_moe.py +96 -0
  666. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +171 -0
  667. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +163 -0
  668. vllm/model_executor/layers/fused_moe/trtllm_moe.py +143 -0
  669. vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +455 -0
  670. vllm/model_executor/layers/fused_moe/utils.py +332 -0
  671. vllm/model_executor/layers/kda.py +442 -0
  672. vllm/model_executor/layers/layernorm.py +442 -0
  673. vllm/model_executor/layers/lightning_attn.py +735 -0
  674. vllm/model_executor/layers/linear.py +1424 -0
  675. vllm/model_executor/layers/logits_processor.py +106 -0
  676. vllm/model_executor/layers/mamba/__init__.py +0 -0
  677. vllm/model_executor/layers/mamba/abstract.py +68 -0
  678. vllm/model_executor/layers/mamba/linear_attn.py +388 -0
  679. vllm/model_executor/layers/mamba/mamba_mixer.py +526 -0
  680. vllm/model_executor/layers/mamba/mamba_mixer2.py +930 -0
  681. vllm/model_executor/layers/mamba/mamba_utils.py +225 -0
  682. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  683. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1240 -0
  684. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +172 -0
  685. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +586 -0
  686. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +211 -0
  687. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +456 -0
  688. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +700 -0
  689. vllm/model_executor/layers/mamba/ops/ssd_combined.py +230 -0
  690. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +157 -0
  691. vllm/model_executor/layers/mamba/short_conv.py +255 -0
  692. vllm/model_executor/layers/mla.py +176 -0
  693. vllm/model_executor/layers/pooler.py +830 -0
  694. vllm/model_executor/layers/quantization/__init__.py +179 -0
  695. vllm/model_executor/layers/quantization/auto_round.py +454 -0
  696. vllm/model_executor/layers/quantization/awq.py +277 -0
  697. vllm/model_executor/layers/quantization/awq_marlin.py +793 -0
  698. vllm/model_executor/layers/quantization/awq_triton.py +337 -0
  699. vllm/model_executor/layers/quantization/base_config.py +170 -0
  700. vllm/model_executor/layers/quantization/bitblas.py +502 -0
  701. vllm/model_executor/layers/quantization/bitsandbytes.py +626 -0
  702. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +3 -0
  703. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +986 -0
  704. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2645 -0
  705. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +35 -0
  706. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +392 -0
  707. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  708. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +176 -0
  709. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +124 -0
  710. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +218 -0
  711. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +176 -0
  712. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +153 -0
  713. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +138 -0
  714. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +200 -0
  715. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +125 -0
  716. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +230 -0
  717. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  718. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +260 -0
  719. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +173 -0
  720. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  721. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +64 -0
  722. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  723. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +224 -0
  724. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  725. vllm/model_executor/layers/quantization/cpu_wna16.py +625 -0
  726. vllm/model_executor/layers/quantization/deepspeedfp.py +218 -0
  727. vllm/model_executor/layers/quantization/experts_int8.py +207 -0
  728. vllm/model_executor/layers/quantization/fbgemm_fp8.py +195 -0
  729. vllm/model_executor/layers/quantization/fp8.py +1461 -0
  730. vllm/model_executor/layers/quantization/fp_quant.py +420 -0
  731. vllm/model_executor/layers/quantization/gguf.py +677 -0
  732. vllm/model_executor/layers/quantization/gptq.py +393 -0
  733. vllm/model_executor/layers/quantization/gptq_bitblas.py +482 -0
  734. vllm/model_executor/layers/quantization/gptq_marlin.py +932 -0
  735. vllm/model_executor/layers/quantization/gptq_marlin_24.py +320 -0
  736. vllm/model_executor/layers/quantization/hqq_marlin.py +372 -0
  737. vllm/model_executor/layers/quantization/inc.py +65 -0
  738. vllm/model_executor/layers/quantization/input_quant_fp8.py +202 -0
  739. vllm/model_executor/layers/quantization/ipex_quant.py +487 -0
  740. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  741. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +94 -0
  742. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +109 -0
  743. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  744. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +323 -0
  745. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +98 -0
  746. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +130 -0
  747. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +111 -0
  748. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +161 -0
  749. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +159 -0
  750. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +200 -0
  751. vllm/model_executor/layers/quantization/kernels/mixed_precision/xpu.py +97 -0
  752. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +76 -0
  753. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +81 -0
  754. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +128 -0
  755. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +220 -0
  756. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +147 -0
  757. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +71 -0
  758. vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +106 -0
  759. vllm/model_executor/layers/quantization/kv_cache.py +153 -0
  760. vllm/model_executor/layers/quantization/modelopt.py +1684 -0
  761. vllm/model_executor/layers/quantization/moe_wna16.py +516 -0
  762. vllm/model_executor/layers/quantization/mxfp4.py +1140 -0
  763. vllm/model_executor/layers/quantization/petit.py +319 -0
  764. vllm/model_executor/layers/quantization/ptpc_fp8.py +136 -0
  765. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  766. vllm/model_executor/layers/quantization/quark/quark.py +527 -0
  767. vllm/model_executor/layers/quantization/quark/quark_moe.py +622 -0
  768. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  769. vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py +343 -0
  770. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  771. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +179 -0
  772. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +139 -0
  773. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  774. vllm/model_executor/layers/quantization/qutlass_utils.py +185 -0
  775. vllm/model_executor/layers/quantization/rtn.py +621 -0
  776. vllm/model_executor/layers/quantization/schema.py +90 -0
  777. vllm/model_executor/layers/quantization/torchao.py +380 -0
  778. vllm/model_executor/layers/quantization/tpu_int8.py +139 -0
  779. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  780. vllm/model_executor/layers/quantization/utils/allspark_utils.py +67 -0
  781. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +229 -0
  782. vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  783. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  784. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  785. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  786. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  787. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  788. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  789. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  790. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  791. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  792. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  793. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  794. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  795. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  796. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  797. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  798. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  799. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  800. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  801. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  802. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  803. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  804. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  805. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  806. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  807. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  808. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  809. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  810. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  811. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  812. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  902. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  903. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  904. vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  905. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  906. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  907. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  908. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  909. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  910. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  911. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  912. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  913. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  914. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  915. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  916. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  917. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  918. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  919. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  920. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  921. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  922. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  923. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  924. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  925. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  926. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  927. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  928. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  929. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  930. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  931. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  932. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  933. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  934. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  935. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  936. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  937. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  938. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  939. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  940. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  941. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  942. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  943. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  944. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  945. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  946. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  947. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  948. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  949. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  950. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  951. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  952. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  953. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  954. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  955. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  956. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  957. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  958. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  959. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  960. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  961. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  962. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  963. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  964. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  965. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  966. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  967. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  968. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  969. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  970. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  971. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  972. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  973. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  974. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  975. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  976. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  977. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  978. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  979. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  980. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  981. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  982. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  983. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  984. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  985. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  986. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  987. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  988. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  989. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  990. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  991. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  992. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  993. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  994. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  995. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  996. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  997. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +412 -0
  998. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +312 -0
  999. vllm/model_executor/layers/quantization/utils/fp8_utils.py +1453 -0
  1000. vllm/model_executor/layers/quantization/utils/gptq_utils.py +158 -0
  1001. vllm/model_executor/layers/quantization/utils/int8_utils.py +474 -0
  1002. vllm/model_executor/layers/quantization/utils/layer_utils.py +41 -0
  1003. vllm/model_executor/layers/quantization/utils/machete_utils.py +56 -0
  1004. vllm/model_executor/layers/quantization/utils/marlin_utils.py +678 -0
  1005. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +452 -0
  1006. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +381 -0
  1007. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +219 -0
  1008. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +467 -0
  1009. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +189 -0
  1010. vllm/model_executor/layers/quantization/utils/mxfp6_utils.py +142 -0
  1011. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +24 -0
  1012. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +142 -0
  1013. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +67 -0
  1014. vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py +51 -0
  1015. vllm/model_executor/layers/quantization/utils/petit_utils.py +124 -0
  1016. vllm/model_executor/layers/quantization/utils/quant_utils.py +741 -0
  1017. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +519 -0
  1018. vllm/model_executor/layers/resampler.py +283 -0
  1019. vllm/model_executor/layers/rotary_embedding/__init__.py +289 -0
  1020. vllm/model_executor/layers/rotary_embedding/base.py +254 -0
  1021. vllm/model_executor/layers/rotary_embedding/common.py +279 -0
  1022. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +165 -0
  1023. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +215 -0
  1024. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +43 -0
  1025. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +68 -0
  1026. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +82 -0
  1027. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  1028. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  1029. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +80 -0
  1030. vllm/model_executor/layers/rotary_embedding/mrope.py +412 -0
  1031. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +47 -0
  1032. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +159 -0
  1033. vllm/model_executor/layers/rotary_embedding/xdrope.py +160 -0
  1034. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +84 -0
  1035. vllm/model_executor/layers/utils.py +251 -0
  1036. vllm/model_executor/layers/vocab_parallel_embedding.py +558 -0
  1037. vllm/model_executor/model_loader/__init__.py +150 -0
  1038. vllm/model_executor/model_loader/base_loader.py +57 -0
  1039. vllm/model_executor/model_loader/bitsandbytes_loader.py +822 -0
  1040. vllm/model_executor/model_loader/default_loader.py +321 -0
  1041. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  1042. vllm/model_executor/model_loader/gguf_loader.py +371 -0
  1043. vllm/model_executor/model_loader/online_quantization.py +275 -0
  1044. vllm/model_executor/model_loader/runai_streamer_loader.py +116 -0
  1045. vllm/model_executor/model_loader/sharded_state_loader.py +214 -0
  1046. vllm/model_executor/model_loader/tensorizer.py +790 -0
  1047. vllm/model_executor/model_loader/tensorizer_loader.py +151 -0
  1048. vllm/model_executor/model_loader/tpu.py +118 -0
  1049. vllm/model_executor/model_loader/utils.py +292 -0
  1050. vllm/model_executor/model_loader/weight_utils.py +1157 -0
  1051. vllm/model_executor/models/__init__.py +44 -0
  1052. vllm/model_executor/models/adapters.py +522 -0
  1053. vllm/model_executor/models/afmoe.py +696 -0
  1054. vllm/model_executor/models/aimv2.py +248 -0
  1055. vllm/model_executor/models/apertus.py +565 -0
  1056. vllm/model_executor/models/arcee.py +428 -0
  1057. vllm/model_executor/models/arctic.py +633 -0
  1058. vllm/model_executor/models/aria.py +653 -0
  1059. vllm/model_executor/models/audioflamingo3.py +639 -0
  1060. vllm/model_executor/models/aya_vision.py +448 -0
  1061. vllm/model_executor/models/bagel.py +584 -0
  1062. vllm/model_executor/models/baichuan.py +493 -0
  1063. vllm/model_executor/models/bailing_moe.py +642 -0
  1064. vllm/model_executor/models/bamba.py +511 -0
  1065. vllm/model_executor/models/bee.py +157 -0
  1066. vllm/model_executor/models/bert.py +925 -0
  1067. vllm/model_executor/models/bert_with_rope.py +732 -0
  1068. vllm/model_executor/models/blip.py +350 -0
  1069. vllm/model_executor/models/blip2.py +693 -0
  1070. vllm/model_executor/models/bloom.py +390 -0
  1071. vllm/model_executor/models/chameleon.py +1095 -0
  1072. vllm/model_executor/models/chatglm.py +502 -0
  1073. vllm/model_executor/models/clip.py +1004 -0
  1074. vllm/model_executor/models/cohere2_vision.py +470 -0
  1075. vllm/model_executor/models/commandr.py +469 -0
  1076. vllm/model_executor/models/config.py +531 -0
  1077. vllm/model_executor/models/dbrx.py +484 -0
  1078. vllm/model_executor/models/deepencoder.py +676 -0
  1079. vllm/model_executor/models/deepseek_eagle.py +252 -0
  1080. vllm/model_executor/models/deepseek_mtp.py +446 -0
  1081. vllm/model_executor/models/deepseek_ocr.py +591 -0
  1082. vllm/model_executor/models/deepseek_v2.py +1710 -0
  1083. vllm/model_executor/models/deepseek_vl2.py +642 -0
  1084. vllm/model_executor/models/dots1.py +565 -0
  1085. vllm/model_executor/models/dots_ocr.py +821 -0
  1086. vllm/model_executor/models/ernie45.py +53 -0
  1087. vllm/model_executor/models/ernie45_moe.py +754 -0
  1088. vllm/model_executor/models/ernie45_vl.py +1621 -0
  1089. vllm/model_executor/models/ernie45_vl_moe.py +800 -0
  1090. vllm/model_executor/models/ernie_mtp.py +279 -0
  1091. vllm/model_executor/models/exaone.py +524 -0
  1092. vllm/model_executor/models/exaone4.py +516 -0
  1093. vllm/model_executor/models/fairseq2_llama.py +154 -0
  1094. vllm/model_executor/models/falcon.py +543 -0
  1095. vllm/model_executor/models/falcon_h1.py +675 -0
  1096. vllm/model_executor/models/flex_olmo.py +155 -0
  1097. vllm/model_executor/models/fuyu.py +371 -0
  1098. vllm/model_executor/models/gemma.py +425 -0
  1099. vllm/model_executor/models/gemma2.py +435 -0
  1100. vllm/model_executor/models/gemma3.py +507 -0
  1101. vllm/model_executor/models/gemma3_mm.py +664 -0
  1102. vllm/model_executor/models/gemma3n.py +1166 -0
  1103. vllm/model_executor/models/gemma3n_mm.py +810 -0
  1104. vllm/model_executor/models/glm.py +24 -0
  1105. vllm/model_executor/models/glm4.py +295 -0
  1106. vllm/model_executor/models/glm4_1v.py +1808 -0
  1107. vllm/model_executor/models/glm4_moe.py +736 -0
  1108. vllm/model_executor/models/glm4_moe_mtp.py +359 -0
  1109. vllm/model_executor/models/glm4v.py +783 -0
  1110. vllm/model_executor/models/gpt2.py +397 -0
  1111. vllm/model_executor/models/gpt_bigcode.py +339 -0
  1112. vllm/model_executor/models/gpt_j.py +346 -0
  1113. vllm/model_executor/models/gpt_neox.py +340 -0
  1114. vllm/model_executor/models/gpt_oss.py +744 -0
  1115. vllm/model_executor/models/granite.py +475 -0
  1116. vllm/model_executor/models/granite_speech.py +912 -0
  1117. vllm/model_executor/models/granitemoe.py +560 -0
  1118. vllm/model_executor/models/granitemoehybrid.py +703 -0
  1119. vllm/model_executor/models/granitemoeshared.py +328 -0
  1120. vllm/model_executor/models/gritlm.py +243 -0
  1121. vllm/model_executor/models/grok1.py +554 -0
  1122. vllm/model_executor/models/h2ovl.py +554 -0
  1123. vllm/model_executor/models/hunyuan_v1.py +1040 -0
  1124. vllm/model_executor/models/hunyuan_vision.py +1034 -0
  1125. vllm/model_executor/models/hyperclovax_vision.py +1164 -0
  1126. vllm/model_executor/models/idefics2_vision_model.py +427 -0
  1127. vllm/model_executor/models/idefics3.py +716 -0
  1128. vllm/model_executor/models/interfaces.py +1179 -0
  1129. vllm/model_executor/models/interfaces_base.py +228 -0
  1130. vllm/model_executor/models/intern_vit.py +454 -0
  1131. vllm/model_executor/models/internlm2.py +453 -0
  1132. vllm/model_executor/models/internlm2_ve.py +139 -0
  1133. vllm/model_executor/models/interns1.py +828 -0
  1134. vllm/model_executor/models/interns1_vit.py +433 -0
  1135. vllm/model_executor/models/internvl.py +1450 -0
  1136. vllm/model_executor/models/jais.py +397 -0
  1137. vllm/model_executor/models/jais2.py +529 -0
  1138. vllm/model_executor/models/jamba.py +609 -0
  1139. vllm/model_executor/models/jina_vl.py +147 -0
  1140. vllm/model_executor/models/keye.py +1706 -0
  1141. vllm/model_executor/models/keye_vl1_5.py +726 -0
  1142. vllm/model_executor/models/kimi_linear.py +658 -0
  1143. vllm/model_executor/models/kimi_vl.py +576 -0
  1144. vllm/model_executor/models/lfm2.py +515 -0
  1145. vllm/model_executor/models/lfm2_moe.py +745 -0
  1146. vllm/model_executor/models/lightonocr.py +195 -0
  1147. vllm/model_executor/models/llama.py +700 -0
  1148. vllm/model_executor/models/llama4.py +856 -0
  1149. vllm/model_executor/models/llama4_eagle.py +225 -0
  1150. vllm/model_executor/models/llama_eagle.py +213 -0
  1151. vllm/model_executor/models/llama_eagle3.py +375 -0
  1152. vllm/model_executor/models/llava.py +840 -0
  1153. vllm/model_executor/models/llava_next.py +581 -0
  1154. vllm/model_executor/models/llava_next_video.py +465 -0
  1155. vllm/model_executor/models/llava_onevision.py +921 -0
  1156. vllm/model_executor/models/longcat_flash.py +743 -0
  1157. vllm/model_executor/models/longcat_flash_mtp.py +349 -0
  1158. vllm/model_executor/models/mamba.py +276 -0
  1159. vllm/model_executor/models/mamba2.py +288 -0
  1160. vllm/model_executor/models/medusa.py +179 -0
  1161. vllm/model_executor/models/midashenglm.py +826 -0
  1162. vllm/model_executor/models/mimo.py +188 -0
  1163. vllm/model_executor/models/mimo_mtp.py +294 -0
  1164. vllm/model_executor/models/minicpm.py +656 -0
  1165. vllm/model_executor/models/minicpm3.py +233 -0
  1166. vllm/model_executor/models/minicpm_eagle.py +385 -0
  1167. vllm/model_executor/models/minicpmo.py +768 -0
  1168. vllm/model_executor/models/minicpmv.py +1742 -0
  1169. vllm/model_executor/models/minimax_m2.py +550 -0
  1170. vllm/model_executor/models/minimax_text_01.py +1007 -0
  1171. vllm/model_executor/models/minimax_vl_01.py +394 -0
  1172. vllm/model_executor/models/mistral3.py +635 -0
  1173. vllm/model_executor/models/mistral_large_3.py +63 -0
  1174. vllm/model_executor/models/mistral_large_3_eagle.py +136 -0
  1175. vllm/model_executor/models/mixtral.py +598 -0
  1176. vllm/model_executor/models/mllama4.py +1149 -0
  1177. vllm/model_executor/models/mlp_speculator.py +235 -0
  1178. vllm/model_executor/models/modernbert.py +451 -0
  1179. vllm/model_executor/models/module_mapping.py +74 -0
  1180. vllm/model_executor/models/molmo.py +1550 -0
  1181. vllm/model_executor/models/moonvit.py +686 -0
  1182. vllm/model_executor/models/mpt.py +335 -0
  1183. vllm/model_executor/models/nano_nemotron_vl.py +1730 -0
  1184. vllm/model_executor/models/nemotron.py +499 -0
  1185. vllm/model_executor/models/nemotron_h.py +900 -0
  1186. vllm/model_executor/models/nemotron_nas.py +471 -0
  1187. vllm/model_executor/models/nemotron_vl.py +651 -0
  1188. vllm/model_executor/models/nvlm_d.py +216 -0
  1189. vllm/model_executor/models/olmo.py +412 -0
  1190. vllm/model_executor/models/olmo2.py +454 -0
  1191. vllm/model_executor/models/olmoe.py +493 -0
  1192. vllm/model_executor/models/opencua.py +262 -0
  1193. vllm/model_executor/models/openpangu.py +1049 -0
  1194. vllm/model_executor/models/openpangu_mtp.py +265 -0
  1195. vllm/model_executor/models/opt.py +426 -0
  1196. vllm/model_executor/models/orion.py +365 -0
  1197. vllm/model_executor/models/ouro.py +507 -0
  1198. vllm/model_executor/models/ovis.py +557 -0
  1199. vllm/model_executor/models/ovis2_5.py +661 -0
  1200. vllm/model_executor/models/paddleocr_vl.py +1300 -0
  1201. vllm/model_executor/models/paligemma.py +408 -0
  1202. vllm/model_executor/models/persimmon.py +373 -0
  1203. vllm/model_executor/models/phi.py +363 -0
  1204. vllm/model_executor/models/phi3.py +18 -0
  1205. vllm/model_executor/models/phi3v.py +729 -0
  1206. vllm/model_executor/models/phi4mm.py +1251 -0
  1207. vllm/model_executor/models/phi4mm_audio.py +1296 -0
  1208. vllm/model_executor/models/phi4mm_utils.py +1907 -0
  1209. vllm/model_executor/models/phimoe.py +669 -0
  1210. vllm/model_executor/models/pixtral.py +1379 -0
  1211. vllm/model_executor/models/plamo2.py +965 -0
  1212. vllm/model_executor/models/plamo3.py +440 -0
  1213. vllm/model_executor/models/qwen.py +365 -0
  1214. vllm/model_executor/models/qwen2.py +600 -0
  1215. vllm/model_executor/models/qwen2_5_omni_thinker.py +1219 -0
  1216. vllm/model_executor/models/qwen2_5_vl.py +1569 -0
  1217. vllm/model_executor/models/qwen2_audio.py +471 -0
  1218. vllm/model_executor/models/qwen2_moe.py +597 -0
  1219. vllm/model_executor/models/qwen2_rm.py +123 -0
  1220. vllm/model_executor/models/qwen2_vl.py +1568 -0
  1221. vllm/model_executor/models/qwen3.py +331 -0
  1222. vllm/model_executor/models/qwen3_moe.py +751 -0
  1223. vllm/model_executor/models/qwen3_next.py +1395 -0
  1224. vllm/model_executor/models/qwen3_next_mtp.py +296 -0
  1225. vllm/model_executor/models/qwen3_omni_moe_thinker.py +1793 -0
  1226. vllm/model_executor/models/qwen3_vl.py +2092 -0
  1227. vllm/model_executor/models/qwen3_vl_moe.py +474 -0
  1228. vllm/model_executor/models/qwen_vl.py +801 -0
  1229. vllm/model_executor/models/radio.py +555 -0
  1230. vllm/model_executor/models/registry.py +1189 -0
  1231. vllm/model_executor/models/roberta.py +259 -0
  1232. vllm/model_executor/models/rvl.py +107 -0
  1233. vllm/model_executor/models/seed_oss.py +492 -0
  1234. vllm/model_executor/models/siglip.py +1244 -0
  1235. vllm/model_executor/models/siglip2navit.py +658 -0
  1236. vllm/model_executor/models/skyworkr1v.py +951 -0
  1237. vllm/model_executor/models/smolvlm.py +38 -0
  1238. vllm/model_executor/models/solar.py +484 -0
  1239. vllm/model_executor/models/stablelm.py +354 -0
  1240. vllm/model_executor/models/starcoder2.py +365 -0
  1241. vllm/model_executor/models/step3_text.py +554 -0
  1242. vllm/model_executor/models/step3_vl.py +1147 -0
  1243. vllm/model_executor/models/swin.py +514 -0
  1244. vllm/model_executor/models/tarsier.py +617 -0
  1245. vllm/model_executor/models/telechat2.py +153 -0
  1246. vllm/model_executor/models/teleflm.py +78 -0
  1247. vllm/model_executor/models/terratorch.py +318 -0
  1248. vllm/model_executor/models/transformers/__init__.py +127 -0
  1249. vllm/model_executor/models/transformers/base.py +518 -0
  1250. vllm/model_executor/models/transformers/causal.py +65 -0
  1251. vllm/model_executor/models/transformers/legacy.py +90 -0
  1252. vllm/model_executor/models/transformers/moe.py +325 -0
  1253. vllm/model_executor/models/transformers/multimodal.py +411 -0
  1254. vllm/model_executor/models/transformers/pooling.py +119 -0
  1255. vllm/model_executor/models/transformers/utils.py +213 -0
  1256. vllm/model_executor/models/ultravox.py +766 -0
  1257. vllm/model_executor/models/utils.py +832 -0
  1258. vllm/model_executor/models/vision.py +546 -0
  1259. vllm/model_executor/models/voxtral.py +841 -0
  1260. vllm/model_executor/models/whisper.py +971 -0
  1261. vllm/model_executor/models/zamba2.py +979 -0
  1262. vllm/model_executor/parameter.py +642 -0
  1263. vllm/model_executor/utils.py +119 -0
  1264. vllm/model_executor/warmup/__init__.py +0 -0
  1265. vllm/model_executor/warmup/deep_gemm_warmup.py +314 -0
  1266. vllm/model_executor/warmup/kernel_warmup.py +98 -0
  1267. vllm/multimodal/__init__.py +40 -0
  1268. vllm/multimodal/audio.py +147 -0
  1269. vllm/multimodal/base.py +56 -0
  1270. vllm/multimodal/cache.py +823 -0
  1271. vllm/multimodal/evs.py +294 -0
  1272. vllm/multimodal/hasher.py +120 -0
  1273. vllm/multimodal/image.py +142 -0
  1274. vllm/multimodal/inputs.py +1089 -0
  1275. vllm/multimodal/parse.py +565 -0
  1276. vllm/multimodal/processing.py +2240 -0
  1277. vllm/multimodal/profiling.py +351 -0
  1278. vllm/multimodal/registry.py +357 -0
  1279. vllm/multimodal/utils.py +513 -0
  1280. vllm/multimodal/video.py +340 -0
  1281. vllm/outputs.py +345 -0
  1282. vllm/platforms/__init__.py +277 -0
  1283. vllm/platforms/cpu.py +421 -0
  1284. vllm/platforms/cuda.py +618 -0
  1285. vllm/platforms/interface.py +695 -0
  1286. vllm/platforms/rocm.py +564 -0
  1287. vllm/platforms/tpu.py +295 -0
  1288. vllm/platforms/xpu.py +277 -0
  1289. vllm/plugins/__init__.py +81 -0
  1290. vllm/plugins/io_processors/__init__.py +68 -0
  1291. vllm/plugins/io_processors/interface.py +77 -0
  1292. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1293. vllm/plugins/lora_resolvers/filesystem_resolver.py +52 -0
  1294. vllm/pooling_params.py +230 -0
  1295. vllm/profiler/__init__.py +0 -0
  1296. vllm/profiler/layerwise_profile.py +392 -0
  1297. vllm/profiler/utils.py +151 -0
  1298. vllm/profiler/wrapper.py +241 -0
  1299. vllm/py.typed +2 -0
  1300. vllm/ray/__init__.py +0 -0
  1301. vllm/ray/lazy_utils.py +30 -0
  1302. vllm/ray/ray_env.py +79 -0
  1303. vllm/reasoning/__init__.py +96 -0
  1304. vllm/reasoning/abs_reasoning_parsers.py +318 -0
  1305. vllm/reasoning/basic_parsers.py +175 -0
  1306. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1307. vllm/reasoning/deepseek_v3_reasoning_parser.py +67 -0
  1308. vllm/reasoning/ernie45_reasoning_parser.py +165 -0
  1309. vllm/reasoning/glm4_moe_reasoning_parser.py +171 -0
  1310. vllm/reasoning/gptoss_reasoning_parser.py +173 -0
  1311. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1312. vllm/reasoning/holo2_reasoning_parser.py +88 -0
  1313. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +237 -0
  1314. vllm/reasoning/identity_reasoning_parser.py +63 -0
  1315. vllm/reasoning/minimax_m2_reasoning_parser.py +110 -0
  1316. vllm/reasoning/mistral_reasoning_parser.py +154 -0
  1317. vllm/reasoning/olmo3_reasoning_parser.py +302 -0
  1318. vllm/reasoning/qwen3_reasoning_parser.py +67 -0
  1319. vllm/reasoning/seedoss_reasoning_parser.py +27 -0
  1320. vllm/reasoning/step3_reasoning_parser.py +107 -0
  1321. vllm/sampling_params.py +597 -0
  1322. vllm/scalar_type.py +355 -0
  1323. vllm/scripts.py +17 -0
  1324. vllm/sequence.py +98 -0
  1325. vllm/tasks.py +13 -0
  1326. vllm/third_party/__init__.py +0 -0
  1327. vllm/third_party/pynvml.py +6140 -0
  1328. vllm/tokenizers/__init__.py +20 -0
  1329. vllm/tokenizers/deepseek_v32.py +175 -0
  1330. vllm/tokenizers/deepseek_v32_encoding.py +459 -0
  1331. vllm/tokenizers/detokenizer_utils.py +198 -0
  1332. vllm/tokenizers/hf.py +119 -0
  1333. vllm/tokenizers/mistral.py +567 -0
  1334. vllm/tokenizers/protocol.py +114 -0
  1335. vllm/tokenizers/registry.py +233 -0
  1336. vllm/tool_parsers/__init__.py +150 -0
  1337. vllm/tool_parsers/abstract_tool_parser.py +273 -0
  1338. vllm/tool_parsers/deepseekv31_tool_parser.py +388 -0
  1339. vllm/tool_parsers/deepseekv32_tool_parser.py +591 -0
  1340. vllm/tool_parsers/deepseekv3_tool_parser.py +390 -0
  1341. vllm/tool_parsers/ernie45_tool_parser.py +210 -0
  1342. vllm/tool_parsers/gigachat3_tool_parser.py +190 -0
  1343. vllm/tool_parsers/glm4_moe_tool_parser.py +200 -0
  1344. vllm/tool_parsers/granite_20b_fc_tool_parser.py +273 -0
  1345. vllm/tool_parsers/granite_tool_parser.py +253 -0
  1346. vllm/tool_parsers/hermes_tool_parser.py +495 -0
  1347. vllm/tool_parsers/hunyuan_a13b_tool_parser.py +420 -0
  1348. vllm/tool_parsers/internlm2_tool_parser.py +227 -0
  1349. vllm/tool_parsers/jamba_tool_parser.py +323 -0
  1350. vllm/tool_parsers/kimi_k2_tool_parser.py +590 -0
  1351. vllm/tool_parsers/llama4_pythonic_tool_parser.py +341 -0
  1352. vllm/tool_parsers/llama_tool_parser.py +324 -0
  1353. vllm/tool_parsers/longcat_tool_parser.py +37 -0
  1354. vllm/tool_parsers/minimax_m2_tool_parser.py +643 -0
  1355. vllm/tool_parsers/minimax_tool_parser.py +849 -0
  1356. vllm/tool_parsers/mistral_tool_parser.py +585 -0
  1357. vllm/tool_parsers/olmo3_tool_parser.py +366 -0
  1358. vllm/tool_parsers/openai_tool_parser.py +102 -0
  1359. vllm/tool_parsers/phi4mini_tool_parser.py +120 -0
  1360. vllm/tool_parsers/pythonic_tool_parser.py +332 -0
  1361. vllm/tool_parsers/qwen3coder_tool_parser.py +781 -0
  1362. vllm/tool_parsers/qwen3xml_tool_parser.py +1316 -0
  1363. vllm/tool_parsers/seed_oss_tool_parser.py +744 -0
  1364. vllm/tool_parsers/step3_tool_parser.py +303 -0
  1365. vllm/tool_parsers/utils.py +229 -0
  1366. vllm/tool_parsers/xlam_tool_parser.py +556 -0
  1367. vllm/tracing.py +135 -0
  1368. vllm/transformers_utils/__init__.py +26 -0
  1369. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1370. vllm/transformers_utils/chat_templates/registry.py +73 -0
  1371. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1372. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1373. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1374. vllm/transformers_utils/chat_templates/template_deepseek_ocr.jinja +14 -0
  1375. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1376. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1377. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1378. vllm/transformers_utils/config.py +1144 -0
  1379. vllm/transformers_utils/config_parser_base.py +20 -0
  1380. vllm/transformers_utils/configs/__init__.py +102 -0
  1381. vllm/transformers_utils/configs/afmoe.py +87 -0
  1382. vllm/transformers_utils/configs/arctic.py +216 -0
  1383. vllm/transformers_utils/configs/bagel.py +53 -0
  1384. vllm/transformers_utils/configs/chatglm.py +75 -0
  1385. vllm/transformers_utils/configs/deepseek_vl2.py +126 -0
  1386. vllm/transformers_utils/configs/dotsocr.py +71 -0
  1387. vllm/transformers_utils/configs/eagle.py +90 -0
  1388. vllm/transformers_utils/configs/falcon.py +89 -0
  1389. vllm/transformers_utils/configs/flex_olmo.py +82 -0
  1390. vllm/transformers_utils/configs/hunyuan_vl.py +322 -0
  1391. vllm/transformers_utils/configs/jais.py +243 -0
  1392. vllm/transformers_utils/configs/kimi_linear.py +148 -0
  1393. vllm/transformers_utils/configs/kimi_vl.py +38 -0
  1394. vllm/transformers_utils/configs/lfm2_moe.py +163 -0
  1395. vllm/transformers_utils/configs/medusa.py +65 -0
  1396. vllm/transformers_utils/configs/midashenglm.py +103 -0
  1397. vllm/transformers_utils/configs/mistral.py +235 -0
  1398. vllm/transformers_utils/configs/mlp_speculator.py +69 -0
  1399. vllm/transformers_utils/configs/moonvit.py +33 -0
  1400. vllm/transformers_utils/configs/nemotron.py +220 -0
  1401. vllm/transformers_utils/configs/nemotron_h.py +284 -0
  1402. vllm/transformers_utils/configs/olmo3.py +83 -0
  1403. vllm/transformers_utils/configs/ovis.py +182 -0
  1404. vllm/transformers_utils/configs/qwen3_next.py +277 -0
  1405. vllm/transformers_utils/configs/radio.py +89 -0
  1406. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1407. vllm/transformers_utils/configs/speculators/algos.py +38 -0
  1408. vllm/transformers_utils/configs/speculators/base.py +114 -0
  1409. vllm/transformers_utils/configs/step3_vl.py +178 -0
  1410. vllm/transformers_utils/configs/tarsier2.py +24 -0
  1411. vllm/transformers_utils/configs/ultravox.py +120 -0
  1412. vllm/transformers_utils/dynamic_module.py +59 -0
  1413. vllm/transformers_utils/gguf_utils.py +280 -0
  1414. vllm/transformers_utils/processor.py +424 -0
  1415. vllm/transformers_utils/processors/__init__.py +25 -0
  1416. vllm/transformers_utils/processors/bagel.py +73 -0
  1417. vllm/transformers_utils/processors/deepseek_ocr.py +438 -0
  1418. vllm/transformers_utils/processors/deepseek_vl2.py +406 -0
  1419. vllm/transformers_utils/processors/hunyuan_vl.py +233 -0
  1420. vllm/transformers_utils/processors/hunyuan_vl_image.py +477 -0
  1421. vllm/transformers_utils/processors/ovis.py +453 -0
  1422. vllm/transformers_utils/processors/ovis2_5.py +468 -0
  1423. vllm/transformers_utils/repo_utils.py +287 -0
  1424. vllm/transformers_utils/runai_utils.py +102 -0
  1425. vllm/transformers_utils/s3_utils.py +95 -0
  1426. vllm/transformers_utils/tokenizer.py +127 -0
  1427. vllm/transformers_utils/tokenizer_base.py +33 -0
  1428. vllm/transformers_utils/utils.py +112 -0
  1429. vllm/triton_utils/__init__.py +20 -0
  1430. vllm/triton_utils/importing.py +103 -0
  1431. vllm/usage/__init__.py +0 -0
  1432. vllm/usage/usage_lib.py +294 -0
  1433. vllm/utils/__init__.py +66 -0
  1434. vllm/utils/argparse_utils.py +492 -0
  1435. vllm/utils/async_utils.py +310 -0
  1436. vllm/utils/cache.py +214 -0
  1437. vllm/utils/collection_utils.py +112 -0
  1438. vllm/utils/counter.py +45 -0
  1439. vllm/utils/deep_gemm.py +400 -0
  1440. vllm/utils/flashinfer.py +528 -0
  1441. vllm/utils/func_utils.py +236 -0
  1442. vllm/utils/gc_utils.py +151 -0
  1443. vllm/utils/hashing.py +117 -0
  1444. vllm/utils/import_utils.py +449 -0
  1445. vllm/utils/jsontree.py +158 -0
  1446. vllm/utils/math_utils.py +32 -0
  1447. vllm/utils/mem_constants.py +13 -0
  1448. vllm/utils/mem_utils.py +232 -0
  1449. vllm/utils/nccl.py +64 -0
  1450. vllm/utils/network_utils.py +331 -0
  1451. vllm/utils/nvtx_pytorch_hooks.py +286 -0
  1452. vllm/utils/platform_utils.py +59 -0
  1453. vllm/utils/profiling.py +56 -0
  1454. vllm/utils/registry.py +51 -0
  1455. vllm/utils/serial_utils.py +214 -0
  1456. vllm/utils/system_utils.py +269 -0
  1457. vllm/utils/tensor_schema.py +255 -0
  1458. vllm/utils/torch_utils.py +648 -0
  1459. vllm/v1/__init__.py +0 -0
  1460. vllm/v1/attention/__init__.py +0 -0
  1461. vllm/v1/attention/backends/__init__.py +0 -0
  1462. vllm/v1/attention/backends/cpu_attn.py +497 -0
  1463. vllm/v1/attention/backends/flash_attn.py +1051 -0
  1464. vllm/v1/attention/backends/flashinfer.py +1575 -0
  1465. vllm/v1/attention/backends/flex_attention.py +1028 -0
  1466. vllm/v1/attention/backends/gdn_attn.py +375 -0
  1467. vllm/v1/attention/backends/linear_attn.py +77 -0
  1468. vllm/v1/attention/backends/mamba1_attn.py +159 -0
  1469. vllm/v1/attention/backends/mamba2_attn.py +348 -0
  1470. vllm/v1/attention/backends/mamba_attn.py +117 -0
  1471. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1472. vllm/v1/attention/backends/mla/aiter_triton_mla.py +74 -0
  1473. vllm/v1/attention/backends/mla/common.py +2114 -0
  1474. vllm/v1/attention/backends/mla/cutlass_mla.py +278 -0
  1475. vllm/v1/attention/backends/mla/flashattn_mla.py +342 -0
  1476. vllm/v1/attention/backends/mla/flashinfer_mla.py +174 -0
  1477. vllm/v1/attention/backends/mla/flashmla.py +317 -0
  1478. vllm/v1/attention/backends/mla/flashmla_sparse.py +1020 -0
  1479. vllm/v1/attention/backends/mla/indexer.py +345 -0
  1480. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +275 -0
  1481. vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py +325 -0
  1482. vllm/v1/attention/backends/mla/triton_mla.py +171 -0
  1483. vllm/v1/attention/backends/pallas.py +436 -0
  1484. vllm/v1/attention/backends/rocm_aiter_fa.py +1000 -0
  1485. vllm/v1/attention/backends/rocm_aiter_unified_attn.py +206 -0
  1486. vllm/v1/attention/backends/rocm_attn.py +359 -0
  1487. vllm/v1/attention/backends/short_conv_attn.py +104 -0
  1488. vllm/v1/attention/backends/tree_attn.py +428 -0
  1489. vllm/v1/attention/backends/triton_attn.py +497 -0
  1490. vllm/v1/attention/backends/utils.py +1212 -0
  1491. vllm/v1/core/__init__.py +0 -0
  1492. vllm/v1/core/block_pool.py +485 -0
  1493. vllm/v1/core/encoder_cache_manager.py +402 -0
  1494. vllm/v1/core/kv_cache_coordinator.py +570 -0
  1495. vllm/v1/core/kv_cache_manager.py +419 -0
  1496. vllm/v1/core/kv_cache_metrics.py +96 -0
  1497. vllm/v1/core/kv_cache_utils.py +1476 -0
  1498. vllm/v1/core/sched/__init__.py +0 -0
  1499. vllm/v1/core/sched/async_scheduler.py +68 -0
  1500. vllm/v1/core/sched/interface.py +189 -0
  1501. vllm/v1/core/sched/output.py +230 -0
  1502. vllm/v1/core/sched/request_queue.py +217 -0
  1503. vllm/v1/core/sched/scheduler.py +1826 -0
  1504. vllm/v1/core/sched/utils.py +64 -0
  1505. vllm/v1/core/single_type_kv_cache_manager.py +801 -0
  1506. vllm/v1/cudagraph_dispatcher.py +183 -0
  1507. vllm/v1/engine/__init__.py +217 -0
  1508. vllm/v1/engine/async_llm.py +866 -0
  1509. vllm/v1/engine/coordinator.py +377 -0
  1510. vllm/v1/engine/core.py +1455 -0
  1511. vllm/v1/engine/core_client.py +1416 -0
  1512. vllm/v1/engine/detokenizer.py +351 -0
  1513. vllm/v1/engine/exceptions.py +18 -0
  1514. vllm/v1/engine/input_processor.py +643 -0
  1515. vllm/v1/engine/llm_engine.py +414 -0
  1516. vllm/v1/engine/logprobs.py +189 -0
  1517. vllm/v1/engine/output_processor.py +659 -0
  1518. vllm/v1/engine/parallel_sampling.py +145 -0
  1519. vllm/v1/engine/processor.py +20 -0
  1520. vllm/v1/engine/utils.py +1068 -0
  1521. vllm/v1/executor/__init__.py +6 -0
  1522. vllm/v1/executor/abstract.py +352 -0
  1523. vllm/v1/executor/multiproc_executor.py +890 -0
  1524. vllm/v1/executor/ray_distributed_executor.py +8 -0
  1525. vllm/v1/executor/ray_executor.py +626 -0
  1526. vllm/v1/executor/ray_utils.py +465 -0
  1527. vllm/v1/executor/uniproc_executor.py +186 -0
  1528. vllm/v1/kv_cache_interface.py +404 -0
  1529. vllm/v1/kv_offload/__init__.py +0 -0
  1530. vllm/v1/kv_offload/abstract.py +161 -0
  1531. vllm/v1/kv_offload/arc_manager.py +237 -0
  1532. vllm/v1/kv_offload/backend.py +97 -0
  1533. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1534. vllm/v1/kv_offload/backends/cpu.py +62 -0
  1535. vllm/v1/kv_offload/cpu.py +86 -0
  1536. vllm/v1/kv_offload/factory.py +56 -0
  1537. vllm/v1/kv_offload/lru_manager.py +139 -0
  1538. vllm/v1/kv_offload/mediums.py +39 -0
  1539. vllm/v1/kv_offload/spec.py +66 -0
  1540. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1541. vllm/v1/kv_offload/worker/cpu_gpu.py +280 -0
  1542. vllm/v1/kv_offload/worker/worker.py +144 -0
  1543. vllm/v1/metrics/__init__.py +0 -0
  1544. vllm/v1/metrics/loggers.py +1305 -0
  1545. vllm/v1/metrics/prometheus.py +82 -0
  1546. vllm/v1/metrics/ray_wrappers.py +194 -0
  1547. vllm/v1/metrics/reader.py +257 -0
  1548. vllm/v1/metrics/stats.py +437 -0
  1549. vllm/v1/outputs.py +245 -0
  1550. vllm/v1/pool/__init__.py +0 -0
  1551. vllm/v1/pool/metadata.py +126 -0
  1552. vllm/v1/request.py +282 -0
  1553. vllm/v1/sample/__init__.py +0 -0
  1554. vllm/v1/sample/logits_processor/__init__.py +352 -0
  1555. vllm/v1/sample/logits_processor/builtin.py +278 -0
  1556. vllm/v1/sample/logits_processor/interface.py +106 -0
  1557. vllm/v1/sample/logits_processor/state.py +165 -0
  1558. vllm/v1/sample/metadata.py +44 -0
  1559. vllm/v1/sample/ops/__init__.py +0 -0
  1560. vllm/v1/sample/ops/bad_words.py +52 -0
  1561. vllm/v1/sample/ops/logprobs.py +25 -0
  1562. vllm/v1/sample/ops/penalties.py +57 -0
  1563. vllm/v1/sample/ops/topk_topp_sampler.py +384 -0
  1564. vllm/v1/sample/rejection_sampler.py +805 -0
  1565. vllm/v1/sample/sampler.py +319 -0
  1566. vllm/v1/sample/tpu/__init__.py +0 -0
  1567. vllm/v1/sample/tpu/metadata.py +120 -0
  1568. vllm/v1/sample/tpu/sampler.py +215 -0
  1569. vllm/v1/serial_utils.py +514 -0
  1570. vllm/v1/spec_decode/__init__.py +0 -0
  1571. vllm/v1/spec_decode/eagle.py +1331 -0
  1572. vllm/v1/spec_decode/medusa.py +73 -0
  1573. vllm/v1/spec_decode/metadata.py +66 -0
  1574. vllm/v1/spec_decode/metrics.py +225 -0
  1575. vllm/v1/spec_decode/ngram_proposer.py +291 -0
  1576. vllm/v1/spec_decode/suffix_decoding.py +101 -0
  1577. vllm/v1/spec_decode/utils.py +121 -0
  1578. vllm/v1/structured_output/__init__.py +353 -0
  1579. vllm/v1/structured_output/backend_guidance.py +265 -0
  1580. vllm/v1/structured_output/backend_lm_format_enforcer.py +177 -0
  1581. vllm/v1/structured_output/backend_outlines.py +324 -0
  1582. vllm/v1/structured_output/backend_types.py +136 -0
  1583. vllm/v1/structured_output/backend_xgrammar.py +378 -0
  1584. vllm/v1/structured_output/request.py +94 -0
  1585. vllm/v1/structured_output/utils.py +469 -0
  1586. vllm/v1/utils.py +414 -0
  1587. vllm/v1/worker/__init__.py +0 -0
  1588. vllm/v1/worker/block_table.py +343 -0
  1589. vllm/v1/worker/cp_utils.py +42 -0
  1590. vllm/v1/worker/cpu_model_runner.py +122 -0
  1591. vllm/v1/worker/cpu_worker.py +192 -0
  1592. vllm/v1/worker/dp_utils.py +240 -0
  1593. vllm/v1/worker/ec_connector_model_runner_mixin.py +87 -0
  1594. vllm/v1/worker/gpu/README.md +4 -0
  1595. vllm/v1/worker/gpu/__init__.py +0 -0
  1596. vllm/v1/worker/gpu/async_utils.py +98 -0
  1597. vllm/v1/worker/gpu/attn_utils.py +189 -0
  1598. vllm/v1/worker/gpu/block_table.py +314 -0
  1599. vllm/v1/worker/gpu/cudagraph_utils.py +259 -0
  1600. vllm/v1/worker/gpu/dp_utils.py +31 -0
  1601. vllm/v1/worker/gpu/input_batch.py +479 -0
  1602. vllm/v1/worker/gpu/metrics/__init__.py +0 -0
  1603. vllm/v1/worker/gpu/metrics/logits.py +42 -0
  1604. vllm/v1/worker/gpu/model_runner.py +1006 -0
  1605. vllm/v1/worker/gpu/sample/__init__.py +0 -0
  1606. vllm/v1/worker/gpu/sample/gumbel.py +101 -0
  1607. vllm/v1/worker/gpu/sample/logprob.py +167 -0
  1608. vllm/v1/worker/gpu/sample/metadata.py +192 -0
  1609. vllm/v1/worker/gpu/sample/min_p.py +51 -0
  1610. vllm/v1/worker/gpu/sample/output.py +14 -0
  1611. vllm/v1/worker/gpu/sample/penalties.py +155 -0
  1612. vllm/v1/worker/gpu/sample/sampler.py +87 -0
  1613. vllm/v1/worker/gpu/spec_decode/__init__.py +18 -0
  1614. vllm/v1/worker/gpu/spec_decode/eagle.py +565 -0
  1615. vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py +115 -0
  1616. vllm/v1/worker/gpu/spec_decode/rejection_sample.py +71 -0
  1617. vllm/v1/worker/gpu/states.py +316 -0
  1618. vllm/v1/worker/gpu/structured_outputs.py +76 -0
  1619. vllm/v1/worker/gpu_input_batch.py +990 -0
  1620. vllm/v1/worker/gpu_model_runner.py +5470 -0
  1621. vllm/v1/worker/gpu_ubatch_wrapper.py +472 -0
  1622. vllm/v1/worker/gpu_worker.py +955 -0
  1623. vllm/v1/worker/kv_connector_model_runner_mixin.py +302 -0
  1624. vllm/v1/worker/lora_model_runner_mixin.py +212 -0
  1625. vllm/v1/worker/tpu_input_batch.py +583 -0
  1626. vllm/v1/worker/tpu_model_runner.py +2191 -0
  1627. vllm/v1/worker/tpu_worker.py +352 -0
  1628. vllm/v1/worker/ubatch_utils.py +109 -0
  1629. vllm/v1/worker/ubatching.py +231 -0
  1630. vllm/v1/worker/utils.py +375 -0
  1631. vllm/v1/worker/worker_base.py +377 -0
  1632. vllm/v1/worker/workspace.py +253 -0
  1633. vllm/v1/worker/xpu_model_runner.py +48 -0
  1634. vllm/v1/worker/xpu_worker.py +174 -0
  1635. vllm/version.py +39 -0
  1636. vllm/vllm_flash_attn/.gitkeep +0 -0
  1637. vllm_cpu_avx512vnni-0.13.0.dist-info/METADATA +339 -0
  1638. vllm_cpu_avx512vnni-0.13.0.dist-info/RECORD +1641 -0
  1639. vllm_cpu_avx512vnni-0.13.0.dist-info/WHEEL +5 -0
  1640. vllm_cpu_avx512vnni-0.13.0.dist-info/entry_points.txt +5 -0
  1641. vllm_cpu_avx512vnni-0.13.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,3228 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+ """
4
+ This module defines a framework for sampling benchmark requests from various
5
+ datasets. Each dataset subclass of BenchmarkDataset must implement sample
6
+ generation. Supported dataset types include:
7
+ - ShareGPT
8
+ - Random (synthetic)
9
+ - Sonnet
10
+ - BurstGPT
11
+ - HuggingFace
12
+ - VisionArena
13
+ """
14
+
15
+ import argparse
16
+ import ast
17
+ import base64
18
+ import io
19
+ import json
20
+ import logging
21
+ import math
22
+ import random
23
+ from abc import ABC, abstractmethod
24
+ from collections.abc import Callable, Iterator, Mapping
25
+ from contextlib import suppress
26
+ from copy import deepcopy
27
+ from dataclasses import dataclass
28
+ from functools import cache
29
+ from io import BytesIO
30
+ from tempfile import NamedTemporaryFile
31
+ from typing import Any, cast
32
+
33
+ import numpy as np
34
+ from PIL import Image
35
+ from typing_extensions import deprecated
36
+
37
+ from vllm.lora.request import LoRARequest
38
+ from vllm.lora.utils import get_adapter_absolute_path
39
+ from vllm.multimodal import MultiModalDataDict
40
+ from vllm.multimodal.image import convert_image_mode
41
+ from vllm.tokenizers import TokenizerLike
42
+ from vllm.utils.import_utils import PlaceholderModule
43
+
44
+ try:
45
+ from datasets import load_dataset
46
+ except ImportError:
47
+ datasets = PlaceholderModule("datasets")
48
+ load_dataset = datasets.placeholder_attr("load_dataset")
49
+
50
+ try:
51
+ import pandas as pd
52
+ except ImportError:
53
+ pd = PlaceholderModule("pandas")
54
+
55
+ try:
56
+ import librosa
57
+ except ImportError:
58
+ librosa = PlaceholderModule("librosa")
59
+
60
+ try:
61
+ from vllm.utils.argparse_utils import FlexibleArgumentParser
62
+ except ImportError:
63
+ from argparse import ArgumentParser as FlexibleArgumentParser
64
+
65
+ logger = logging.getLogger(__name__)
66
+
67
+ # -----------------------------------------------------------------------------
68
+ # Data Classes
69
+ # -----------------------------------------------------------------------------
70
+
71
+
72
+ @dataclass
73
+ class SampleRequest:
74
+ """
75
+ Represents a single inference request for benchmarking.
76
+ """
77
+
78
+ prompt: str | list[str]
79
+ prompt_len: int
80
+ expected_output_len: int
81
+ multi_modal_data: MultiModalDataDict | dict | list[dict] | None = None
82
+ lora_request: LoRARequest | None = None
83
+ request_id: str | None = None
84
+
85
+
86
+ # -----------------------------------------------------------------------------
87
+ # Benchmark Dataset Base Class
88
+ # -----------------------------------------------------------------------------
89
+
90
+
91
class BenchmarkDataset(ABC):
    """
    Abstract base class for datasets that produce benchmark requests.

    Subclasses implement `sample` (and usually `load_data`); this class stores
    the common configuration and offers helpers for chat formatting, random
    LoRA selection and oversampling.
    """

    DEFAULT_SEED = 0
    IS_MULTIMODAL = False

    def __init__(
        self,
        dataset_path: str | None = None,
        random_seed: int = DEFAULT_SEED,
        disable_shuffle: bool = False,
        **kwargs,
    ) -> None:
        """
        Store the dataset location and the sampling configuration.

        Args:
            dataset_path (Optional[str]): Path to the dataset. If None, it
                indicates that a default or random dataset might be used.
            random_seed (int): Seed value for reproducible shuffling or
                sampling. An explicit None is replaced by DEFAULT_SEED.
            disable_shuffle (bool): Stored on the instance as
                `self.disable_shuffle`; not otherwise used by this base class.
            **kwargs: Accepted and ignored by this base class.
        """
        self.dataset_path = dataset_path
        # Callers may pass None explicitly; fall back to the class default.
        if random_seed is None:
            random_seed = self.DEFAULT_SEED
        self.random_seed = random_seed
        self.disable_shuffle = disable_shuffle
        self.data = None

    def apply_multimodal_chat_transformation(
        self,
        prompt: str,
        mm_content: MultiModalDataDict | dict | list[dict] | None = None,
    ) -> list[dict]:
        """
        Wrap a prompt and optional multimodal content in a chat message.

        The result is a single-element conversation: one "user" message whose
        content starts with the text part, followed by the multimodal parts
        (a list is appended element-wise, a dict as a single part).

        Raises:
            TypeError: If `mm_content` is neither None, a list nor a dict.
        """
        user_content = [{"text": prompt, "type": "text"}]
        if isinstance(mm_content, list):
            user_content.extend(cast(list[dict[str, Any]], mm_content))
        elif isinstance(mm_content, dict):
            user_content.append(mm_content)
        elif mm_content is not None:
            raise TypeError(
                f"Could not process multimodal content of type: {type(mm_content)}"
            )
        return [{"role": "user", "content": user_content}]

    def load_data(self) -> None:
        """
        Load data from the dataset path into self.data.

        The base implementation always raises; subclasses override it because
        loading depends on the dataset format and source.

        Raises:
            NotImplementedError: If a subclass does not implement this method.
        """
        # TODO (jenniferzhao): add support for downloading data
        raise NotImplementedError("load_data must be implemented in subclasses.")

    def get_random_lora_request(
        self,
        max_loras: int | None = None,
        lora_path: str | None = None,
    ) -> LoRARequest | None:
        """
        Build a LoRA request with a randomly chosen adapter id, if configured.

        Args:
            max_loras (Optional[int]): The maximum number of LoRAs available.
                If `None`, LoRA is not used.
            lora_path (Optional[str]): Path to the LoRA parameters on disk.
                If `None`, LoRA is not used.

        Returns:
            A new [`LoRARequest`][vllm.lora.request.LoRARequest]
            (or `None` if not applicable).
        """
        lora_enabled = max_loras is not None and lora_path is not None
        if not lora_enabled:
            return None

        # The id is drawn from the global `random` module, inclusive range
        # [1, max_loras]; it doubles as the adapter name.
        lora_id = random.randint(1, max_loras)
        return LoRARequest(
            lora_name=str(lora_id),
            lora_int_id=lora_id,
            lora_path=lora_path_on_disk(lora_path),
        )

    @abstractmethod
    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        request_id_prefix: str = "",
        no_oversample: bool = False,
    ) -> list[SampleRequest]:
        """
        Generate sample requests from the dataset (abstract).

        Subclasses must override this method with dataset-specific logic that
        produces a list of SampleRequest objects.

        Args:
            tokenizer (TokenizerLike): The tokenizer to be used
                for processing the dataset's text.
            num_requests (int): The number of sample requests to generate.
            request_id_prefix (str): The prefix of request_id.
            no_oversample (bool): Flag that `maybe_oversample_requests`
                interprets as "do not pad the result by oversampling".

        Returns:
            list[SampleRequest]: A list of sample requests generated from the
            dataset.
        """
        raise NotImplementedError("sample must be implemented in subclasses.")

    def maybe_oversample_requests(
        self,
        requests: list[SampleRequest],
        num_requests: int,
        request_id_prefix: str = "",
        no_oversample: bool = False,
    ) -> None:
        """
        Pad `requests` in place, by random duplication, up to `num_requests`.

        Args:
            requests (List[SampleRequest]): The current list of sampled
                requests; extended in place.
            num_requests (int): The target number of requests.
            request_id_prefix (str): The prefix applied to generated request
                identifiers.
            no_oversample (bool): If True, only log the current size and
                return without padding (the uniqueness check is skipped too).

        Raises:
            ValueError: If the resulting request ids are not unique.
        """
        if no_oversample:
            logger.info("Skipping oversampling. Total samples: %d.", len(requests))
            return

        existing = len(requests)
        if existing < num_requests:
            # Reseeds the *global* `random` module so the duplicates chosen
            # are reproducible for a given seed.
            random.seed(self.random_seed)
            clones = []
            # New ids continue the numbering right after the existing ones.
            for new_index in range(existing, num_requests):
                clone = deepcopy(random.choice(requests))
                clone.request_id = request_id_prefix + str(new_index)
                clones.append(clone)
            requests.extend(clones)
            logger.info("Oversampled requests to reach %d total samples.", num_requests)

        ids = [req.request_id for req in requests]
        if len(set(ids)) != len(ids):
            raise ValueError(
                "Duplicate request_id found in the sampled "
                "requests. Please ensure that each request_id "
                "is unique."
            )
255
+
256
+
257
+ # -----------------------------------------------------------------------------
258
+ # Utility Functions and Global Caches
259
+ # -----------------------------------------------------------------------------
260
+
261
+
262
def is_valid_sequence(
    prompt_len: int,
    output_len: int,
    min_len: int = 4,
    max_prompt_len: int = 1024,
    max_total_len: int = 2048,
    skip_min_output_len_check: bool = False,
) -> bool:
    """
    Tell whether a (prompt, output) length pair passes the pruning criteria.

    A pair is rejected when the prompt is shorter than `min_len` or longer
    than `max_prompt_len`, when the output is shorter than `min_len` (unless
    `skip_min_output_len_check` is set), or when prompt plus output exceeds
    `max_total_len`.

    Default pruning criteria are copied from the original `sample_hf_requests`
    and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
    from `sample_requests` in benchmark_throughput.py.
    """
    # Prompt must lie within [min_len, max_prompt_len].
    if prompt_len < min_len or prompt_len > max_prompt_len:
        return False

    # The minimum output length is only enforced when not explicitly skipped.
    if not skip_min_output_len_check and output_len < min_len:
        return False

    # Finally, the combined length must fit the total budget.
    return not ((prompt_len + output_len) > max_total_len)
287
+
288
+
289
@cache
def lora_path_on_disk(lora_path: str) -> str:
    """
    Resolve `lora_path` through `get_adapter_absolute_path`.

    The result is memoized per distinct `lora_path` by `functools.cache`, so
    the resolution runs at most once for each path.
    """
    resolved_path = get_adapter_absolute_path(lora_path)
    return resolved_path
292
+
293
+
294
+ # Global cache for LoRA tokenizers.
295
+ lora_tokenizer_cache: dict[int, TokenizerLike] = {}
296
+
297
+
298
def process_image(image: Any) -> Mapping[str, Any]:
    """
    Turn a single image input into an "image_url" content dictionary.

    Accepted inputs, checked in this order:

    1. Dictionary with a 'bytes' key: the raw bytes are opened as a
       PIL.Image.Image and then handled like case 2.

    2. PIL.Image.Image: converted to RGB, saved as JPEG in memory,
       base64-encoded and returned as a `data:image/jpeg;base64,...` URL.

    3. String: treated as a URL or local file path. "file://" is prepended
       unless the string already starts with "http://", "https://" or
       "file://".

    Raises:
        ValueError: If the input is not a supported type.
    """
    # Case 1 only normalizes the input; the PIL branch below does the work.
    if isinstance(image, dict) and "bytes" in image:
        image = Image.open(BytesIO(image["bytes"]))

    if isinstance(image, Image.Image):
        rgb_image = convert_image_mode(image, "RGB")
        with io.BytesIO() as buffer:
            rgb_image.save(buffer, format="JPEG")
            encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
        data_url = f"data:image/jpeg;base64,{encoded}"
        return {"type": "image_url", "image_url": {"url": data_url}}

    if isinstance(image, str):
        has_scheme = image.startswith(("http://", "https://", "file://"))
        url = image if has_scheme else f"file://{image}"
        return {"type": "image_url", "image_url": {"url": url}}

    raise ValueError(
        f"Invalid image input {image}. Must be a PIL.Image.Image"
        " or str or dictionary with raw image bytes."
    )
342
+
343
+
344
def process_video(video: Any) -> Mapping[str, Any]:
    """
    Turn a single video input into a "video_url" content dictionary.

    Accepted inputs:

    1. Dictionary with a 'bytes' key: the raw video bytes are base64-encoded
       and returned as a `data:video/mp4;base64,...` URL.

    2. String: treated as a URL or local file path. "file://" is prepended
       unless the string already starts with "http://", "https://" or
       "file://". The dictionary carries the resulting video URL.

    Raises:
        ValueError: If the input is not a supported type.
    """
    if isinstance(video, dict) and "bytes" in video:
        encoded = base64.b64encode(video["bytes"]).decode("utf-8")
        data_url = f"data:video/mp4;base64,{encoded}"
        return {"type": "video_url", "video_url": {"url": data_url}}

    if isinstance(video, str):
        has_scheme = video.startswith(("http://", "https://", "file://"))
        url = video if has_scheme else f"file://{video}"
        return {"type": "video_url", "video_url": {"url": url}}

    raise ValueError(
        f"Invalid video input {video}. Must be a string of local path/remote "
        "url, or a dictionary with raw video bytes in the form of "
        "`{'bytes': raw_video_bytes}`."
    )
379
+
380
+
381
+ def gen_prompt_decode_to_target_len(
382
+ tokenizer: TokenizerLike,
383
+ token_sequence: list[int],
384
+ target_token_len: int,
385
+ max_retry: int = 10,
386
+ add_special_tokens: bool = False,
387
+ rng: np.random.Generator | None = None,
388
+ ) -> tuple[str, list[int]]:
389
+ """
390
+ Ensure decoded-then-encoded prompt length matches the target token length.
391
+
392
+ This function decodes an initial token sequence to text and re-encodes it
393
+ , iteratively adjusting the token sequence length to match a target.
394
+ This is necessary because some tokenizers do not guarantee a 1:1 mapping
395
+ between consecutive tokens and the decoded-then-encoded sequence length.
396
+ For example, for GPT2Tokenizer:
397
+ [6880, 6881] -> ['Ġcalls', 'here'] ->
398
+ [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
399
+
400
+ Returns a tuple of the final prompt string and the adjusted token sequence.
401
+ """
402
+ remain_num_try = max_retry
403
+ token_mismatch = 0
404
+ while True:
405
+ prompt = tokenizer.decode(token_sequence)
406
+ token_sequence = tokenizer.encode(prompt, add_special_tokens=add_special_tokens)
407
+ if remain_num_try <= 0:
408
+ if len(token_sequence) != target_token_len:
409
+ token_mismatch = len(token_sequence) - target_token_len
410
+ break
411
+
412
+ if len(token_sequence) == target_token_len:
413
+ break
414
+ elif len(token_sequence) < target_token_len:
415
+ if rng is not None:
416
+ extra_tokens = rng.integers(
417
+ 0,
418
+ tokenizer.vocab_size,
419
+ size=target_token_len - len(token_sequence),
420
+ ).tolist()
421
+ else:
422
+ extra_tokens = np.random.randint(
423
+ 0,
424
+ tokenizer.vocab_size,
425
+ size=target_token_len - len(token_sequence),
426
+ ).tolist()
427
+ token_sequence.extend(extra_tokens)
428
+ elif len(token_sequence) > target_token_len:
429
+ token_sequence = token_sequence[:target_token_len]
430
+
431
+ remain_num_try -= 1
432
+
433
+ return prompt, token_sequence, token_mismatch
434
+
435
+
436
+ # -----------------------------------------------------------------------------
437
+ # Random Dataset Implementation (Synthetic Data)
438
+ # -----------------------------------------------------------------------------
439
+
440
+
441
class RandomDataset(BenchmarkDataset):
    """
    Synthetic text-only dataset for serving/throughput benchmarks.

    Strategy:
    - Sample input/output token lengths per request from integer-uniform ranges
      around configured means (controlled by range_ratio).
    - Prepend a fixed random prefix of length prefix_len.
    - Generate the remaining tokens as a reproducible walk over the allowed
      (non-special) token ids:
      allowed_tokens[(offset + index + arange(input_len)) % len(allowed_tokens)].
    - Decode then re-encode/truncate to ensure prompt token counts match.
    - Uses numpy.default_rng seeded with random_seed for reproducible sampling.
    """

    # Default values copied from benchmark_serving.py for the random dataset.
    DEFAULT_PREFIX_LEN = 0
    DEFAULT_RANGE_RATIO = 0.0
    DEFAULT_INPUT_LEN = 1024
    DEFAULT_OUTPUT_LEN = 128

    def __init__(self, **kwargs) -> None:
        """Initialize the base dataset and a private, seeded RNG."""
        super().__init__(**kwargs)
        # Use numpy's default_rng for deterministic sampling
        # Do not use random.seed() or np.random.seed() elsewhere in this class.
        # This ensures that the RNG is isolated from global RNG state.
        self._rng = np.random.default_rng(self.random_seed)

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        prefix_len: int = DEFAULT_PREFIX_LEN,
        range_ratio: float = DEFAULT_RANGE_RATIO,
        input_len: int = DEFAULT_INPUT_LEN,
        output_len: int = DEFAULT_OUTPUT_LEN,
        batchsize: int = 1,
        **kwargs,
    ) -> list[SampleRequest]:
        """
        Generate `num_requests` synthetic prompts.

        Args:
            tokenizer: Tokenizer used to decode/encode the synthetic tokens.
            num_requests: Number of prompts to generate.
            request_id_prefix: Prefix of every generated request id.
            no_oversample: Accepted for interface compatibility; not used here.
            prefix_len: Length of the random prefix shared by all prompts.
            range_ratio: Relative half-width of the length sampling ranges.
            input_len: Mean input length, including tokenizer special tokens.
            output_len: Mean output length.
            batchsize: If greater than 1, prompts are grouped into batched
                requests of up to this many prompts each.

        Returns:
            The generated requests (batched requests when `batchsize > 1`).

        Raises:
            ValueError: If the smallest possible total input length
                (prefix + sampled) would be below 1 token.
        """
        # validate total input tokens (prefix + sampled) is at least 1.
        special_count = int(tokenizer.num_special_tokens_to_add())
        usable_input_len = max(0, int(input_len) - special_count)
        smallest_sampled = math.floor(usable_input_len * (1.0 - float(range_ratio)))
        smallest_total = int(prefix_len) + smallest_sampled
        if smallest_total < 1:
            raise ValueError(
                "--random-input-len is too small: with tokenizer special "
                f"tokens {special_count} and --random-range-ratio {range_ratio}, "
                "the minimum possible total input tokens (prefix + sampled) is "
                f"{smallest_total}. Increase --random-input-len and/or "
                "--random-prefix-len, or decrease --random-range-ratio so that "
                "prefix_len + floor(max(0, random_input_len - num_special)) "
                "* (1 - range_ratio) >= 1."
            )

        # RNG draw order matters for reproducibility: lengths and offsets
        # first, the shared prefix afterwards.
        input_lens, output_lens, offsets = self.get_sampling_params(
            num_requests, range_ratio, input_len, output_len, tokenizer
        )

        vocab_size = tokenizer.vocab_size
        prohibited_tokens = tokenizer.all_special_ids
        all_tokens = np.arange(vocab_size)
        allowed_tokens = np.array(list(set(all_tokens) - set(prohibited_tokens)))

        # Generate prefix once
        prefix_token_ids = self.get_prefix(allowed_tokens, prefix_len)

        requests = []
        token_mismatch_total = 0
        per_request = zip(input_lens, output_lens, offsets)
        for idx, (sampled_in, sampled_out, sampled_offset) in enumerate(per_request):
            prompt, total_input_len, token_mismatch = self.generate_token_sequence(
                tokenizer=tokenizer,
                prefix_token_ids=prefix_token_ids,
                prefix_len=prefix_len,
                vocab_size=vocab_size,
                input_len=int(sampled_in),
                offset=int(sampled_offset),
                index=idx,
                allowed_tokens=allowed_tokens,
            )
            token_mismatch_total += token_mismatch
            requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=total_input_len,
                    expected_output_len=int(sampled_out),
                    request_id=request_id_prefix + str(idx),
                )
            )

        # only used for embeddings benchmark.
        if batchsize > 1:
            grouped = []
            # Create batched requests
            for start in range(0, num_requests, batchsize):
                chunk = requests[start : start + batchsize]
                grouped.append(
                    SampleRequest(
                        prompt=[member.prompt for member in chunk],
                        prompt_len=sum(member.prompt_len for member in chunk),
                        expected_output_len=0,
                        request_id=request_id_prefix + str(start // batchsize),
                    )
                )
            requests = grouped

        if token_mismatch_total != 0:
            direction = "more" if token_mismatch_total > 0 else "fewer"
            logger.warning(
                "Across all generated prompts, there were %d %s tokens "
                "than expected after decoding and re-encoding. This is "
                "expected due to the imperfect nature of the sampling "
                "procedure.",
                abs(token_mismatch_total),
                direction,
            )

        return requests

    def get_prefix(
        self,
        allowed_tokens: np.ndarray,
        prefix_len: int,
    ) -> list[int]:
        """
        Draw `prefix_len` random token ids from `allowed_tokens`.

        Returns an empty list (without consuming any randomness) when
        `prefix_len` is not positive.
        """
        if not prefix_len > 0:
            return []
        picks = self._rng.integers(0, len(allowed_tokens), size=prefix_len)
        return allowed_tokens[picks].tolist()

    def get_sampling_params(
        self,
        num_requests: int,
        range_ratio: float,
        input_len: int,
        output_len: int,
        tokenizer: TokenizerLike,
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Sample per-request input lengths, output lengths and token offsets.

        Lengths are drawn uniformly from the inclusive integer ranges
        [floor(mean * (1 - r)), ceil(mean * (1 + r))], where the input mean
        is `input_len` minus the tokenizer's special-token count (floored
        at 0) and the output range is clamped to at least 1. Offsets are
        drawn from [0, vocab_size).

        Returns:
            `(input_lens, output_lens, offsets)`, each of size `num_requests`.

        Raises:
            ValueError: If `range_ratio` is outside [0, 1) or a sampling
                interval is empty.
        """
        # Enforce range_ratio < 1
        if not (0.0 <= range_ratio < 1.0):
            raise ValueError("range_ratio must be in [0, 1).")

        num_special_tokens = int(tokenizer.num_special_tokens_to_add())
        real_input_len = max(0, int(input_len) - num_special_tokens)

        # Bounds use floor for low and ceil for high
        input_low = math.floor(real_input_len * (1 - range_ratio))
        input_high = math.ceil(real_input_len * (1 + range_ratio))
        # Ensure the bounds for output length are at least 1 to
        # prevent sampling 0 tokens.
        output_low = max(math.floor(output_len * (1 - range_ratio)), 1)
        output_high = max(math.ceil(output_len * (1 + range_ratio)), 1)

        if input_low > input_high:
            raise ValueError(
                f"Invalid input sampling interval: low={input_low} > high={input_high}"
            )
        if output_low > output_high:
            raise ValueError(
                "Invalid output sampling interval: "
                f"low={output_low} > high={output_high}"
            )

        logger.info(
            "Sampling input_len from [%s, %s] and output_len from [%s, %s]",
            input_low,
            input_high,
            output_low,
            output_high,
        )

        # The three draws must stay in this order to keep results
        # reproducible for a given seed.
        input_lens = self._rng.integers(input_low, input_high + 1, size=num_requests)
        output_lens = self._rng.integers(output_low, output_high + 1, size=num_requests)
        offsets = self._rng.integers(0, tokenizer.vocab_size, size=num_requests)
        return input_lens, output_lens, offsets

    def generate_token_sequence(
        self,
        *,
        tokenizer: TokenizerLike,
        prefix_token_ids: list[int],
        prefix_len: int,
        vocab_size: int,
        input_len: int,
        offset: int,
        index: int,
        allowed_tokens: np.ndarray,
    ) -> tuple[str, int, int]:
        """
        Returns (prompt, total_input_len, token_mismatch).

        `total_input_len` is the token count of the final re-encoded prompt
        and `token_mismatch` is its difference from the requested
        `prefix_len + input_len`. `vocab_size` is accepted but not used here.

        NOTE: After decoding the prompt we have to encode and decode it again.
        This is done because in some cases N consecutive tokens
        give a string tokenized into != N number of tokens.
        For example for GPT2Tokenizer:
        [6880, 6881] -> ['Ġcalls', 'here'] ->
        [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
        To avoid uncontrolled change of the prompt length,
        the encoded sequence is truncated before being decoded again.
        """
        # Walk sequentially through the allowed tokens, starting at a
        # request-specific position and wrapping around at the end.
        positions = (offset + index + np.arange(input_len)) % len(allowed_tokens)
        body_token_ids = allowed_tokens[positions].tolist()
        full_token_ids = prefix_token_ids + body_token_ids

        # Decode, then re-encode and truncate to preserve token count invariants
        target_len = prefix_len + int(input_len)
        prompt, adjusted_token_ids, token_mismatch = gen_prompt_decode_to_target_len(
            tokenizer=tokenizer,
            token_sequence=full_token_ids,
            target_token_len=target_len,
            add_special_tokens=False,
            rng=self._rng,
        )
        return prompt, len(adjusted_token_ids), token_mismatch
669
+
670
+
671
+ # -----------------------------------------------------------------------------
672
+ # Random Dataset for Reranking Implementation (Synthetic Data)
673
+ # -----------------------------------------------------------------------------
674
+
675
+
676
class RandomDatasetForReranking(RandomDataset):
    """
    Random dataset specialized for the needs of scoring:
    - Batches of inputs
    - Inputs composed of pairs

    Every generated request carries one shared synthetic query followed by a
    batch of synthetic documents.
    """

    def __init__(self, **kwargs) -> None:
        """Forward all keyword arguments to `RandomDataset.__init__`."""
        super().__init__(**kwargs)

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        request_id_prefix: str = "",
        range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO,
        input_len: int = RandomDataset.DEFAULT_INPUT_LEN,
        batchsize: int = 1,
        is_reranker: bool = True,
        **kwargs,
    ) -> list[SampleRequest]:
        """
        Generate batched (query, documents) requests.

        Args:
            tokenizer: Tokenizer used to decode/encode the synthetic tokens.
            num_requests: Number of documents to generate (one fewer when
                `is_reranker` is False).
            request_id_prefix: Prefix of every generated request id.
            range_ratio: Relative half-width of the length sampling ranges.
            input_len: Token budget. In reranker mode it is split between
                query, one separator token and document; otherwise query and
                documents are each sized from the full `input_len`.
            batchsize: Documents per request (one fewer when `is_reranker`
                is False).
            is_reranker: Selects the reranker length accounting described
                above.

        Returns:
            One `SampleRequest` per batch whose prompt is
            `[query, doc_1, ..., doc_k]` and whose expected output length
            is 0.
        """
        # One separator token is accounted for per pair in reranker mode.
        n_sep_tokens = int(is_reranker)

        if is_reranker:
            query_len_param = (input_len // 2) - n_sep_tokens
        else:
            query_len_param = input_len

        query_lens, _, query_offsets = self.get_sampling_params(
            1, range_ratio, query_len_param, 0, tokenizer
        )
        query_len = int(query_lens[0])

        if is_reranker:
            doc_len_param = input_len - query_len - n_sep_tokens
        else:
            # Presumably the query takes one of the requested slots here,
            # hence both counters shrink by one.
            assert num_requests > 1 and batchsize > 1
            num_requests -= 1
            batchsize -= 1
            doc_len_param = input_len

        doc_lens, _, doc_offsets = self.get_sampling_params(
            num_requests, range_ratio, doc_len_param, 0, tokenizer
        )

        vocab_size = tokenizer.vocab_size
        prohibited_tokens = tokenizer.all_special_ids
        all_tokens = np.arange(vocab_size)
        allowed_tokens = np.array(list(set(all_tokens) - set(prohibited_tokens)))

        # The single query shared by every request.
        query_prompt, query_input_len, token_mismatch_total = (
            self.generate_token_sequence(
                tokenizer=tokenizer,
                prefix_token_ids=[],
                prefix_len=0,
                vocab_size=vocab_size,
                input_len=query_len,
                offset=int(query_offsets[0]),
                index=0,
                allowed_tokens=allowed_tokens,
            )
        )

        # (prompt, token_count) for each document; index 0 belongs to the
        # query, so documents start at index 1.
        documents = []
        for doc_idx in range(num_requests):
            doc_prompt, doc_input_len, token_mismatch = self.generate_token_sequence(
                tokenizer=tokenizer,
                prefix_token_ids=[],
                prefix_len=0,
                vocab_size=vocab_size,
                input_len=int(doc_lens[doc_idx]),
                offset=int(doc_offsets[doc_idx]),
                index=doc_idx + 1,
                allowed_tokens=allowed_tokens,
            )
            token_mismatch_total += token_mismatch
            documents.append((doc_prompt, doc_input_len))

        batch_requests = []
        # Create batched requests
        for start in range(0, num_requests, batchsize):
            chunk = documents[start : start + batchsize]
            if is_reranker:
                # The query (plus separator) is counted once per pair.
                query_contrib = (query_input_len + n_sep_tokens) * len(chunk)
            else:
                query_contrib = query_input_len
            doc_prompts = [doc[0] for doc in chunk]
            doc_token_total = sum(doc[1] for doc in chunk)
            batch_requests.append(
                SampleRequest(
                    prompt=[query_prompt] + doc_prompts,
                    prompt_len=query_contrib + doc_token_total,
                    expected_output_len=0,
                    request_id=request_id_prefix + str(start // batchsize),
                )
            )

        if token_mismatch_total != 0:
            direction = "more" if token_mismatch_total > 0 else "fewer"
            logger.warning(
                "Across all generated prompts, there were %d %s tokens "
                "than expected after decoding and re-encoding. This is "
                "expected due to the imperfect nature of the sampling "
                "procedure.",
                abs(token_mismatch_total),
                direction,
            )

        return batch_requests
781
+
782
+
783
+ # -----------------------------------------------------------------------------
784
+ # MultiModalDataset Implementation
785
+ # -----------------------------------------------------------------------------
786
+
787
+
788
+ class RandomMultiModalDataset(RandomDataset):
789
+ """
790
+ Synthetic multimodal dataset (text + images) that extends RandomDataset.
791
+
792
+ Status:
793
+ - Images: supported via synthetic RGB data.
794
+ - Video: supported via synthetic RGB data.
795
+ - Audio: not yet supported.
796
+
797
+ Sampling overview:
798
+ 1) Number of items per request is sampled uniformly from the integer range
799
+ [floor(n·(1−r)), ceil(n·(1+r))], where n is the base count and r is
800
+ `num_mm_items_range_ratio` in [0, 1]. r=0 keeps it fixed; r=1 allows 0.
801
+ The maximum is further clamped to the sum of per-modality limits.
802
+ 2) Each item’s modality and shape is sampled from `bucket_config`, a dict
803
+ mapping (height, width, num_frames) → probability. We treat
804
+ `num_frames`=1 as image and `num_frames` > 1 as video.
805
+ Entries with zero probability are removed and the rest are renormalized
806
+ to sum to 1.
807
+ 3) Per-modality hard caps are enforced via `limit_mm_per_prompt`.
808
+ When a modality reaches its cap, all of its buckets are excluded and the
809
+ remaining probabilities are renormalized.
810
+
811
+ Example bucket configuration:
812
+ {(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.1}
813
+ - Two image buckets (`num_frames`=1) and one video bucket
814
+ (`num_frames`=16).
815
+ OBS.: Only image sampling is supported for now.
816
+ """
817
+
818
+ IS_MULTIMODAL = True
819
+ DEFAULT_LIMIT_MM_PER_PROMPT = {"image": 255, "video": 1}
820
+
821
+ DEFAULT_BASE_ITEMS_PER_REQUEST = 1
822
+ DEFAULT_NUM_MM_ITEMS_RANGE_RATIO = 0.0
823
+ DEFAULT_MM_ITEM_BUCKET_CONFIG = {
824
+ (256, 256, 1): 0.5,
825
+ (720, 1280, 1): 0.5,
826
+ (720, 1280, 16): 0.0,
827
+ }
828
+ DEFAULT_ENABLE_MULTIMODAL_CHAT = False
829
+
830
def __init__(self, **kwargs) -> None:
    """Forward all keyword arguments unchanged to the parent initializer."""
    super().__init__(**kwargs)
832
+
833
def generate_synthetic_image(self, width: int, height: int) -> Image.Image:
    """Build a PIL image of the given size filled with random RGB pixels.

    Pixels are drawn from `self._rng` as uint8 values in [0, 256) with
    array shape (height, width, 3).

    NOTE: iid pixel sampling results in worst-case compression
    (good for stressing I/O), but very unlike real photos.
    We could consider a “low-freq” mode (e.g., noise blur)
    to emulate network realism instead of max stress.
    """
    pixel_shape = (height, width, 3)
    pixels = self._rng.integers(0, 256, pixel_shape, dtype=np.uint8)
    return Image.fromarray(pixels)
848
+
849
def generate_synthetic_video(
    self, width: int, height: int, num_frames: int
) -> dict:
    """Generate synthetic video with random values.

    Draws `num_frames` random uint8 RGB frames from `self._rng`, encodes
    them with OpenCV (`mp4v` fourcc, 30 fps) into a temporary ``.mp4``
    file, and returns the file content.

    Returns:
        A dict `{"bytes": <encoded video content>}`.

    Raises:
        RuntimeError: If the OpenCV video writer cannot be opened.
    """
    import os
    from tempfile import TemporaryDirectory

    import cv2

    random_pixels = self._rng.integers(
        0,
        256,
        (num_frames, height, width, 3),
        dtype=np.uint8,
    )

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    fps = 30  # frames per second

    # OpenCV writes to a path, not to a file object, so the video goes
    # through a file inside a private temporary directory. This avoids
    # NamedTemporaryFile(delete_on_close=...), a keyword that only exists
    # on Python 3.12+, and the directory is removed on every exit path.
    with TemporaryDirectory() as temp_dir:
        temp_path = os.path.join(temp_dir, "synthetic.mp4")

        video_writer = cv2.VideoWriter(
            temp_path, fourcc=fourcc, fps=fps, frameSize=(width, height)
        )
        try:
            if not video_writer.isOpened():
                raise RuntimeError("Failed to create video writer")

            for frame in random_pixels:
                video_writer.write(frame)
        finally:
            # Always release so the container is finalized and the handle
            # is not leaked when opening or writing fails.
            video_writer.release()

        # Read the video file content
        with open(temp_path, "rb") as f:
            video_content = f.read()

    return {"bytes": video_content}
892
+
893
def map_config_to_modality(self, config: tuple[int, int, int]) -> str:
    """Return the modality named by a bucket configuration.

    The last element of `config` is the frame count: exactly 1 means
    "image", more than 1 means "video".

    Raises:
        ValueError: If the frame count is less than 1.
    """
    num_frames = config[-1]
    if num_frames == 1:
        return "image"
    if num_frames > 1:
        return "video"
    raise ValueError(f"Invalid multimodal item configuration: {config}")
901
+
902
def normalize_bucket_config(
    self, bucket_config: dict[tuple[int, int, int], float]
) -> dict[tuple[int, int, int], float]:
    """
    Drop zero-probability buckets and rescale the rest to sum to 1.

    Returns:
        A new dict; the input mapping is not modified.

    Raises:
        ValueError: If any value is negative, or if no bucket has a
            positive value.
    """
    # Negative weights are rejected before anything is filtered.
    for weight in bucket_config.values():
        if weight < 0:
            raise ValueError("Bucket config values must be non-negative.")

    # Keep only buckets that can actually be sampled.
    positive = {
        shape: weight for shape, weight in bucket_config.items() if weight > 0
    }
    if not positive:
        raise ValueError(
            "Got invalid bucket config. Bucket config values must be non-zero."
        )

    # Rescale so the remaining weights form a probability distribution.
    total = sum(positive.values())
    return {shape: weight / total for shape, weight in positive.items()}
922
+
923
def generate_mm_item(
    self,
    mm_item_config: tuple[int, int, int],
) -> Mapping[str, Any]:
    """
    Build one synthetic multimodal item for the given bucket key.

    The modality is resolved with ``map_config_to_modality``; an image key
    produces ``process_image(generate_synthetic_image(...))`` and a video
    key produces ``process_video(generate_synthetic_video(...))``.
    This follows the OpenAI API chat completions
    https://github.com/openai/openai-python

    Raises:
        ValueError: If the resolved modality is neither "image" nor "video".
    """
    modality = self.map_config_to_modality(mm_item_config)

    # Element 1 of the key is passed before element 0 to both generators;
    # the third element is only read on the video path.
    if modality == "image":
        synthetic_image = self.generate_synthetic_image(
            mm_item_config[1], mm_item_config[0]
        )
        return process_image(synthetic_image)

    if modality == "video":
        synthetic_video = self.generate_synthetic_video(
            mm_item_config[1], mm_item_config[0], mm_item_config[2]
        )
        return process_video(synthetic_video)

    raise ValueError(f"Invalid multimodal item configuration: {mm_item_config}")
946
+
947
def get_mm_item_sampling_params(
    self,
    base_items_per_request: int,
    num_mm_items_range_ratio: float,
    limit_mm_per_prompt: dict[str, int],
    bucket_config: dict[tuple[int, int, int], float],
) -> tuple[int, int, dict[str, int], dict[tuple[int, int, int], float]]:
    """
    Validate the multimodal sampling inputs and derive the per-request bounds.

    Returns:
        ``(min_num_mm_items, max_num_mm_items, limit_mm_per_prompt,
        bucket_config)`` where the bucket config has been normalized and the
        limits have been restricted to the modalities that remain in it.

    Raises:
        ValueError: If the range ratio is outside [0, 1], if a bucket key
            maps to a modality missing from ``limit_mm_per_prompt``, if no
            limit survives the filtering, or if the computed minimum exceeds
            the computed maximum.
    """
    # Enforce 0 <= num_mm_items_range_ratio <= 1
    if not (0.0 <= num_mm_items_range_ratio <= 1.0):
        raise ValueError("num_mm_items_range_ratio must be in [0, 1].")

    # Every bucket key (zero-probability ones included, since this runs
    # before normalization) must map to a modality that has a limit.
    for mm_config in bucket_config:
        modality = self.map_config_to_modality(mm_config)
        if modality not in limit_mm_per_prompt:
            raise ValueError(
                f"Modality {modality} is not in "
                f"limit_mm_per_prompt: "
                f"{limit_mm_per_prompt.keys()}"
            )

    # Drop zero-probability buckets and rescale the rest to sum to 1.
    bucket_config = self.normalize_bucket_config(bucket_config)
    logger.info("Normalized bucket config: %s", bucket_config)

    # Keep only the limits of modalities that can still be sampled.
    sampled_modalities = {
        self.map_config_to_modality(mm_config) for mm_config in bucket_config
    }
    limit_mm_per_prompt = {
        modality: limit
        for modality, limit in limit_mm_per_prompt.items()
        if modality in sampled_modalities
    }
    if not limit_mm_per_prompt:
        raise ValueError("No valid limits for modalities present in bucket_config.")

    logger.info("Updated mm-limit-per-prompt: %s", limit_mm_per_prompt)

    # Upper bound: ceil(n * (1 + r)), capped by the sum of all limits.
    ratio_upper_bound = math.ceil(
        base_items_per_request * (1 + num_mm_items_range_ratio)
    )
    max_num_mm_items = min(sum(limit_mm_per_prompt.values()), ratio_upper_bound)

    # Lower bound: floor(n * (1 - r)), never below zero.
    ratio_lower_bound = math.floor(
        base_items_per_request * (1 - num_mm_items_range_ratio)
    )
    min_num_mm_items = max(0, ratio_lower_bound)

    if min_num_mm_items > max_num_mm_items:
        raise ValueError(
            f"Min num mm items is greater than max mm items: "
            f"{min_num_mm_items} > {max_num_mm_items}"
        )

    logger.info(
        "Sampling number of multimodal items from [%s, %s]",
        min_num_mm_items,
        max_num_mm_items,
    )

    return (
        min_num_mm_items,
        max_num_mm_items,
        limit_mm_per_prompt,
        bucket_config,
    )
1021
+
1022
def get_mm_item_iterator(
    self,
    min_num_mm_items: int,
    max_num_mm_items: int,
    bucket_config: dict[tuple[int, int, int], float],
    limit_mm_per_prompt: dict[str, int],
) -> Iterator[tuple[int, int, int]]:
    """
    Yield the bucket keys of the multimodal items for one request.

    A target item count is drawn from ``self._rng`` using
    ``min_num_mm_items`` and ``max_num_mm_items + 1`` as bounds. Bucket keys
    are then drawn according to their probabilities until that many items
    have been yielded. When a draw lands on a modality that already reached
    its per-prompt limit, every bucket of that modality is zeroed and the
    remaining probabilities are renormalized; if nothing is left, iteration
    stops early with a warning.

    Note:
        - Zeroing and renormalization are applied to a shallow copy, so the
          ``bucket_config`` dict passed in is never mutated.
    """
    # NOTE(review): presumably a numpy Generator, whose upper bound is
    # exclusive -- hence the "+ 1"; confirm where self._rng is created.
    target_count = int(self._rng.integers(min_num_mm_items, max_num_mm_items + 1))
    if target_count == 0:
        # Nothing to attach to this request.
        return

    items_per_modality = {
        self.map_config_to_modality(mm_config): 0 for mm_config in bucket_config
    }
    remaining = bucket_config.copy()
    num_yielded = 0

    while num_yielded < target_count:
        drawn_config = self._rng.choice(list(remaining), p=list(remaining.values()))
        modality = self.map_config_to_modality(drawn_config)

        if items_per_modality[modality] < limit_mm_per_prompt[modality]:
            items_per_modality[modality] += 1
            num_yielded += 1
            yield drawn_config
            continue

        # The drawn modality is at its cap: make all of its buckets
        # unsampleable before drawing again.
        for mm_config in remaining:
            if self.map_config_to_modality(mm_config) == modality:
                remaining[mm_config] = 0

        # Not expected in practice, because the target count is at most the
        # sum of the per-modality limits.
        if not any(weight != 0 for weight in remaining.values()):
            logger.warning("Exhausted all multimodal items of modality %s", modality)
            break

        remaining = self.normalize_bucket_config(remaining)
1082
+
1083
def sample(
    self,
    tokenizer: TokenizerLike,
    num_requests: int,
    request_id_prefix: str = "",
    no_oversample: bool = False,
    prefix_len: int = RandomDataset.DEFAULT_PREFIX_LEN,
    range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO,
    input_len: int = RandomDataset.DEFAULT_INPUT_LEN,
    output_len: int = RandomDataset.DEFAULT_OUTPUT_LEN,
    limit_mm_per_prompt: dict[str, int] = DEFAULT_LIMIT_MM_PER_PROMPT,
    base_items_per_request: int = DEFAULT_BASE_ITEMS_PER_REQUEST,
    num_mm_items_range_ratio: float = DEFAULT_NUM_MM_ITEMS_RANGE_RATIO,
    bucket_config: dict[
        tuple[int, int, int], float
    ] = DEFAULT_MM_ITEM_BUCKET_CONFIG,
    enable_multimodal_chat: bool = DEFAULT_ENABLE_MULTIMODAL_CHAT,
    **kwargs,
) -> list[SampleRequest]:
    """
    Generate ``num_requests`` random-token requests with synthetic
    multimodal items attached.

    Each request gets a token prompt from ``generate_token_sequence`` and a
    list of items produced by ``generate_mm_item`` for every key yielded by
    ``get_mm_item_iterator``. With ``enable_multimodal_chat`` the items are
    folded into the prompt via ``apply_multimodal_chat_transformation`` and
    ``multi_modal_data`` is ``None``; otherwise they are passed as
    ``multi_modal_data``. ``no_oversample`` and ``**kwargs`` are accepted
    but not used here.
    """
    # Text-side sampling parameters (per-request lengths and offsets).
    input_lens, output_lens, offsets = self.get_sampling_params(
        num_requests, range_ratio, input_len, output_len, tokenizer
    )

    # Multimodal-side sampling parameters (validated and normalized).
    mm_sampling_params = self.get_mm_item_sampling_params(
        base_items_per_request,
        num_mm_items_range_ratio,
        limit_mm_per_prompt,
        bucket_config,
    )
    (
        min_num_mm_items,
        max_num_mm_items,
        limit_mm_per_prompt,
        bucket_config,
    ) = mm_sampling_params

    vocab_size = tokenizer.vocab_size
    # Can't use tokenizer.all_special_ids since
    # it returns ONLY ids from special_tokens_map.json
    # We want to exclude placeholder tokens and all
    # tokens that indicate start/end of image as it
    # may break prompt replacement logic.
    prohibited_tokens = [
        tok_id
        for tok_id, token in tokenizer.added_tokens_decoder.items()
        if token.special
    ]
    candidate_tokens = set(np.arange(vocab_size)) - set(prohibited_tokens)
    allowed_tokens = np.array(list(candidate_tokens))
    logger.debug(
        "Sampling from %d out of %d (vocab size)", len(allowed_tokens), vocab_size
    )

    # The shared prefix is generated a single time and reused by all requests.
    prefix_token_ids = self.get_prefix(allowed_tokens, prefix_len)

    mm_requests = []
    token_mismatch_total = 0
    for i in range(num_requests):
        prompt, total_input_len, token_mismatch = self.generate_token_sequence(
            tokenizer=tokenizer,
            prefix_token_ids=prefix_token_ids,
            prefix_len=prefix_len,
            vocab_size=vocab_size,
            input_len=int(input_lens[i]),
            offset=int(offsets[i]),
            index=i,
            allowed_tokens=allowed_tokens,
        )
        token_mismatch_total += token_mismatch

        # Draw this request's item keys and materialize each item.
        mm_item_iterator = self.get_mm_item_iterator(
            min_num_mm_items,
            max_num_mm_items,
            bucket_config,
            limit_mm_per_prompt,
        )
        mm_content = cast(
            list[dict[str, Any]],
            [self.generate_mm_item(mm_item_config) for mm_item_config in mm_item_iterator],
        )

        request_prompt: Any
        if enable_multimodal_chat:
            # NOTE: For now this option is only provided for completeness
            # given that the serve.py benchmark currently does not use it.
            request_prompt = self.apply_multimodal_chat_transformation(
                prompt, mm_content
            )
            request_mm_data = None
        else:
            request_prompt = prompt
            request_mm_data = mm_content

        mm_requests.append(
            SampleRequest(
                prompt=request_prompt,
                prompt_len=total_input_len,
                expected_output_len=int(output_lens[i]),
                multi_modal_data=request_mm_data,
                request_id=request_id_prefix + str(i),
            )
        )

    if token_mismatch_total != 0:
        if token_mismatch_total > 0:
            sign = "more"
        else:
            sign = "fewer"
        logger.warning(
            "Across all generated prompts, there were %d %s tokens "
            "than expected after decoding and re-encoding. This is "
            "expected due to the imperfect nature of the sampling "
            "procedure.",
            abs(token_mismatch_total),
            sign,
        )

    return mm_requests
1204
+
1205
+
1206
+ # -----------------------------------------------------------------------------
1207
+ # ShareGPT Dataset Implementation
1208
+ # -----------------------------------------------------------------------------
1209
+
1210
+
1211
class ShareGPTDataset(BenchmarkDataset):
    """
    ShareGPT-backed benchmark dataset.

    Reads a JSON file of conversations and turns the first two turns of each
    usable entry into one sample request (prompt and completion).
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        """Load, filter and (unless disabled) shuffle the dataset entries.

        Raises:
            ValueError: If ``self.dataset_path`` is ``None``.
        """
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        with open(self.dataset_path, encoding="utf-8") as json_file:
            self.data = json.load(json_file)

        # Only entries with at least two conversation turns can provide both
        # a prompt and a completion.
        usable_entries = []
        for entry in self.data:
            if "conversations" in entry and len(entry["conversations"]) >= 2:
                usable_entries.append(entry)
        self.data = usable_entries

        # The global ``random`` module is seeded even when shuffling is off.
        random.seed(self.random_seed)
        if getattr(self, "disable_shuffle", False):
            return
        random.shuffle(self.data)

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        lora_path: str | None = None,
        max_loras: int | None = None,
        output_len: int | None = None,
        enable_multimodal_chat: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """Build up to ``num_requests`` sample requests from the loaded data.

        The expected output length is the tokenized completion length unless
        ``output_len`` is given. Entries rejected by ``is_valid_sequence``
        are skipped. The collected list is finally handed to
        ``maybe_oversample_requests`` before being returned.
        """
        samples: list = []
        for entry in self.data:
            if len(samples) >= num_requests:
                break

            turns = entry["conversations"]
            prompt = turns[0]["value"]
            completion = turns[1]["value"]

            # Requested before the validity check, i.e. also for entries
            # that end up being skipped.
            lora_request = self.get_random_lora_request(
                max_loras=max_loras, lora_path=lora_path
            )

            prompt_len = len(tokenizer(prompt).input_ids)
            completion_len = len(tokenizer(completion).input_ids)
            if output_len is None:
                new_output_len = completion_len
            else:
                new_output_len = output_len

            sequence_ok = is_valid_sequence(
                prompt_len,
                new_output_len,
                skip_min_output_len_check=output_len is not None,
            )
            if not sequence_ok:
                continue

            # An image takes precedence over a video when both are present.
            mm_content = None
            image_path = entry.get("image")
            if image_path:
                mm_content = process_image(image_path)
            else:
                video_path = entry.get("video")
                if video_path:
                    mm_content = process_video(video_path)

            if enable_multimodal_chat:
                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)

            # Request ids count accepted samples only, so the next index is
            # the current number of collected samples.
            samples.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=new_output_len,
                    lora_request=lora_request,
                    multi_modal_data=mm_content,
                    request_id=request_id_prefix + str(len(samples)),
                )
            )

        self.maybe_oversample_requests(
            samples, num_requests, request_id_prefix, no_oversample
        )
        return samples
1295
+
1296
+
1297
+ class _ValidateDatasetArgs(argparse.Action):
1298
+ """Argparse action to validate dataset name and path compatibility."""
1299
+
1300
+ def __call__(self, parser, namespace, values, option_string=None):
1301
+ setattr(namespace, self.dest, values)
1302
+
1303
+ # Get current values of both dataset_name and dataset_path
1304
+ dataset_name = getattr(namespace, "dataset_name", "random")
1305
+ dataset_path = getattr(namespace, "dataset_path", None)
1306
+
1307
+ # Validate the combination
1308
+ if dataset_name == "random" and dataset_path is not None:
1309
+ parser.error(
1310
+ "Cannot use 'random' dataset with --dataset-path. "
1311
+ "Please specify the appropriate --dataset-name (e.g., "
1312
+ "'sharegpt', 'custom', 'sonnet') for your dataset file: "
1313
+ f"{dataset_path}"
1314
+ )
1315
+
1316
+
1317
def add_dataset_parser(parser: FlexibleArgumentParser):
    """Register all dataset-related command-line options on ``parser``.

    Adds the general options (seed, prompt count, dataset name/path, ...)
    directly to the parser and one argument group per dataset family:
    custom, spec bench, sonnet, sharegpt, blazedit, random, random
    multimodal, hf and prefix repetition.
    """
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument(
        "--num-prompts",
        type=int,
        default=1000,
        help="Number of prompts to process.",
    )
    parser.add_argument(
        "--dataset-name",
        type=str,
        default="random",
        action=_ValidateDatasetArgs,
        choices=[
            "sharegpt",
            "burstgpt",
            "sonnet",
            "random",
            "random-mm",
            "random-rerank",
            "hf",
            "custom",
            "prefix_repetition",
            "spec_bench",
        ],
        help="Name of the dataset to benchmark on.",
    )
    parser.add_argument(
        "--no-stream",
        action="store_true",
        help="Do not load the dataset in streaming mode.",
    )
    parser.add_argument(
        "--dataset-path",
        type=str,
        default=None,
        action=_ValidateDatasetArgs,
        help="Path to the sharegpt/sonnet dataset. "
        "Or the huggingface dataset ID if using HF dataset.",
    )
    parser.add_argument(
        "--no-oversample",
        action="store_true",
        help="Do not oversample if the dataset has fewer samples than num-prompts.",
    )
    parser.add_argument(
        "--skip-chat-template",
        action="store_true",
        help="Skip applying chat template to prompt for datasets that support it.",
    )
    parser.add_argument(
        "--disable-shuffle",
        action="store_true",
        help="Disable shuffling of dataset samples for deterministic ordering.",
    )

    # group for dataset specific arguments
    custom_group = parser.add_argument_group("custom dataset options")
    custom_group.add_argument(
        "--custom-output-len",
        type=int,
        default=256,
        help="Number of output tokens per request, used only for custom dataset.",
    )

    spec_bench_group = parser.add_argument_group("spec bench dataset options")
    spec_bench_group.add_argument(
        "--spec-bench-output-len",
        type=int,
        default=256,
        help="Num of output tokens per request, used only for spec bench dataset.",
    )
    spec_bench_group.add_argument(
        "--spec-bench-category",
        type=str,
        default=None,
        help="Category for spec bench dataset. If None, use all categories.",
    )

    sonnet_group = parser.add_argument_group("sonnet dataset options")
    sonnet_group.add_argument(
        "--sonnet-input-len",
        type=int,
        default=550,
        help="Number of input tokens per request, used only for sonnet dataset.",
    )
    sonnet_group.add_argument(
        "--sonnet-output-len",
        type=int,
        default=150,
        help="Number of output tokens per request, used only for sonnet dataset.",
    )
    sonnet_group.add_argument(
        "--sonnet-prefix-len",
        type=int,
        default=200,
        help="Number of prefix tokens per request, used only for sonnet dataset.",
    )

    sharegpt_group = parser.add_argument_group("sharegpt dataset options")
    sharegpt_group.add_argument(
        "--sharegpt-output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the output length "
        "from the ShareGPT dataset.",
    )

    blazedit_group = parser.add_argument_group("blazedit dataset options")
    blazedit_group.add_argument(
        "--blazedit-min-distance",
        type=float,
        default=0.0,
        help="Minimum distance for blazedit dataset. Min: 0, Max: 1.0",
    )
    blazedit_group.add_argument(
        "--blazedit-max-distance",
        type=float,
        default=1.0,
        help="Maximum distance for blazedit dataset. Min: 0, Max: 1.0",
    )

    random_group = parser.add_argument_group("random dataset options")
    random_group.add_argument(
        "--random-input-len",
        type=int,
        default=1024,
        help="Number of input tokens per request, used only for random sampling.",
    )
    random_group.add_argument(
        "--random-output-len",
        type=int,
        default=128,
        help="Number of output tokens per request, used only for random sampling.",
    )
    random_group.add_argument(
        "--random-range-ratio",
        type=float,
        default=0.0,
        # A space was missing between "range" and the interval.
        help="Range ratio for sampling input/output length, "
        "used only for random sampling. Must be in the range [0, 1) to define "
        "a symmetric sampling range "
        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
    )
    random_group.add_argument(
        "--random-prefix-len",
        type=int,
        default=0,
        help=(
            "Number of fixed prefix tokens before the random context "
            "in a request. "
            "The total input length is the sum of `random-prefix-len` and "
            "a random "
            "context length sampled from [input_len * (1 - range_ratio), "
            "input_len * (1 + range_ratio)]."
        ),
    )
    random_group.add_argument(
        "--random-batch-size",
        type=int,
        default=1,
        help=("Batch size for random sampling. Only used for embeddings benchmark."),
    )
    random_group.add_argument(
        "--no-reranker",
        action="store_true",
        help=(
            "Whether the model supports reranking natively."
            " Only used for reranker benchmark."
        ),
    )

    # random multimodal dataset options
    random_mm_group = parser.add_argument_group(
        "random multimodal dataset options extended from random dataset"
    )
    random_mm_group.add_argument(
        "--random-mm-base-items-per-request",
        type=int,
        default=RandomMultiModalDataset.DEFAULT_BASE_ITEMS_PER_REQUEST,
        help=(
            "Base number of multimodal items per request for random-mm. "
            "Actual per-request count is sampled around this base using "
            "--random-mm-num-mm-items-range-ratio."
        ),
    )
    random_mm_group.add_argument(
        "--random-mm-num-mm-items-range-ratio",
        type=float,
        default=RandomMultiModalDataset.DEFAULT_NUM_MM_ITEMS_RANGE_RATIO,
        help=(
            "Range ratio r in [0, 1] for sampling items per request. "
            "We sample uniformly from the closed integer range "
            "[floor(n*(1-r)), ceil(n*(1+r))] "
            "where n is the base items per request. "
            "r=0 keeps it fixed; r=1 allows 0 items. The maximum is clamped "
            "to the sum of per-modality limits from "
            "--random-mm-limit-mm-per-prompt. "
            "An error is raised if the computed min exceeds the max."
        ),
    )
    random_mm_group.add_argument(
        "--random-mm-limit-mm-per-prompt",
        type=json.loads,
        default=RandomMultiModalDataset.DEFAULT_LIMIT_MM_PER_PROMPT,
        help=(
            "Per-modality hard caps for items attached per request, e.g. "
            '\'{"image": 3, "video": 0}\'. The sampled per-request item '
            "count is clamped to the sum of these limits. When a modality "
            "reaches its cap, its buckets are excluded and probabilities are "
            "renormalized. "
            "OBS.: Only image sampling is supported for now."
        ),
    )

    def _parse_mm_bucket_config(v: object) -> dict[tuple[int, int, int], float]:
        """Parse a bucket config given as a dict or as a Python-literal string.

        Raises:
            ValueError: If the value cannot be parsed, is not a dict, or has
                a key that is not a tuple of three ints.
        """

        # If already a dict (e.g., programmatic call), normalize keys
        def normalize(d: dict) -> dict[tuple[int, int, int], float]:
            out: dict[tuple[int, int, int], float] = {}
            for k, val in d.items():
                key = k
                if isinstance(key, str):
                    # Best effort: an unparsable string stays a string and
                    # is rejected by the shape check below.
                    with suppress(Exception):
                        key = ast.literal_eval(key)
                if not (
                    isinstance(key, tuple)
                    and len(key) == 3
                    and all(isinstance(x, int) for x in key)
                ):
                    raise ValueError(
                        f"Invalid bucket key {k!r}. Expected tuple (H, W, T)."
                    )
                out[(int(key[0]), int(key[1]), int(key[2]))] = float(val)
            return out

        if isinstance(v, dict):
            return normalize(v)
        if isinstance(v, str):
            # Python literal (supports tuple keys). argparse only turns
            # ArgumentTypeError/TypeError/ValueError from a type callable
            # into a usage error, so a SyntaxError from malformed input is
            # converted instead of escaping as a raw traceback.
            try:
                parsed = ast.literal_eval(v)
            except (ValueError, SyntaxError) as err:
                raise ValueError(
                    f"Bucket config is not a valid Python literal: {err}"
                ) from err
            if not isinstance(parsed, dict):
                raise ValueError("Bucket config must parse to a dict.")
            return normalize(parsed)
        raise ValueError("Unsupported value for --random-mm-bucket-config.")

    random_mm_group.add_argument(
        "--random-mm-bucket-config",
        type=_parse_mm_bucket_config,
        default=RandomMultiModalDataset.DEFAULT_MM_ITEM_BUCKET_CONFIG,
        # Adjacent literals are concatenated verbatim, so every fragment
        # carries its own trailing space or punctuation.
        help=(
            "The bucket config is a dictionary mapping a multimodal item "
            "sampling configuration to a probability. "
            "Currently allows for 2 modalities: images and videos. "
            "A bucket key is a tuple of (height, width, num_frames). "
            "The value is the probability of sampling that specific item. "
            "Example: "
            "--random-mm-bucket-config "
            "{(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.10} "
            "First item: images with resolution 256x256 w.p. 0.5. "
            "Second item: images with resolution 720x1280 w.p. 0.4. "
            "Third item: videos with resolution 720x1280 and 16 frames w.p. 0.1. "
            "OBS.: If the probabilities do not sum to 1, they are normalized. "
            "OBS bis.: Only image sampling is supported for now."
        ),
    )

    hf_group = parser.add_argument_group("hf dataset options")
    hf_group.add_argument(
        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
    )
    hf_group.add_argument(
        "--hf-split", type=str, default=None, help="Split of the HF dataset."
    )
    hf_group.add_argument(
        "--hf-name",
        type=str,
        default=None,
        help=(
            "Name of the dataset on HuggingFace "
            "(e.g., 'lmarena-ai/VisionArena-Chat'). "
            "Specify this if your dataset-path is a local path."
        ),
    )
    hf_group.add_argument(
        "--hf-output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the output lengths "
        "from the sampled HF dataset.",
    )

    prefix_repetition_group = parser.add_argument_group(
        "prefix repetition dataset options"
    )
    prefix_repetition_group.add_argument(
        "--prefix-repetition-prefix-len",
        type=int,
        default=256,
        help="Number of prefix tokens per request, used only for prefix "
        "repetition dataset.",
    )
    prefix_repetition_group.add_argument(
        "--prefix-repetition-suffix-len",
        type=int,
        default=256,
        help="Number of suffix tokens per request, used only for prefix "
        "repetition dataset. Total input length is prefix_len + suffix_len.",
    )
    prefix_repetition_group.add_argument(
        "--prefix-repetition-num-prefixes",
        type=int,
        default=10,
        help="Number of prefixes to generate, used only for prefix repetition "
        "dataset. Prompts per prefix is num_requests // num_prefixes.",
    )
    prefix_repetition_group.add_argument(
        "--prefix-repetition-output-len",
        type=int,
        default=128,
        help="Number of output tokens per request, used only for prefix "
        "repetition dataset.",
    )
1639
+
1640
+
1641
+ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
1642
+ if not hasattr(args, "request_id_prefix"):
1643
+ args.request_id_prefix = ""
1644
+
1645
+ if args.dataset_name == "custom":
1646
+ dataset = CustomDataset(
1647
+ dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
1648
+ )
1649
+ input_requests = dataset.sample(
1650
+ num_requests=args.num_prompts,
1651
+ tokenizer=tokenizer,
1652
+ output_len=args.custom_output_len,
1653
+ skip_chat_template=args.skip_chat_template,
1654
+ request_id_prefix=args.request_id_prefix,
1655
+ no_oversample=args.no_oversample,
1656
+ )
1657
+
1658
+ elif args.dataset_name == "sonnet":
1659
+ dataset = SonnetDataset(
1660
+ dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
1661
+ )
1662
+ # For the "sonnet" dataset, formatting depends on the backend.
1663
+ if args.backend == "openai-chat":
1664
+ input_requests = dataset.sample(
1665
+ num_requests=args.num_prompts,
1666
+ input_len=args.sonnet_input_len,
1667
+ output_len=args.sonnet_output_len,
1668
+ prefix_len=args.sonnet_prefix_len,
1669
+ tokenizer=tokenizer,
1670
+ return_prompt_formatted=False,
1671
+ request_id_prefix=args.request_id_prefix,
1672
+ no_oversample=args.no_oversample,
1673
+ )
1674
+ else:
1675
+ assert tokenizer.chat_template or tokenizer.default_chat_template, (
1676
+ "Tokenizer/model must have chat template for sonnet dataset."
1677
+ )
1678
+ input_requests = dataset.sample(
1679
+ num_requests=args.num_prompts,
1680
+ input_len=args.sonnet_input_len,
1681
+ output_len=args.sonnet_output_len,
1682
+ prefix_len=args.sonnet_prefix_len,
1683
+ tokenizer=tokenizer,
1684
+ return_prompt_formatted=True,
1685
+ request_id_prefix=args.request_id_prefix,
1686
+ no_oversample=args.no_oversample,
1687
+ )
1688
+
1689
+ elif args.dataset_name == "hf":
1690
+ # all following datasets are implemented from the
1691
+ # HuggingFaceDataset base class
1692
+ hf_kwargs = {}
1693
+ if (
1694
+ args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS
1695
+ or args.hf_name in VisionArenaDataset.SUPPORTED_DATASET_PATHS
1696
+ ):
1697
+ dataset_class = VisionArenaDataset
1698
+ args.hf_split = "train"
1699
+ args.hf_subset = None
1700
+ elif (
1701
+ args.dataset_path in MMVUDataset.SUPPORTED_DATASET_PATHS
1702
+ or args.hf_name in MMVUDataset.SUPPORTED_DATASET_PATHS
1703
+ ):
1704
+ dataset_class = MMVUDataset
1705
+ args.hf_split = "validation"
1706
+ args.hf_subset = None
1707
+ elif (
1708
+ args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS
1709
+ or args.hf_name in InstructCoderDataset.SUPPORTED_DATASET_PATHS
1710
+ ):
1711
+ dataset_class = InstructCoderDataset
1712
+ args.hf_split = "train"
1713
+ elif (
1714
+ args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS
1715
+ or args.hf_name in MTBenchDataset.SUPPORTED_DATASET_PATHS
1716
+ ):
1717
+ dataset_class = MTBenchDataset
1718
+ args.hf_split = "train"
1719
+ elif (
1720
+ args.dataset_path in MultiModalConversationDataset.SUPPORTED_DATASET_PATHS
1721
+ or args.hf_name in MultiModalConversationDataset.SUPPORTED_DATASET_PATHS
1722
+ ):
1723
+ dataset_class = MultiModalConversationDataset
1724
+ elif (
1725
+ args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS
1726
+ or args.hf_name in ConversationDataset.SUPPORTED_DATASET_PATHS
1727
+ ):
1728
+ dataset_class = ConversationDataset
1729
+ elif (
1730
+ args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS
1731
+ or args.hf_name in AIMODataset.SUPPORTED_DATASET_PATHS
1732
+ ):
1733
+ dataset_class = AIMODataset
1734
+ args.hf_split = "train"
1735
+ elif (
1736
+ args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS # noqa: E501
1737
+ or args.hf_name in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS
1738
+ ):
1739
+ dataset_class = NextEditPredictionDataset
1740
+ args.hf_split = "train"
1741
+ elif (
1742
+ args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS
1743
+ or args.hf_name in ASRDataset.SUPPORTED_DATASET_PATHS
1744
+ ):
1745
+ dataset_class = ASRDataset
1746
+ args.hf_split = "train"
1747
+ elif args.dataset_path in BlazeditDataset.SUPPORTED_DATASET_PATHS:
1748
+ dataset_class = BlazeditDataset
1749
+ args.hf_split = "train"
1750
+ hf_kwargs = {
1751
+ "min_distance": args.blazedit_min_distance,
1752
+ "max_distance": args.blazedit_max_distance,
1753
+ }
1754
+ elif (
1755
+ args.dataset_path in MLPerfDataset.SUPPORTED_DATASET_PATHS
1756
+ or args.hf_name in MLPerfDataset.SUPPORTED_DATASET_PATHS
1757
+ ):
1758
+ dataset_class = MLPerfDataset
1759
+ args.hf_split = "train"
1760
+ elif (
1761
+ args.dataset_path in MMStarDataset.SUPPORTED_DATASET_PATHS
1762
+ or args.hf_name in MMStarDataset.SUPPORTED_DATASET_PATHS
1763
+ ):
1764
+ dataset_class = MMStarDataset
1765
+ args.hf_split = "val"
1766
+ args.hf_subset = None
1767
+ else:
1768
+ supported_datasets = set(
1769
+ [
1770
+ dataset_name
1771
+ for cls in HuggingFaceDataset.__subclasses__()
1772
+ for dataset_name in cls.SUPPORTED_DATASET_PATHS
1773
+ ]
1774
+ )
1775
+ raise ValueError(
1776
+ f"Unsupported dataset path: {args.dataset_path}. "
1777
+ "Huggingface dataset only supports dataset_path"
1778
+ f" from one of following: {supported_datasets}. "
1779
+ "Please consider contributing if you would "
1780
+ "like to add support for additional dataset formats."
1781
+ )
1782
+
1783
+ if dataset_class.IS_MULTIMODAL and not (
1784
+ args.backend in ("openai-chat", "openai-audio")
1785
+ or "embeddings-" in args.backend
1786
+ ):
1787
+ # multi-modal benchmark is only available on OpenAI Chat
1788
+ # endpoint-type.
1789
+ raise ValueError(
1790
+ "Multi-modal content is only supported on 'openai-chat' and "
1791
+ "'openai-audio' backends."
1792
+ )
1793
+ input_requests = dataset_class(
1794
+ dataset_path=args.dataset_path,
1795
+ dataset_subset=args.hf_subset,
1796
+ dataset_split=args.hf_split,
1797
+ random_seed=args.seed,
1798
+ no_stream=args.no_stream,
1799
+ hf_name=args.hf_name,
1800
+ disable_shuffle=args.disable_shuffle,
1801
+ ).sample(
1802
+ num_requests=args.num_prompts,
1803
+ tokenizer=tokenizer,
1804
+ output_len=args.hf_output_len,
1805
+ request_id_prefix=args.request_id_prefix,
1806
+ no_oversample=args.no_oversample,
1807
+ skip_chat_template=args.skip_chat_template,
1808
+ **hf_kwargs,
1809
+ )
1810
+
1811
+ else:
1812
+ # For datasets that follow a similar structure, use a mapping.
1813
+ dataset_mapping = {
1814
+ "spec_bench": lambda: SpecBench(
1815
+ dataset_path=args.dataset_path,
1816
+ category=args.spec_bench_category,
1817
+ disable_shuffle=args.disable_shuffle,
1818
+ ).sample(
1819
+ num_requests=args.num_prompts,
1820
+ tokenizer=tokenizer,
1821
+ output_len=args.spec_bench_output_len,
1822
+ request_id_prefix=args.request_id_prefix,
1823
+ no_oversample=args.no_oversample,
1824
+ ),
1825
+ "sharegpt": lambda: ShareGPTDataset(
1826
+ random_seed=args.seed,
1827
+ dataset_path=args.dataset_path,
1828
+ disable_shuffle=args.disable_shuffle,
1829
+ ).sample(
1830
+ tokenizer=tokenizer,
1831
+ num_requests=args.num_prompts,
1832
+ output_len=args.sharegpt_output_len,
1833
+ request_id_prefix=args.request_id_prefix,
1834
+ no_oversample=args.no_oversample,
1835
+ ),
1836
+ "burstgpt": lambda: BurstGPTDataset(
1837
+ random_seed=args.seed,
1838
+ dataset_path=args.dataset_path,
1839
+ disable_shuffle=args.disable_shuffle,
1840
+ ).sample(
1841
+ tokenizer=tokenizer,
1842
+ num_requests=args.num_prompts,
1843
+ request_id_prefix=args.request_id_prefix,
1844
+ no_oversample=args.no_oversample,
1845
+ ),
1846
+ "random": lambda: RandomDataset(
1847
+ random_seed=args.seed,
1848
+ dataset_path=args.dataset_path,
1849
+ disable_shuffle=args.disable_shuffle,
1850
+ prefix_len=args.common_prefix_len,
1851
+ ).sample(
1852
+ tokenizer=tokenizer,
1853
+ num_requests=args.num_prompts,
1854
+ prefix_len=args.random_prefix_len,
1855
+ input_len=args.random_input_len,
1856
+ output_len=args.random_output_len,
1857
+ range_ratio=args.random_range_ratio,
1858
+ request_id_prefix=args.request_id_prefix,
1859
+ batchsize=args.random_batch_size,
1860
+ no_oversample=args.no_oversample,
1861
+ ),
1862
+ "random-mm": lambda: RandomMultiModalDataset(
1863
+ random_seed=args.seed,
1864
+ dataset_path=args.dataset_path,
1865
+ disable_shuffle=args.disable_shuffle,
1866
+ ).sample(
1867
+ tokenizer=tokenizer,
1868
+ num_requests=args.num_prompts,
1869
+ prefix_len=args.random_prefix_len,
1870
+ range_ratio=args.random_range_ratio,
1871
+ input_len=args.random_input_len,
1872
+ output_len=args.random_output_len,
1873
+ base_items_per_request=args.random_mm_base_items_per_request,
1874
+ limit_mm_per_prompt=args.random_mm_limit_mm_per_prompt,
1875
+ num_mm_items_range_ratio=args.random_mm_num_mm_items_range_ratio,
1876
+ bucket_config=args.random_mm_bucket_config,
1877
+ request_id_prefix=args.request_id_prefix,
1878
+ no_oversample=args.no_oversample,
1879
+ ),
1880
+ "random-rerank": lambda: RandomDatasetForReranking(
1881
+ random_seed=args.seed,
1882
+ dataset_path=args.dataset_path,
1883
+ disable_shuffle=args.disable_shuffle,
1884
+ ).sample(
1885
+ tokenizer=tokenizer,
1886
+ num_requests=args.num_prompts,
1887
+ input_len=args.random_input_len,
1888
+ range_ratio=args.random_range_ratio,
1889
+ request_id_prefix=args.request_id_prefix,
1890
+ batchsize=args.random_batch_size,
1891
+ is_reranker=not args.no_reranker,
1892
+ ),
1893
+ "prefix_repetition": lambda: PrefixRepetitionRandomDataset(
1894
+ random_seed=args.seed,
1895
+ dataset_path=args.dataset_path,
1896
+ disable_shuffle=args.disable_shuffle,
1897
+ ).sample(
1898
+ tokenizer=tokenizer,
1899
+ num_requests=args.num_prompts,
1900
+ prefix_len=args.prefix_repetition_prefix_len,
1901
+ suffix_len=args.prefix_repetition_suffix_len,
1902
+ num_prefixes=args.prefix_repetition_num_prefixes,
1903
+ output_len=args.prefix_repetition_output_len,
1904
+ request_id_prefix=args.request_id_prefix,
1905
+ no_oversample=args.no_oversample,
1906
+ ),
1907
+ }
1908
+
1909
+ try:
1910
+ # Enforce endpoint compatibility for multimodal datasets.
1911
+ if args.dataset_name == "random-mm" and args.backend not in ["openai-chat"]:
1912
+ raise ValueError(
1913
+ "Multi-modal content (images) is only supported on "
1914
+ "'openai-chat' backend."
1915
+ )
1916
+ input_requests = dataset_mapping[args.dataset_name]()
1917
+ except KeyError as err:
1918
+ raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
1919
+
1920
+ return input_requests
1921
+
1922
+
1923
+ # -----------------------------------------------------------------------------
1924
+ # Custom Dataset Implementation
1925
+ # -----------------------------------------------------------------------------
1926
+
1927
+
1928
class CustomDataset(BenchmarkDataset):
    """
    Benchmark dataset backed by a user-supplied JSONL file.

    Each line of the file is one JSON object that must contain a ``prompt``
    field; every prompt becomes one sample request. E.g.,
    ```
    {"prompt": "What is the capital of India?"}
    {"prompt": "What is the capital of Iran?"}
    {"prompt": "What is the capital of China?"}
    ```
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        """
        Read ``self.dataset_path`` into ``self.data`` as a list of dicts.

        ``self.data`` is the standardized representation ``sample()`` relies
        on, e.g. ``[{"prompt": "What is the capital of India?"}, ...]``.

        Raises:
            ValueError: if no dataset path was given, or the file has no
                'prompt' column.
            NotImplementedError: if the path does not end with ``.jsonl``.
        """
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        self.data = []

        # JSONL is the only file type handled so far.
        if not self.dataset_path.endswith(".jsonl"):
            raise NotImplementedError(
                "Only JSONL format is supported for CustomDataset."
            )

        frame = pd.read_json(path_or_buf=self.dataset_path, lines=True)
        if "prompt" not in frame.columns:
            raise ValueError("JSONL file must contain a 'prompt' column.")

        # One dict per DataFrame row.
        self.data.extend(row.to_dict() for _, row in frame.iterrows())

        # Seed the module-level RNG so the shuffle order is reproducible.
        random.seed(self.random_seed)
        shuffle_disabled = getattr(self, "disable_shuffle", False)
        if not shuffle_disabled:
            random.shuffle(self.data)

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        lora_path: str | None = None,
        max_loras: int | None = None,
        output_len: int | None = None,
        enable_multimodal_chat: bool = False,
        skip_chat_template: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """
        Turn the loaded prompts into at most ``num_requests`` sample requests.

        A non-positive ``num_requests`` means "use every loaded sample".
        Unless ``skip_chat_template`` is set, each prompt is wrapped as a
        single user message through the tokenizer's chat template.
        """
        self.num_available_samples = len(self.data)
        if num_requests <= 0:
            num_requests = self.num_available_samples
            logger.info(
                "num_requests is set to 0 or negative, "
                "so using all available samples: %d",
                num_requests,
            )

        sampled_requests = []
        for idx, record in enumerate(self.data):
            if len(sampled_requests) >= num_requests:
                break

            text = record["prompt"]
            if not skip_chat_template:
                text = tokenizer.apply_chat_template(
                    [{"role": "user", "content": text}],
                    add_generation_prompt=True,
                    tokenize=False,
                )

            request = SampleRequest(
                prompt=text,
                prompt_len=len(tokenizer(text).input_ids),
                expected_output_len=output_len,
                request_id=request_id_prefix + str(idx),
            )
            sampled_requests.append(request)

        self.maybe_oversample_requests(
            sampled_requests, num_requests, request_id_prefix, no_oversample
        )

        return sampled_requests
2028
+
2029
+
2030
+ # -----------------------------------------------------------------------------
2031
+ # Spec Bench Dataset Implementation
2032
+ # -----------------------------------------------------------------------------
2033
+
2034
+
2035
class SpecBench(CustomDataset):
    """
    Implements the SpecBench dataset: https://github.com/hemingkx/Spec-Bench
    Download the dataset using:
    wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl

    Only the first turn of each record is used as the prompt. An optional
    ``category`` keyword restricts the dataset to records of that category.
    """  # noqa: E501

    def __init__(self, **kwargs) -> None:
        # ``category`` must be set before the parent constructor runs:
        # CustomDataset.__init__ calls self.load_data(), which dispatches to
        # the override below and reads self.category. Calling load_data()
        # again here would only re-read and re-shuffle the same file.
        self.category = kwargs.pop("category", None)
        super().__init__(**kwargs)

    def load_data(self) -> None:
        """
        Load the JSONL file into ``self.data`` as ``{"prompt": ...}`` dicts.

        Raises:
            ValueError: if no dataset path was given, or the file has no
                'turns' column.
        """
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        self.data = []

        # Load the JSONL file
        jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)

        # check if the JSONL file has a 'turns' column
        if "turns" not in jsonl_data.columns:
            raise ValueError("JSONL file must contain a 'turns' column.")

        for _, row in jsonl_data.iterrows():
            # sample only from a specific category if specified
            if (not self.category) or (self.category == row["category"]):
                prompt = row["turns"][0]
                self.data.append({"prompt": prompt})

        # Seed the module-level RNG so the shuffle order is reproducible.
        random.seed(self.random_seed)
        if not getattr(self, "disable_shuffle", False):
            random.shuffle(self.data)

    def sample(self, **kwargs) -> list:
        """Build sample requests with the inherited CustomDataset logic."""
        return super().sample(**kwargs)
2073
+
2074
+
2075
+ # -----------------------------------------------------------------------------
2076
+ # Sonnet Dataset Implementation
2077
+ # -----------------------------------------------------------------------------
2078
+
2079
+
2080
@deprecated(
    "SonnetDataset is deprecated and will be removed in a future version.",
)
class SonnetDataset(BenchmarkDataset):
    """
    Simplified Sonnet dataset: reads poem lines from a text file and builds
    prompts out of them. The default lengths below were copied from
    `benchmark_serving.py` for the sonnet dataset.
    """

    DEFAULT_PREFIX_LEN = 200
    DEFAULT_INPUT_LEN = 550
    DEFAULT_OUTPUT_LEN = 150

    def __init__(
        self,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        """Read every line of the text file at ``dataset_path`` into ``self.data``."""
        if not self.dataset_path:
            raise ValueError("dataset_path must be provided.")
        with open(self.dataset_path, encoding="utf-8") as poem_file:
            self.data = poem_file.readlines()

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        prefix_len: int = DEFAULT_PREFIX_LEN,
        input_len: int = DEFAULT_INPUT_LEN,
        output_len: int = DEFAULT_OUTPUT_LEN,
        return_prompt_formatted: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """
        Generate ``num_requests`` prompts made of a fixed instruction, a
        shared block of prefix lines and randomly chosen extra lines.

        A candidate is kept only if its chat-formatted token count does not
        exceed ``input_len``; otherwise a new candidate is drawn.

        Raises:
            ValueError: if ``input_len`` does not exceed the token length of
                the chat-formatted base prompt.
        """
        # Average number of tokens per poem line.
        line_token_counts = [len(tokenizer(line).input_ids) for line in self.data]
        avg_len = sum(line_token_counts) / len(line_token_counts)

        # Token cost of the instruction alone, once chat-formatted.
        base_prompt = "Pick as many lines as you can from these poem lines:\n"
        base_fmt = tokenizer.apply_chat_template(
            [{"role": "user", "content": base_prompt}],
            add_generation_prompt=True,
            tokenize=False,
        )
        base_offset = len(tokenizer(base_fmt).input_ids)
        if input_len <= base_offset:
            raise ValueError(
                f"'input_len' must be higher than the base prompt length "
                f"({base_offset})."
            )

        # Translate the token budgets into line counts.
        num_input_lines = round((input_len - base_offset) / avg_len)
        num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
        prefix_lines = self.data[:num_prefix_lines]
        num_extra_lines = num_input_lines - num_prefix_lines

        samples = []
        ind = 0
        while len(samples) < num_requests:
            extra_lines = random.choices(self.data, k=num_extra_lines)
            prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
            prompt_formatted = tokenizer.apply_chat_template(
                [{"role": "user", "content": prompt}],
                add_generation_prompt=True,
                tokenize=False,
            )
            prompt_len = len(tokenizer(prompt_formatted).input_ids)
            if prompt_len > input_len:
                # Over budget: draw another set of lines.
                continue

            if return_prompt_formatted:
                request_prompt = prompt_formatted
            else:
                request_prompt = prompt
            samples.append(
                SampleRequest(
                    prompt=request_prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    request_id=request_id_prefix + str(ind),
                )
            )
            ind += 1
        return samples
2164
+
2165
+
2166
+ # -----------------------------------------------------------------------------
2167
+ # BurstGPT Dataset Implementation
2168
+ # -----------------------------------------------------------------------------
2169
+
2170
+
2171
class BurstGPTDataset(BenchmarkDataset):
    """
    Implements the BurstGPT dataset. Loads data from a CSV file and generates
    sample requests based on synthetic prompt generation. Only rows with Model
    "GPT-4" and positive response tokens are used.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(
        self,
    ):
        """
        Load the CSV at ``dataset_path`` and keep the usable rows.

        Raises:
            ValueError: if no dataset path was given.
        """
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        df = pd.read_csv(self.dataset_path)
        # Filter to keep only GPT-4 rows.
        gpt4_df = df[df["Model"] == "GPT-4"]
        # Remove failed requests (where Response tokens is 0 or less).
        gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
        # Keep the filtered frame; rows are drawn later by
        # _sample_loaded_data().
        self.data = gpt4_df

    def _sample_loaded_data(self, num_requests: int) -> list:
        """
        Draw ``num_requests`` rows from the filtered frame as a list of lists.

        Rows are drawn with replacement only when more rows are requested
        than the frame holds.
        """
        data = self.data.sample(
            n=num_requests,
            random_state=self.random_seed,
            replace=num_requests > len(self.data),
        )
        # Convert the dataframe to a list of lists.
        return data.values.tolist()

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        max_loras: int | None = None,
        lora_path: str | None = None,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list[SampleRequest]:
        """
        Build ``num_requests`` synthetic requests from sampled trace rows.

        The values at positions 2 and 3 of each sampled row are used as the
        prompt length and the expected output length. The prompt text itself
        is synthetic: token IDs ``(i + j) % vocab_size`` decoded by the
        tokenizer.
        """
        samples = []
        data = self._sample_loaded_data(num_requests=num_requests)
        # Loop-invariant, so read it once.
        vocab_size = tokenizer.vocab_size
        for i, row in enumerate(data):
            input_len = int(row[2])
            output_len = int(row[3])
            lora_req = self.get_random_lora_request(
                max_loras=max_loras, lora_path=lora_path
            )
            # Generate a synthetic prompt: a list of token IDs computed as (i +
            # j) modulo vocab_size.
            token_ids = [(i + j) % vocab_size for j in range(input_len)]
            prompt = tokenizer.decode(token_ids)
            samples.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=input_len,
                    expected_output_len=output_len,
                    lora_request=lora_req,
                    request_id=request_id_prefix + str(i),
                )
            )
        return samples
2241
+
2242
+
2243
+ # -----------------------------------------------------------------------------
2244
+ # HuggingFace Dataset Base Implementation
2245
+ # -----------------------------------------------------------------------------
2246
class HuggingFaceDataset(BenchmarkDataset):
    """Common base for benchmark datasets fetched through `load_dataset`."""

    # Dataset identifiers a subclass can handle. Subclasses use either a set
    # of names or a dict mapping each name to a callable.
    SUPPORTED_DATASET_PATHS: set[str] | dict[str, Callable] = set()

    def __init__(
        self,
        dataset_path: str,
        dataset_split: str,
        no_stream: bool = False,
        dataset_subset: str | None = None,
        hf_name: str | None = None,
        **kwargs,
    ) -> None:
        """
        Store the split/subset/streaming settings and load the dataset.

        ``hf_name`` falls back to ``dataset_path`` when it is empty or None.
        """
        super().__init__(dataset_path=dataset_path, **kwargs)

        self.dataset_split = dataset_split
        self.dataset_subset = dataset_subset
        self.load_stream = not no_stream
        self.hf_name = hf_name if hf_name else dataset_path
        self.load_data()

    def load_data(self) -> None:
        """Fetch the dataset and shuffle it unless shuffling is disabled."""
        dataset = load_dataset(
            self.dataset_path,
            name=self.dataset_subset,
            split=self.dataset_split,
            streaming=self.load_stream,
        )
        self.data = dataset
        shuffle_disabled = getattr(self, "disable_shuffle", False)
        if not shuffle_disabled:
            self.data = self.data.shuffle(seed=self.random_seed)
2278
+
2279
+
2280
+ # -----------------------------------------------------------------------------
2281
+ # Conversation Dataset Implementation
2282
+ # -----------------------------------------------------------------------------
2283
+
2284
+
2285
class ConversationDataset(HuggingFaceDataset):
    """Dataset for text-only conversation data."""

    SUPPORTED_DATASET_PATHS = {
        "Aeala/ShareGPT_Vicuna_unfiltered",
    }
    IS_MULTIMODAL = False

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        output_len: int | None = None,
        enable_multimodal_chat: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """
        Build up to ``num_requests`` requests from two-turn conversations.

        The first turn is the prompt. When ``output_len`` is None the token
        length of the second turn is used as the expected output length, and
        pairs rejected by ``is_valid_sequence`` are skipped.
        """
        # Filter examples with at least 2 conversations
        filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
        sampled_requests = []
        ind = 0
        dynamic_output = output_len is None

        for item in filtered_data:
            if len(sampled_requests) >= num_requests:
                break
            conv = item["conversations"]
            prompt, completion = conv[0]["value"], conv[1]["value"]

            prompt_len = len(tokenizer(prompt).input_ids)
            completion_len = len(tokenizer(completion).input_ids)
            # Filter before asserting: with a dynamic output length, a pair
            # the validity check rejects (presumably including an empty
            # completion — confirm against is_valid_sequence) should be
            # skipped rather than abort the whole run on the assertion below.
            if dynamic_output and not is_valid_sequence(prompt_len, completion_len):
                continue
            output_len = completion_len if dynamic_output else output_len
            assert isinstance(output_len, int) and output_len > 0
            mm_content = process_image(item["image"]) if "image" in item else None
            if enable_multimodal_chat:
                # Note: when chat is enabled the request prompt_len is no longer
                # accurate and we will be using request output to count the
                # actual prompt len and output len
                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                    request_id=request_id_prefix + str(ind),
                )
            )
            ind += 1
        self.maybe_oversample_requests(
            sampled_requests, num_requests, request_id_prefix, no_oversample
        )
        return sampled_requests
2343
+
2344
+
2345
class MultiModalConversationDataset(HuggingFaceDataset):
    """Dataset for multimodal conversation data."""

    SUPPORTED_DATASET_PATHS = {
        "lmms-lab/LLaVA-OneVision-Data",
    }
    IS_MULTIMODAL = True

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        output_len: int | None = None,
        enable_multimodal_chat: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """
        Build up to ``num_requests`` requests from two-turn conversations.

        The first turn is the prompt and the item's ``image`` field, when
        present, is attached as multimodal content. When ``output_len`` is
        None the token length of the second turn is used as the expected
        output length, and pairs rejected by ``is_valid_sequence`` are
        skipped.
        """
        # Filter examples with at least 2 conversations
        filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
        sampled_requests = []
        ind = 0
        dynamic_output = output_len is None

        for item in filtered_data:
            if len(sampled_requests) >= num_requests:
                break
            conv = item["conversations"]
            prompt, completion = conv[0]["value"], conv[1]["value"]

            prompt_len = len(tokenizer(prompt).input_ids)
            completion_len = len(tokenizer(completion).input_ids)
            # Filter before asserting: with a dynamic output length, a pair
            # the validity check rejects (presumably including an empty
            # completion — confirm against is_valid_sequence) should be
            # skipped rather than abort the whole run on the assertion below.
            if dynamic_output and not is_valid_sequence(prompt_len, completion_len):
                continue
            output_len = completion_len if dynamic_output else output_len
            assert isinstance(output_len, int) and output_len > 0
            mm_content = process_image(item["image"]) if "image" in item else None
            if enable_multimodal_chat:
                # Note: when chat is enabled the request prompt_len is no longer
                # accurate and we will be using request output to count the
                # actual prompt len and output len
                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                    request_id=request_id_prefix + str(ind),
                )
            )
            ind += 1
        self.maybe_oversample_requests(
            sampled_requests, num_requests, request_id_prefix, no_oversample
        )
        return sampled_requests
2403
+
2404
+
2405
+ # -----------------------------------------------------------------------------
2406
+ # Vision Arena Dataset Implementation
2407
+ # -----------------------------------------------------------------------------
2408
+
2409
+
2410
class VisionArenaDataset(HuggingFaceDataset):
    """
    Vision Arena Dataset.

    Each supported dataset name maps to a callable that pulls the prompt
    text out of one record; the record's first image is attached as
    multimodal content.
    """

    DEFAULT_OUTPUT_LEN = 128
    SUPPORTED_DATASET_PATHS = {
        "lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"],
        "lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"],
    }
    IS_MULTIMODAL = True

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        output_len: int | None = None,
        enable_multimodal_chat: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """
        Build up to ``num_requests`` image requests.

        Raises:
            ValueError: if ``self.hf_name`` has no registered prompt parser
                (checked while processing the first record).
        """
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN

        sampled_requests = []
        for idx, record in enumerate(self.data):
            if len(sampled_requests) >= num_requests:
                break

            extract_prompt = self.SUPPORTED_DATASET_PATHS.get(self.hf_name)
            if extract_prompt is None:
                raise ValueError(f"Unsupported dataset path: {self.hf_name}")

            prompt = extract_prompt(record)
            mm_content = process_image(record["images"][0])
            prompt_len = len(tokenizer(prompt).input_ids)
            if enable_multimodal_chat:
                # Note: when chat is enabled the request prompt_len is no longer
                # accurate and we will be using request output to count the
                # actual prompt len
                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)

            request = SampleRequest(
                prompt=prompt,
                prompt_len=prompt_len,
                expected_output_len=output_len,
                multi_modal_data=mm_content,
                request_id=request_id_prefix + str(idx),
            )
            sampled_requests.append(request)

        self.maybe_oversample_requests(
            sampled_requests, num_requests, request_id_prefix, no_oversample
        )
        return sampled_requests
2461
+
2462
+
2463
class MMVUDataset(HuggingFaceDataset):
    """
    MMVU Dataset.
    https://huggingface.co/datasets/yale-nlp/MMVU

    The prompt is the question followed by its choices rendered as
    ``key.value`` pairs; the record's video is attached as multimodal
    content.
    """

    # NOTE(review): samples carry video content, yet this class does not set
    # IS_MULTIMODAL the way VisionArenaDataset does — confirm this is intended.
    DEFAULT_OUTPUT_LEN = 128
    SUPPORTED_DATASET_PATHS = {
        "yale-nlp/MMVU": lambda x: x["question"]
        + " "
        + (" ".join(f"{k}.{v}" for k, v in x["choices"].items())),
    }

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        output_len: int | None = None,
        enable_multimodal_chat: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """
        Build up to ``num_requests`` video requests.

        Raises:
            ValueError: if ``self.hf_name`` has no registered prompt parser
                (checked while processing the first record).
        """
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN

        sampled_requests = []
        for idx, record in enumerate(self.data):
            if len(sampled_requests) >= num_requests:
                break

            extract_prompt = self.SUPPORTED_DATASET_PATHS.get(self.hf_name)
            if extract_prompt is None:
                raise ValueError(f"Unsupported dataset path: {self.hf_name}")

            prompt = extract_prompt(record)
            mm_content = process_video(record["video"])
            prompt_len = len(tokenizer(prompt).input_ids)
            if enable_multimodal_chat:
                # Note: when chat is enabled the request prompt_len is no longer
                # accurate and we will be using request output to count the
                # actual prompt len
                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)

            request = SampleRequest(
                prompt=prompt,
                prompt_len=prompt_len,
                expected_output_len=output_len,
                multi_modal_data=mm_content,
                request_id=request_id_prefix + str(idx),
            )
            sampled_requests.append(request)

        self.maybe_oversample_requests(
            sampled_requests, num_requests, request_id_prefix, no_oversample
        )
        return sampled_requests
2515
+
2516
+
2517
+ # -----------------------------------------------------------------------------
2518
+ # Instruct Coder Dataset Implementation
2519
+ # -----------------------------------------------------------------------------
2520
+
2521
+
2522
class InstructCoderDataset(HuggingFaceDataset):
    """
    InstructCoder Dataset.
    https://huggingface.co/datasets/likaixin/InstructCoder

    InstructCoder is a dataset designed for general code editing. It consists
    of 114,239 instruction-input-output triplets and covers multiple distinct
    code editing scenarios.
    """

    DEFAULT_OUTPUT_LEN = 200  # this is the average default output length
    SUPPORTED_DATASET_PATHS = {
        "likaixin/InstructCoder",
    }

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        output_len: int | None = None,
        enable_multimodal_chat: bool = False,
        skip_chat_template: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """
        Build up to ``num_requests`` code-editing requests.

        Each prompt is the record's input code followed by its instruction
        and a fixed "code only" suffix, wrapped in the tokenizer's chat
        template unless ``skip_chat_template`` is set.
        """
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN

        sampled_requests = []
        for idx, record in enumerate(self.data):
            if len(sampled_requests) >= num_requests:
                break

            text = (
                f"{record['input']}\n\n{record['instruction']} Just output "
                "the code, do not include any explanation."
            )
            if not skip_chat_template:
                text = tokenizer.apply_chat_template(
                    [{"role": "user", "content": text}],
                    add_generation_prompt=True,
                    tokenize=False,
                )

            request = SampleRequest(
                prompt=text,
                prompt_len=len(tokenizer(text).input_ids),
                expected_output_len=output_len,
                request_id=request_id_prefix + str(idx),
            )
            sampled_requests.append(request)

        self.maybe_oversample_requests(
            sampled_requests, num_requests, request_id_prefix, no_oversample
        )
        return sampled_requests
2579
+
2580
+
2581
+ # -----------------------------------------------------------------------------
2582
+ # MT-Bench Dataset Implementation
2583
+ # -----------------------------------------------------------------------------
2584
+
2585
+
2586
class MTBenchDataset(HuggingFaceDataset):
    """
    MT-Bench Dataset.
    https://huggingface.co/datasets/philschmid/mt-bench

    Only the first turn of every record is used, giving a single-turn
    dataset. This is similar to the spec decoding benchmark setup in vLLM
    https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
    """  # noqa: E501

    DEFAULT_OUTPUT_LEN = 256  # avg len used in SD bench in vLLM
    SUPPORTED_DATASET_PATHS = {
        "philschmid/mt-bench",
    }

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        output_len: int | None = None,
        enable_multimodal_chat: bool = False,
        skip_chat_template: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """
        Build up to ``num_requests`` single-turn requests.

        Prompts are wrapped in the tokenizer's chat template unless
        ``skip_chat_template`` is set.
        """
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN

        sampled_requests = []
        for idx, record in enumerate(self.data):
            if len(sampled_requests) >= num_requests:
                break

            text = record["turns"][0]
            if not skip_chat_template:
                text = tokenizer.apply_chat_template(
                    [{"role": "user", "content": text}],
                    add_generation_prompt=True,
                    tokenize=False,
                )

            request = SampleRequest(
                prompt=text,
                prompt_len=len(tokenizer(text).input_ids),
                expected_output_len=output_len,
                request_id=request_id_prefix + str(idx),
            )
            sampled_requests.append(request)

        self.maybe_oversample_requests(
            sampled_requests, num_requests, request_id_prefix, no_oversample
        )
        return sampled_requests
2641
+
2642
+
2643
+ # -----------------------------------------------------------------------------
2644
+ # Blazedit Dataset Implementation
2645
+ # -----------------------------------------------------------------------------
2646
+
2647
+
2648
class BlazeditDataset(HuggingFaceDataset):
    """
    Blazedit Dataset.
    https://github.com/ise-uiuc/blazedit

    5k char version: vdaita/edit_5k_char
    10k char version: vdaita/edit_10k_char
    """  # noqa: E501

    # Outputs are roughly as long as the inputs (~5k or ~10k chars).
    # At about 3 chars per token, 10k chars come to ~3333 tokens, so the
    # default is rounded up to 4000 for headroom.
    DEFAULT_OUTPUT_LEN = 4000
    SUPPORTED_DATASET_PATHS = {
        "vdaita/edit_5k_char",
        "vdaita/edit_10k_char",
    }

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        output_len: int | None = None,
        skip_chat_template: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        min_distance: float = 0.0,
        max_distance: float = 1.0,
        **kwargs,
    ) -> list:
        """
        Build up to ``num_requests`` file-editing requests.

        Records whose ``norm_distance`` is below ``min_distance`` or above
        ``max_distance`` are skipped. Prompts are wrapped in the tokenizer's
        chat template unless ``skip_chat_template`` is set.
        """
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN

        sampled_requests = []
        for idx, record in enumerate(self.data):
            if len(sampled_requests) >= num_requests:
                break

            code = record["code"]
            change_request = record["change_request"]
            norm_distance = record["norm_distance"]

            # norm_distance is the levenshtein distance normalized by code
            # length; keep only records inside the requested window.
            below_window = norm_distance < min_distance
            above_window = norm_distance > max_distance
            if below_window or above_window:
                continue

            # template copied from
            # https://github.com/ise-uiuc/blazedit/blob/7765137e656fd62de877422d2e4cf8de51228054/dataset/create_refined_dataset.py#L94-L105 # noqa: E501
            prompt = f"""Given a code file, please apply the change requests and generate the new file.

Original file:
```python
{code}
```

Change request:
{change_request}

Please generate the new code file in the "New file" section below."""  # noqa: E501

            if not skip_chat_template:
                prompt = tokenizer.apply_chat_template(
                    [{"role": "user", "content": prompt}],
                    add_generation_prompt=True,
                    tokenize=False,
                )

            request = SampleRequest(
                prompt=prompt,
                prompt_len=len(tokenizer(prompt).input_ids),
                expected_output_len=output_len,
                request_id=request_id_prefix + str(idx),
            )
            sampled_requests.append(request)

        self.maybe_oversample_requests(
            sampled_requests, num_requests, request_id_prefix, no_oversample
        )

        return sampled_requests
2730
+
2731
+
2732
+ # -----------------------------------------------------------------------------
2733
+ # AIMO Dataset Implementation
2734
+ # -----------------------------------------------------------------------------
2735
+
2736
+
2737
class AIMODataset(HuggingFaceDataset):
    """
    Dataset class for AIMO datasets made of reasoning questions.

    Each row's "problem" field is used as the prompt. Its "solution" field
    supplies the expected output length whenever the caller does not
    request a fixed one.
    """

    SUPPORTED_DATASET_PATHS = {
        "AI-MO/aimo-validation-aime",
        "AI-MO/NuminaMath-1.5",
        "AI-MO/NuminaMath-CoT",
    }

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        output_len: int | None = None,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """Collect up to ``num_requests`` requests from the dataset.

        When ``output_len`` is ``None`` the tokenized solution length is
        used per row, and rows rejected by ``is_valid_sequence`` are
        skipped. Otherwise the given ``output_len`` is used for every row.
        """
        # Decided once, because output_len is reassigned inside the loop.
        use_solution_len = output_len is None
        requests = []

        for row in self.data:
            if len(requests) >= num_requests:
                break

            problem = row["problem"]
            solution = row["solution"]

            prompt_len = len(tokenizer(problem).input_ids)
            completion_len = len(tokenizer(solution).input_ids)

            if use_solution_len:
                output_len = completion_len
            assert isinstance(output_len, int) and output_len > 0

            if use_solution_len and not is_valid_sequence(
                prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000
            ):
                continue

            # Ids count only the rows that were actually kept.
            request = SampleRequest(
                prompt=problem,
                prompt_len=prompt_len,
                expected_output_len=output_len,
                multi_modal_data=None,
                request_id=request_id_prefix + str(len(requests)),
            )
            requests.append(request)

        self.maybe_oversample_requests(
            requests, num_requests, request_id_prefix, no_oversample
        )
        return requests
2790
+
2791
+
2792
+ # -----------------------------------------------------------------------------
2793
+ # Next Edit Prediction Dataset Implementation
2794
+ # -----------------------------------------------------------------------------
2795
+
2796
+
2797
+ zeta_prompt = """### Instruction:
2798
+ You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.
2799
+
2800
+ ### User Edits:
2801
+
2802
+ {}
2803
+
2804
+ ### User Excerpt:
2805
+
2806
+ {}
2807
+
2808
+ ### Response:
2809
+
2810
+ """ # noqa: E501
2811
+
2812
+
2813
+ def _format_zeta_prompt(
2814
+ sample: dict, original_start_marker: str = "<|editable_region_start|>"
2815
+ ) -> dict:
2816
+ """Format the zeta prompt for the Next Edit Prediction (NEP) dataset.
2817
+
2818
+ This function formats examples from the NEP dataset
2819
+ into prompts and expected outputs. It could be
2820
+ further extended to support more NEP datasets.
2821
+
2822
+ Args:
2823
+ sample: The dataset sample containing events,
2824
+ inputs, and outputs.
2825
+ original_start_marker: The marker indicating the
2826
+ start of the editable region. Defaults to
2827
+ "<|editable_region_start|>".
2828
+
2829
+ Returns:
2830
+ A dictionary with the formatted prompts and expected outputs.
2831
+ """
2832
+ events = sample["events"]
2833
+ input = sample["input"]
2834
+ output = sample["output"]
2835
+ prompt = zeta_prompt.format(events, input)
2836
+
2837
+ # following the original implementation, extract the focused region
2838
+ # from the raw output
2839
+ output_start_index = output.find(original_start_marker)
2840
+ output_focused_region = output[output_start_index:]
2841
+ expected_output = output_focused_region
2842
+
2843
+ return {"prompt": prompt, "expected_output": expected_output}
2844
+
2845
+
2846
class NextEditPredictionDataset(HuggingFaceDataset):
    """
    Dataset class for processing a Next Edit Prediction dataset.
    """

    SUPPORTED_DATASET_PATHS = {
        "zed-industries/zeta",
    }
    # Maps a dataset path to the function that turns one raw row into a
    # dict with "prompt" and "expected_output" entries.
    MAPPING_PROMPT_FUNCS = {
        "zed-industries/zeta": _format_zeta_prompt,
    }

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """Collect up to ``num_requests`` next-edit-prediction requests.

        Args:
            tokenizer: Tokenizer used to measure prompt and expected
                output lengths.
            num_requests: Number of requests to collect before stopping.
            request_id_prefix: String prepended to every request id.
            no_oversample: Forwarded to ``maybe_oversample_requests``.

        Returns:
            The list of sampled requests, after
            ``maybe_oversample_requests`` has been applied to it.

        Raises:
            ValueError: If no formatting function is registered for
                ``self.hf_name``.
        """
        formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.hf_name)
        if formatting_prompt_func is None:
            raise ValueError(f"Unsupported dataset path: {self.hf_name}")

        samples = []
        for i, row in enumerate(self.data):
            # Check the limit before doing any work, as the other datasets
            # in this file do, so a non-positive num_requests yields no
            # request instead of one.
            if len(samples) >= num_requests:
                break
            formatted = formatting_prompt_func(row)
            samples.append(
                SampleRequest(
                    prompt=formatted["prompt"],
                    prompt_len=len(tokenizer(formatted["prompt"]).input_ids),
                    expected_output_len=len(
                        tokenizer(formatted["expected_output"]).input_ids
                    ),
                    request_id=request_id_prefix + str(i),
                )
            )
        self.maybe_oversample_requests(
            samples, num_requests, request_id_prefix, no_oversample
        )
        return samples
2888
+
2889
+
2890
+ # -----------------------------------------------------------------------------
2891
+ # ASR Dataset Implementation
2892
+ # -----------------------------------------------------------------------------
2893
+
2894
+
2895
class ASRDataset(HuggingFaceDataset):
    """
    Dataset class for ASR (speech transcription) datasets.
    Tested on the following set:

    +----------------+----------------------------------------+--------------------------+-----------------------------+
    | Dataset        | Domain                                 | Speaking Style           | hf-subset                   |
    +----------------+----------------------------------------+--------------------------+-----------------------------+
    | TED-LIUM       | TED talks                              | Oratory                  | release1, release2, release3|
    |                |                                        |                          | release3-speaker-adaptation |
    | VoxPopuli      | European Parliament                    | Oratory                  | en, de, it, fr,  ...        |
    | LibriSpeech    | Audiobook                              | Narrated                 | "LIUM/tedlium"              |
    | GigaSpeech     | Audiobook, podcast, YouTube            | Narrated, spontaneous    | xs, s, m, l, xl, dev, test  |
    | SPGISpeech     | Financial meetings                     | Oratory, spontaneous     | S, M, L, dev, test          |
    | AMI            | Meetings                               | Spontaneous              | ihm, sdm                    |
    +----------------+----------------------------------------+--------------------------+-----------------------------+

    """  # noqa: E501

    SUPPORTED_DATASET_PATHS = {
        "openslr/librispeech_asr",
        "facebook/voxpopuli",
        "LIUM/tedlium",
        "edinburghcstr/ami",
        "speechcolab/gigaspeech",
        "kensho/spgispeech",
    }

    DEFAULT_OUTPUT_LEN = 128
    IS_MULTIMODAL = True

    # TODO Whisper-specific. Abstract interface when more models are supported.
    TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
    # When true, clips longer than 30 seconds are dropped during sampling.
    skip_long_audios: bool = True

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        output_len: int | None = None,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list:
        """Collect up to ``num_requests`` transcription requests.

        Every request shares the same text prompt (the transcription
        preamble) and carries its audio as ``(array, sampling_rate)`` in
        the multimodal data. A warning is logged when clips were dropped
        for being too long.
        """
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN

        # The text prompt is identical for all requests; tokenize it once.
        prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
        prompt_len = len(tokenizer(prompt).input_ids)

        requests = []
        num_skipped = 0
        for row in self.data:
            if len(requests) >= num_requests:
                break

            audio = row["audio"]
            waveform = audio["array"]
            sampling_rate = audio["sampling_rate"]
            duration_s = librosa.get_duration(y=waveform, sr=sampling_rate)

            # Whisper max supported duration
            if self.skip_long_audios and duration_s > 30:
                num_skipped += 1
                continue

            # Ids count only the clips that were actually kept.
            request = SampleRequest(
                prompt=prompt,
                prompt_len=prompt_len,
                expected_output_len=output_len,
                multi_modal_data={"audio": (waveform, sampling_rate)},
                request_id=request_id_prefix + str(len(requests)),
            )
            requests.append(request)

        if num_skipped:
            logger.warning(
                "%d samples discarded from dataset due to"
                " their length being greater than"
                " what Whisper supports.",
                num_skipped,
            )
        self.maybe_oversample_requests(
            requests, num_requests, request_id_prefix, no_oversample
        )
        return requests
2978
+
2979
+
2980
+ # -----------------------------------------------------------------------------
2981
+ # MLPerf Dataset Implementation
2982
+ # -----------------------------------------------------------------------------
2983
+
2984
+
2985
class MLPerfDataset(HuggingFaceDataset):
    """
    MLPerf Inference Dataset.

    Dataset on HF:
    https://huggingface.co/datasets/mgoin/mlperf-inference-llama2-data
    https://huggingface.co/datasets/mgoin/mlperf-inference-llama3.1-data

    Each record contains:
      - "system_prompt": system role instruction.
      - "question": user question.
      - "output": reference answer.

    The system prompt and question are combined into a chat-formatted prompt
    (using the tokenizer's chat template). Unless the caller supplies an
    explicit output length, the expected output length is the tokenized
    length of the provided reference answer.
    """

    SUPPORTED_DATASET_PATHS = {
        "mgoin/mlperf-inference-llama2-data",
        "mgoin/mlperf-inference-llama3.1-data",
    }

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        output_len: int | None = None,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list[SampleRequest]:
        """Collect up to ``num_requests`` chat-formatted requests.

        Records rejected by ``is_valid_sequence`` are skipped.
        """
        # The reference answer length is used only when the caller gave
        # no explicit output length.
        use_reference_len = output_len is None
        collected: list[SampleRequest] = []

        for record in self.data:
            if len(collected) >= num_requests:
                break

            system_prompt = record["system_prompt"]
            question = record["question"]
            reference_answer = record["output"]

            # Render the system + user turns through the chat template.
            chat = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": question},
            ]
            prompt_formatted = tokenizer.apply_chat_template(
                chat, add_generation_prompt=True, tokenize=False
            )
            prompt_len = len(tokenizer(prompt_formatted).input_ids)

            # The reference answer is tokenized without special tokens.
            reference_ids = tokenizer(
                reference_answer, add_special_tokens=False
            ).input_ids
            ref_out_len = len(reference_ids)
            if use_reference_len:
                expected_output_len = ref_out_len
            else:
                expected_output_len = output_len

            # Skip records whose lengths fail validation.
            if not is_valid_sequence(prompt_len, expected_output_len):
                continue

            # Ids count only the records that were actually kept.
            collected.append(
                SampleRequest(
                    prompt=prompt_formatted,
                    prompt_len=prompt_len,
                    expected_output_len=expected_output_len,
                    request_id=request_id_prefix + str(len(collected)),
                )
            )

        self.maybe_oversample_requests(
            collected, num_requests, request_id_prefix, no_oversample
        )
        return collected
3064
+
3065
+
3066
+ # -----------------------------------------------------------------------------
3067
+ # Prefix Repetition Dataset Implementation
3068
+ # -----------------------------------------------------------------------------
3069
+
3070
+
3071
class PrefixRepetitionRandomDataset(BenchmarkDataset):
    """Synthetic dataset of random-token prompts that share prefixes.

    ``num_prefixes`` random prefixes are generated, and each one is
    combined with several independently generated random suffixes.
    """

    # Default values copied from benchmark_serving.py for the repeated prefix
    # dataset.
    DEFAULT_PREFIX_LEN = 256
    DEFAULT_SUFFIX_LEN = 256
    DEFAULT_NUM_PREFIXES = 10
    DEFAULT_OUTPUT_LEN = 128

    def __init__(
        self,
        **kwargs,
    ) -> None:
        """Initialize the base dataset and seed both random generators."""
        super().__init__(**kwargs)
        random.seed(self.random_seed)
        np.random.seed(self.random_seed)

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        prefix_len: int = DEFAULT_PREFIX_LEN,
        suffix_len: int = DEFAULT_SUFFIX_LEN,
        num_prefixes: int = DEFAULT_NUM_PREFIXES,
        output_len: int = DEFAULT_OUTPUT_LEN,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list[SampleRequest]:
        """Generate prompts made of a shared prefix plus a random suffix.

        ``num_requests // num_prefixes`` prompts are generated per prefix,
        so the result holds fewer than ``num_requests`` entries when
        ``num_requests`` is not a multiple of ``num_prefixes``.

        Args:
            tokenizer: Tokenizer providing ``vocab_size`` and ``decode``.
            num_requests: Requested number of prompts (see note above).
            prefix_len: Target token length of each shared prefix.
            suffix_len: Target token length of each per-prompt suffix.
            num_prefixes: Number of distinct prefixes to generate.
            output_len: Expected output length assigned to every request.
            request_id_prefix: Accepted for interface compatibility; not
                used by this method.
            no_oversample: Accepted for interface compatibility; not used
                by this method.

        Returns:
            The generated requests, shuffled unless the instance has a
            truthy ``disable_shuffle`` attribute.

        Raises:
            ValueError: If ``num_prefixes`` is not positive, or if
                ``num_requests`` is smaller than ``num_prefixes``.
        """
        if num_prefixes <= 0:
            # Guard the integer division below; zero would otherwise raise
            # ZeroDivisionError and a negative count would silently yield
            # an empty result.
            raise ValueError(
                f"num_prefixes ({num_prefixes}) must be a positive integer"
            )

        vocab_size = tokenizer.vocab_size
        prompts_per_prefix = num_requests // num_prefixes
        if prompts_per_prefix == 0:
            raise ValueError(
                f"num_requests ({num_requests}) must be greater than or equal "
                f"to num_prefixes ({num_prefixes})"
            )

        def _generate_exact_length_tokens(
            target_length: int,
        ) -> tuple[list[int], int]:
            """Generate tokens that decode and re-encode to exactly
            target_length.

            Returns the adjusted token list together with the token
            mismatch reported by ``gen_prompt_decode_to_target_len``.
            """
            # Generate random tokens
            tokens = np.random.randint(0, vocab_size, size=target_length).tolist()

            _, adjusted_tokens, token_mismatch = gen_prompt_decode_to_target_len(  # noqa: E501
                tokenizer=tokenizer,
                token_sequence=tokens,
                target_token_len=target_length,
                add_special_tokens=False,
            )
            return adjusted_tokens, token_mismatch

        requests = []
        token_mismatch_total = 0
        for _ in range(num_prefixes):
            prefix_tokens, prefix_mismatch = _generate_exact_length_tokens(prefix_len)
            token_mismatch_total += prefix_mismatch

            for _ in range(prompts_per_prefix):
                suffix_tokens, suffix_mismatch = _generate_exact_length_tokens(
                    suffix_len
                )
                token_mismatch_total += suffix_mismatch
                combined_tokens = prefix_tokens + suffix_tokens
                prompt = tokenizer.decode(combined_tokens)
                prompt_len = len(combined_tokens)
                requests.append(
                    SampleRequest(
                        prompt=prompt,
                        prompt_len=prompt_len,
                        expected_output_len=output_len,
                    )
                )

        if token_mismatch_total != 0:
            sign = "more" if token_mismatch_total > 0 else "fewer"
            logger.warning(
                "Across all generated prompts, there were %d %s tokens "
                "than expected after decoding and re-encoding. This is "
                "expected due to the imperfect nature of the sampling "
                "procedure.",
                abs(token_mismatch_total),
                sign,
            )
        if not getattr(self, "disable_shuffle", False):
            random.shuffle(requests)
        return requests
3156
+
3157
+
3158
+ # -----------------------------------------------------------------------------
3159
+ # MMStar Dataset Implementation
3160
+ # -----------------------------------------------------------------------------
3161
+
3162
+
3163
class MMStarDataset(HuggingFaceDataset):
    """
    Lin-Chen/MMStar: https://huggingface.co/datasets/Lin-Chen/MMStar
    refer to: https://github.com/sgl-project/SpecForge/pull/106

    Each request pairs the question text (without its options) with the
    row's image.
    """

    DEFAULT_OUTPUT_LEN = 128
    SUPPORTED_DATASET_PATHS = {"Lin-Chen/MMStar"}
    IS_MULTIMODAL = True

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        output_len: int | None = None,
        enable_multimodal_chat: bool = False,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs,
    ) -> list[SampleRequest]:
        """Collect up to ``num_requests`` image + question requests.

        With ``enable_multimodal_chat`` the image is embedded in the chat
        prompt; otherwise the prompt is plain text and the image travels
        in ``multi_modal_data``.
        """
        # Fall back to the class default when --hf-output-len is not set.
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN

        collected: list[SampleRequest] = []
        for index, row in enumerate(self.data):
            if len(collected) >= num_requests:
                break

            # Keep only the question part that precedes "Options:".
            raw_question: str = row.get("question", "")
            question_text = raw_question.split("Options:", 1)[0].strip()

            # Multimodal image content.
            image_content = process_image(row["image"])

            # The prompt length is that of the plain question text,
            # whichever prompt form is chosen below.
            prompt_len = len(tokenizer(question_text).input_ids)

            if not enable_multimodal_chat:
                # Plain-text prompt; the image is passed separately for
                # the benchmark to assemble.
                prompt = question_text
                mm_for_request = image_content
            else:
                # Embed the image into the chat message:
                # [{"role":"user","content":[...]}]
                prompt = self.apply_multimodal_chat_transformation(
                    question_text, image_content
                )
                mm_for_request = None  # Already embedded in chat content.

            collected.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_for_request,
                    request_id=request_id_prefix + str(index),
                )
            )

        self.maybe_oversample_requests(
            collected, num_requests, request_id_prefix, no_oversample
        )
        return collected