vllm_cpu_avx512bf16-0.14.0-cp313-cp313-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1712)
  1. vllm/_C.abi3.so +0 -0
  2. vllm/__init__.py +225 -0
  3. vllm/_aiter_ops.py +1511 -0
  4. vllm/_bc_linter.py +54 -0
  5. vllm/_custom_ops.py +3206 -0
  6. vllm/_ipex_ops.py +445 -0
  7. vllm/_version.py +34 -0
  8. vllm/assets/__init__.py +0 -0
  9. vllm/assets/audio.py +43 -0
  10. vllm/assets/base.py +40 -0
  11. vllm/assets/image.py +62 -0
  12. vllm/assets/video.py +149 -0
  13. vllm/attention/__init__.py +0 -0
  14. vllm/attention/layer.py +913 -0
  15. vllm/attention/utils/__init__.py +0 -0
  16. vllm/attention/utils/kv_sharing_utils.py +33 -0
  17. vllm/attention/utils/kv_transfer_utils.py +60 -0
  18. vllm/beam_search.py +88 -0
  19. vllm/benchmarks/__init__.py +0 -0
  20. vllm/benchmarks/datasets.py +3277 -0
  21. vllm/benchmarks/latency.py +172 -0
  22. vllm/benchmarks/lib/__init__.py +3 -0
  23. vllm/benchmarks/lib/endpoint_request_func.py +777 -0
  24. vllm/benchmarks/lib/ready_checker.py +72 -0
  25. vllm/benchmarks/lib/utils.py +79 -0
  26. vllm/benchmarks/mm_processor.py +363 -0
  27. vllm/benchmarks/serve.py +1761 -0
  28. vllm/benchmarks/startup.py +321 -0
  29. vllm/benchmarks/sweep/__init__.py +0 -0
  30. vllm/benchmarks/sweep/cli.py +41 -0
  31. vllm/benchmarks/sweep/param_sweep.py +159 -0
  32. vllm/benchmarks/sweep/plot.py +675 -0
  33. vllm/benchmarks/sweep/plot_pareto.py +393 -0
  34. vllm/benchmarks/sweep/serve.py +450 -0
  35. vllm/benchmarks/sweep/serve_sla.py +459 -0
  36. vllm/benchmarks/sweep/server.py +114 -0
  37. vllm/benchmarks/sweep/sla_sweep.py +138 -0
  38. vllm/benchmarks/sweep/utils.py +4 -0
  39. vllm/benchmarks/throughput.py +946 -0
  40. vllm/collect_env.py +857 -0
  41. vllm/compilation/__init__.py +0 -0
  42. vllm/compilation/activation_quant_fusion.py +214 -0
  43. vllm/compilation/backends.py +840 -0
  44. vllm/compilation/base_static_graph.py +57 -0
  45. vllm/compilation/caching.py +196 -0
  46. vllm/compilation/collective_fusion.py +1224 -0
  47. vllm/compilation/compiler_interface.py +639 -0
  48. vllm/compilation/counter.py +50 -0
  49. vllm/compilation/cuda_graph.py +309 -0
  50. vllm/compilation/decorators.py +662 -0
  51. vllm/compilation/fix_functionalization.py +266 -0
  52. vllm/compilation/fusion.py +570 -0
  53. vllm/compilation/fusion_attn.py +363 -0
  54. vllm/compilation/fx_utils.py +92 -0
  55. vllm/compilation/inductor_pass.py +145 -0
  56. vllm/compilation/matcher_utils.py +454 -0
  57. vllm/compilation/monitor.py +62 -0
  58. vllm/compilation/noop_elimination.py +130 -0
  59. vllm/compilation/partition_rules.py +75 -0
  60. vllm/compilation/pass_manager.py +164 -0
  61. vllm/compilation/piecewise_backend.py +191 -0
  62. vllm/compilation/post_cleanup.py +21 -0
  63. vllm/compilation/qk_norm_rope_fusion.py +244 -0
  64. vllm/compilation/rocm_aiter_fusion.py +401 -0
  65. vllm/compilation/sequence_parallelism.py +368 -0
  66. vllm/compilation/torch25_custom_graph_pass.py +44 -0
  67. vllm/compilation/vllm_inductor_pass.py +180 -0
  68. vllm/compilation/wrapper.py +329 -0
  69. vllm/config/__init__.py +112 -0
  70. vllm/config/attention.py +114 -0
  71. vllm/config/cache.py +233 -0
  72. vllm/config/compilation.py +1149 -0
  73. vllm/config/device.py +75 -0
  74. vllm/config/ec_transfer.py +110 -0
  75. vllm/config/kv_events.py +56 -0
  76. vllm/config/kv_transfer.py +119 -0
  77. vllm/config/load.py +124 -0
  78. vllm/config/lora.py +102 -0
  79. vllm/config/model.py +2026 -0
  80. vllm/config/model_arch.py +57 -0
  81. vllm/config/multimodal.py +247 -0
  82. vllm/config/observability.py +157 -0
  83. vllm/config/parallel.py +703 -0
  84. vllm/config/pooler.py +188 -0
  85. vllm/config/profiler.py +199 -0
  86. vllm/config/scheduler.py +298 -0
  87. vllm/config/speculative.py +656 -0
  88. vllm/config/speech_to_text.py +39 -0
  89. vllm/config/structured_outputs.py +78 -0
  90. vllm/config/utils.py +374 -0
  91. vllm/config/vllm.py +1487 -0
  92. vllm/connections.py +189 -0
  93. vllm/device_allocator/__init__.py +0 -0
  94. vllm/device_allocator/cumem.py +301 -0
  95. vllm/distributed/__init__.py +6 -0
  96. vllm/distributed/communication_op.py +43 -0
  97. vllm/distributed/device_communicators/__init__.py +0 -0
  98. vllm/distributed/device_communicators/all2all.py +509 -0
  99. vllm/distributed/device_communicators/all_reduce_utils.py +344 -0
  100. vllm/distributed/device_communicators/base_device_communicator.py +303 -0
  101. vllm/distributed/device_communicators/cpu_communicator.py +209 -0
  102. vllm/distributed/device_communicators/cuda_communicator.py +346 -0
  103. vllm/distributed/device_communicators/cuda_wrapper.py +190 -0
  104. vllm/distributed/device_communicators/custom_all_reduce.py +326 -0
  105. vllm/distributed/device_communicators/mnnvl_compat.py +27 -0
  106. vllm/distributed/device_communicators/pynccl.py +386 -0
  107. vllm/distributed/device_communicators/pynccl_allocator.py +191 -0
  108. vllm/distributed/device_communicators/pynccl_wrapper.py +567 -0
  109. vllm/distributed/device_communicators/quick_all_reduce.py +290 -0
  110. vllm/distributed/device_communicators/ray_communicator.py +259 -0
  111. vllm/distributed/device_communicators/shm_broadcast.py +778 -0
  112. vllm/distributed/device_communicators/shm_object_storage.py +697 -0
  113. vllm/distributed/device_communicators/symm_mem.py +156 -0
  114. vllm/distributed/device_communicators/xpu_communicator.py +98 -0
  115. vllm/distributed/ec_transfer/__init__.py +14 -0
  116. vllm/distributed/ec_transfer/ec_connector/__init__.py +0 -0
  117. vllm/distributed/ec_transfer/ec_connector/base.py +247 -0
  118. vllm/distributed/ec_transfer/ec_connector/example_connector.py +201 -0
  119. vllm/distributed/ec_transfer/ec_connector/factory.py +85 -0
  120. vllm/distributed/ec_transfer/ec_transfer_state.py +42 -0
  121. vllm/distributed/eplb/__init__.py +3 -0
  122. vllm/distributed/eplb/async_worker.py +115 -0
  123. vllm/distributed/eplb/eplb_state.py +1192 -0
  124. vllm/distributed/eplb/policy/__init__.py +19 -0
  125. vllm/distributed/eplb/policy/abstract.py +43 -0
  126. vllm/distributed/eplb/policy/default.py +376 -0
  127. vllm/distributed/eplb/rebalance_execute.py +699 -0
  128. vllm/distributed/kv_events.py +505 -0
  129. vllm/distributed/kv_transfer/README.md +29 -0
  130. vllm/distributed/kv_transfer/__init__.py +20 -0
  131. vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg +0 -0
  132. vllm/distributed/kv_transfer/kv_connector/__init__.py +0 -0
  133. vllm/distributed/kv_transfer/kv_connector/base.py +10 -0
  134. vllm/distributed/kv_transfer/kv_connector/factory.py +203 -0
  135. vllm/distributed/kv_transfer/kv_connector/utils.py +459 -0
  136. vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +19 -0
  137. vllm/distributed/kv_transfer/kv_connector/v1/base.py +607 -0
  138. vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py +419 -0
  139. vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py +450 -0
  140. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +344 -0
  141. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/__init__.py +18 -0
  142. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +395 -0
  143. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py +211 -0
  144. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +1431 -0
  145. vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +941 -0
  146. vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +186 -0
  147. vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py +916 -0
  148. vllm/distributed/kv_transfer/kv_connector/v1/moriio/__init__.py +0 -0
  149. vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py +321 -0
  150. vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py +1515 -0
  151. vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py +609 -0
  152. vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +477 -0
  153. vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +2688 -0
  154. vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +557 -0
  155. vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py +0 -0
  156. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +531 -0
  157. vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +632 -0
  158. vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +273 -0
  159. vllm/distributed/kv_transfer/kv_transfer_state.py +78 -0
  160. vllm/distributed/parallel_state.py +1809 -0
  161. vllm/distributed/utils.py +545 -0
  162. vllm/engine/__init__.py +0 -0
  163. vllm/engine/arg_utils.py +2137 -0
  164. vllm/engine/async_llm_engine.py +6 -0
  165. vllm/engine/llm_engine.py +6 -0
  166. vllm/engine/protocol.py +194 -0
  167. vllm/entrypoints/__init__.py +0 -0
  168. vllm/entrypoints/anthropic/__init__.py +0 -0
  169. vllm/entrypoints/anthropic/protocol.py +162 -0
  170. vllm/entrypoints/anthropic/serving_messages.py +468 -0
  171. vllm/entrypoints/api_server.py +186 -0
  172. vllm/entrypoints/chat_utils.py +1912 -0
  173. vllm/entrypoints/cli/__init__.py +19 -0
  174. vllm/entrypoints/cli/benchmark/__init__.py +0 -0
  175. vllm/entrypoints/cli/benchmark/base.py +25 -0
  176. vllm/entrypoints/cli/benchmark/latency.py +21 -0
  177. vllm/entrypoints/cli/benchmark/main.py +57 -0
  178. vllm/entrypoints/cli/benchmark/mm_processor.py +21 -0
  179. vllm/entrypoints/cli/benchmark/serve.py +21 -0
  180. vllm/entrypoints/cli/benchmark/startup.py +21 -0
  181. vllm/entrypoints/cli/benchmark/sweep.py +21 -0
  182. vllm/entrypoints/cli/benchmark/throughput.py +21 -0
  183. vllm/entrypoints/cli/collect_env.py +38 -0
  184. vllm/entrypoints/cli/main.py +79 -0
  185. vllm/entrypoints/cli/openai.py +260 -0
  186. vllm/entrypoints/cli/run_batch.py +68 -0
  187. vllm/entrypoints/cli/serve.py +253 -0
  188. vllm/entrypoints/cli/types.py +29 -0
  189. vllm/entrypoints/constants.py +12 -0
  190. vllm/entrypoints/context.py +898 -0
  191. vllm/entrypoints/grpc_server.py +531 -0
  192. vllm/entrypoints/launcher.py +175 -0
  193. vllm/entrypoints/llm.py +1807 -0
  194. vllm/entrypoints/logger.py +86 -0
  195. vllm/entrypoints/openai/__init__.py +0 -0
  196. vllm/entrypoints/openai/api_server.py +1390 -0
  197. vllm/entrypoints/openai/cli_args.py +320 -0
  198. vllm/entrypoints/openai/orca_metrics.py +120 -0
  199. vllm/entrypoints/openai/parser/__init__.py +0 -0
  200. vllm/entrypoints/openai/parser/harmony_utils.py +820 -0
  201. vllm/entrypoints/openai/parser/responses_parser.py +176 -0
  202. vllm/entrypoints/openai/protocol.py +2566 -0
  203. vllm/entrypoints/openai/run_batch.py +635 -0
  204. vllm/entrypoints/openai/serving_chat.py +1897 -0
  205. vllm/entrypoints/openai/serving_chat_stream_harmony.py +101 -0
  206. vllm/entrypoints/openai/serving_completion.py +740 -0
  207. vllm/entrypoints/openai/serving_engine.py +1612 -0
  208. vllm/entrypoints/openai/serving_models.py +309 -0
  209. vllm/entrypoints/openai/serving_responses.py +2552 -0
  210. vllm/entrypoints/openai/serving_transcription.py +168 -0
  211. vllm/entrypoints/openai/speech_to_text.py +711 -0
  212. vllm/entrypoints/openai/utils.py +49 -0
  213. vllm/entrypoints/pooling/__init__.py +16 -0
  214. vllm/entrypoints/pooling/classify/__init__.py +0 -0
  215. vllm/entrypoints/pooling/classify/api_router.py +48 -0
  216. vllm/entrypoints/pooling/classify/protocol.py +181 -0
  217. vllm/entrypoints/pooling/classify/serving.py +233 -0
  218. vllm/entrypoints/pooling/embed/__init__.py +0 -0
  219. vllm/entrypoints/pooling/embed/api_router.py +65 -0
  220. vllm/entrypoints/pooling/embed/conftest.py +28 -0
  221. vllm/entrypoints/pooling/embed/protocol.py +217 -0
  222. vllm/entrypoints/pooling/embed/serving.py +684 -0
  223. vllm/entrypoints/pooling/pooling/__init__.py +0 -0
  224. vllm/entrypoints/pooling/pooling/api_router.py +62 -0
  225. vllm/entrypoints/pooling/pooling/protocol.py +146 -0
  226. vllm/entrypoints/pooling/pooling/serving.py +354 -0
  227. vllm/entrypoints/pooling/score/__init__.py +0 -0
  228. vllm/entrypoints/pooling/score/api_router.py +147 -0
  229. vllm/entrypoints/pooling/score/protocol.py +146 -0
  230. vllm/entrypoints/pooling/score/serving.py +511 -0
  231. vllm/entrypoints/renderer.py +411 -0
  232. vllm/entrypoints/responses_utils.py +218 -0
  233. vllm/entrypoints/sagemaker/__init__.py +4 -0
  234. vllm/entrypoints/sagemaker/routes.py +118 -0
  235. vllm/entrypoints/score_utils.py +271 -0
  236. vllm/entrypoints/serve/__init__.py +94 -0
  237. vllm/entrypoints/serve/cache/__init__.py +0 -0
  238. vllm/entrypoints/serve/cache/api_router.py +61 -0
  239. vllm/entrypoints/serve/disagg/__init__.py +0 -0
  240. vllm/entrypoints/serve/disagg/api_router.py +109 -0
  241. vllm/entrypoints/serve/disagg/protocol.py +90 -0
  242. vllm/entrypoints/serve/disagg/serving.py +285 -0
  243. vllm/entrypoints/serve/elastic_ep/__init__.py +0 -0
  244. vllm/entrypoints/serve/elastic_ep/api_router.py +96 -0
  245. vllm/entrypoints/serve/elastic_ep/middleware.py +49 -0
  246. vllm/entrypoints/serve/instrumentator/__init__.py +0 -0
  247. vllm/entrypoints/serve/instrumentator/health.py +33 -0
  248. vllm/entrypoints/serve/instrumentator/metrics.py +45 -0
  249. vllm/entrypoints/serve/instrumentator/offline_docs.py +50 -0
  250. vllm/entrypoints/serve/instrumentator/server_info.py +56 -0
  251. vllm/entrypoints/serve/instrumentator/static/swagger-ui-bundle.js +2 -0
  252. vllm/entrypoints/serve/instrumentator/static/swagger-ui.css +3 -0
  253. vllm/entrypoints/serve/lora/__init__.py +0 -0
  254. vllm/entrypoints/serve/lora/api_router.py +70 -0
  255. vllm/entrypoints/serve/profile/__init__.py +0 -0
  256. vllm/entrypoints/serve/profile/api_router.py +46 -0
  257. vllm/entrypoints/serve/rlhf/__init__.py +0 -0
  258. vllm/entrypoints/serve/rlhf/api_router.py +102 -0
  259. vllm/entrypoints/serve/rpc/__init__.py +0 -0
  260. vllm/entrypoints/serve/rpc/api_router.py +61 -0
  261. vllm/entrypoints/serve/sleep/__init__.py +0 -0
  262. vllm/entrypoints/serve/sleep/api_router.py +56 -0
  263. vllm/entrypoints/serve/tokenize/__init__.py +0 -0
  264. vllm/entrypoints/serve/tokenize/api_router.py +112 -0
  265. vllm/entrypoints/serve/tokenize/serving.py +204 -0
  266. vllm/entrypoints/ssl.py +78 -0
  267. vllm/entrypoints/tool.py +187 -0
  268. vllm/entrypoints/tool_server.py +234 -0
  269. vllm/entrypoints/utils.py +336 -0
  270. vllm/env_override.py +402 -0
  271. vllm/envs.py +1791 -0
  272. vllm/exceptions.py +36 -0
  273. vllm/forward_context.py +375 -0
  274. vllm/grpc/__init__.py +17 -0
  275. vllm/grpc/compile_protos.py +94 -0
  276. vllm/grpc/vllm_engine.proto +195 -0
  277. vllm/grpc/vllm_engine_pb2.py +77 -0
  278. vllm/grpc/vllm_engine_pb2.pyi +213 -0
  279. vllm/grpc/vllm_engine_pb2_grpc.py +330 -0
  280. vllm/inputs/__init__.py +44 -0
  281. vllm/inputs/data.py +359 -0
  282. vllm/inputs/parse.py +147 -0
  283. vllm/inputs/preprocess.py +716 -0
  284. vllm/logger.py +303 -0
  285. vllm/logging_utils/__init__.py +13 -0
  286. vllm/logging_utils/dump_input.py +83 -0
  287. vllm/logging_utils/formatter.py +127 -0
  288. vllm/logging_utils/lazy.py +20 -0
  289. vllm/logging_utils/log_time.py +34 -0
  290. vllm/logits_process.py +121 -0
  291. vllm/logprobs.py +206 -0
  292. vllm/lora/__init__.py +0 -0
  293. vllm/lora/layers/__init__.py +43 -0
  294. vllm/lora/layers/base.py +66 -0
  295. vllm/lora/layers/base_linear.py +172 -0
  296. vllm/lora/layers/column_parallel_linear.py +577 -0
  297. vllm/lora/layers/fused_moe.py +739 -0
  298. vllm/lora/layers/logits_processor.py +203 -0
  299. vllm/lora/layers/replicated_linear.py +70 -0
  300. vllm/lora/layers/row_parallel_linear.py +176 -0
  301. vllm/lora/layers/utils.py +115 -0
  302. vllm/lora/layers/vocal_parallel_embedding.py +140 -0
  303. vllm/lora/lora_model.py +221 -0
  304. vllm/lora/lora_weights.py +227 -0
  305. vllm/lora/model_manager.py +858 -0
  306. vllm/lora/ops/__init__.py +0 -0
  307. vllm/lora/ops/ipex_ops/__init__.py +6 -0
  308. vllm/lora/ops/ipex_ops/lora_ops.py +57 -0
  309. vllm/lora/ops/torch_ops/__init__.py +20 -0
  310. vllm/lora/ops/torch_ops/lora_ops.py +128 -0
  311. vllm/lora/ops/triton_ops/README_TUNING.md +60 -0
  312. vllm/lora/ops/triton_ops/__init__.py +21 -0
  313. vllm/lora/ops/triton_ops/fused_moe_lora_op.py +677 -0
  314. vllm/lora/ops/triton_ops/kernel_utils.py +340 -0
  315. vllm/lora/ops/triton_ops/lora_expand_op.py +310 -0
  316. vllm/lora/ops/triton_ops/lora_kernel_metadata.py +154 -0
  317. vllm/lora/ops/triton_ops/lora_shrink_op.py +287 -0
  318. vllm/lora/ops/triton_ops/utils.py +313 -0
  319. vllm/lora/peft_helper.py +128 -0
  320. vllm/lora/punica_wrapper/__init__.py +10 -0
  321. vllm/lora/punica_wrapper/punica_base.py +493 -0
  322. vllm/lora/punica_wrapper/punica_cpu.py +351 -0
  323. vllm/lora/punica_wrapper/punica_gpu.py +413 -0
  324. vllm/lora/punica_wrapper/punica_selector.py +21 -0
  325. vllm/lora/punica_wrapper/punica_xpu.py +276 -0
  326. vllm/lora/punica_wrapper/utils.py +150 -0
  327. vllm/lora/request.py +60 -0
  328. vllm/lora/resolver.py +88 -0
  329. vllm/lora/utils.py +281 -0
  330. vllm/lora/worker_manager.py +278 -0
  331. vllm/model_executor/__init__.py +9 -0
  332. vllm/model_executor/custom_op.py +203 -0
  333. vllm/model_executor/layers/__init__.py +0 -0
  334. vllm/model_executor/layers/activation.py +628 -0
  335. vllm/model_executor/layers/attention/__init__.py +0 -0
  336. vllm/model_executor/layers/attention/chunked_local_attention.py +130 -0
  337. vllm/model_executor/layers/attention/cross_attention.py +182 -0
  338. vllm/model_executor/layers/attention/encoder_only_attention.py +103 -0
  339. vllm/model_executor/layers/attention/mm_encoder_attention.py +234 -0
  340. vllm/model_executor/layers/attention/static_sink_attention.py +254 -0
  341. vllm/model_executor/layers/attention_layer_base.py +34 -0
  342. vllm/model_executor/layers/batch_invariant.py +1063 -0
  343. vllm/model_executor/layers/conv.py +262 -0
  344. vllm/model_executor/layers/fla/__init__.py +8 -0
  345. vllm/model_executor/layers/fla/ops/__init__.py +17 -0
  346. vllm/model_executor/layers/fla/ops/chunk.py +240 -0
  347. vllm/model_executor/layers/fla/ops/chunk_delta_h.py +344 -0
  348. vllm/model_executor/layers/fla/ops/chunk_o.py +183 -0
  349. vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py +154 -0
  350. vllm/model_executor/layers/fla/ops/cumsum.py +280 -0
  351. vllm/model_executor/layers/fla/ops/fused_recurrent.py +390 -0
  352. vllm/model_executor/layers/fla/ops/index.py +41 -0
  353. vllm/model_executor/layers/fla/ops/kda.py +1351 -0
  354. vllm/model_executor/layers/fla/ops/l2norm.py +146 -0
  355. vllm/model_executor/layers/fla/ops/layernorm_guard.py +396 -0
  356. vllm/model_executor/layers/fla/ops/op.py +60 -0
  357. vllm/model_executor/layers/fla/ops/solve_tril.py +556 -0
  358. vllm/model_executor/layers/fla/ops/utils.py +194 -0
  359. vllm/model_executor/layers/fla/ops/wy_fast.py +158 -0
  360. vllm/model_executor/layers/fused_moe/__init__.py +120 -0
  361. vllm/model_executor/layers/fused_moe/all2all_utils.py +173 -0
  362. vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +411 -0
  363. vllm/model_executor/layers/fused_moe/config.py +1111 -0
  364. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  365. vllm/model_executor/layers/fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  366. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  367. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  368. vllm/model_executor/layers/fused_moe/configs/E=1,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  369. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  370. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  371. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
  372. vllm/model_executor/layers/fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  373. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  374. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  375. vllm/model_executor/layers/fused_moe/configs/E=1,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  376. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  377. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  378. vllm/model_executor/layers/fused_moe/configs/E=1,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  379. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  380. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  381. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  382. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H100,dtype=fp8_w8a8.json +123 -0
  383. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  384. vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=NVIDIA_H200.json +146 -0
  385. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  386. vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json +147 -0
  387. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  388. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  389. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  390. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  391. vllm/model_executor/layers/fused_moe/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  392. vllm/model_executor/layers/fused_moe/configs/E=128,N=352,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +122 -0
  393. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  394. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  395. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  396. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  397. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  398. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20-3e.json +146 -0
  399. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  400. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  401. vllm/model_executor/layers/fused_moe/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  402. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  403. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  404. vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  405. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  406. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +114 -0
  407. vllm/model_executor/layers/fused_moe/configs/E=128,N=704,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  408. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  409. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=AMD_Instinct_MI308X.json +213 -0
  410. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  411. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_B200.json +147 -0
  412. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  413. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  414. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  415. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  416. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  417. vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  418. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  419. vllm/model_executor/layers/fused_moe/configs/E=128,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  420. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json +147 -0
  421. vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json +147 -0
  422. vllm/model_executor/layers/fused_moe/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  423. vllm/model_executor/layers/fused_moe/configs/E=129,N=704,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  424. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=AMD_Instinct_MI300X.json +200 -0
  425. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  426. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  427. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H100.json +146 -0
  428. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  429. vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  430. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  431. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  432. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  433. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  434. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  435. vllm/model_executor/layers/fused_moe/configs/E=16,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  436. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
  437. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  438. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  439. vllm/model_executor/layers/fused_moe/configs/E=16,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  440. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  441. vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H200.json +146 -0
  442. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  443. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  444. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  445. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  446. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  447. vllm/model_executor/layers/fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H200,dtype=int8_w8a16.json +146 -0
  448. vllm/model_executor/layers/fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  449. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  450. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
  451. vllm/model_executor/layers/fused_moe/configs/E=16,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  452. vllm/model_executor/layers/fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  453. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
  454. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  455. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  456. vllm/model_executor/layers/fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
  457. vllm/model_executor/layers/fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
  458. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI300X.json +201 -0
  459. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  460. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  461. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json +147 -0
  462. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H20-3e.json +146 -0
  463. vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +147 -0
  464. vllm/model_executor/layers/fused_moe/configs/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  465. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  466. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI350_OAM,dtype=fp8_w8a8.json +164 -0
  467. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=AMD_Instinct_MI355_OAM,dtype=fp8_w8a8.json +164 -0
  468. vllm/model_executor/layers/fused_moe/configs/E=160,N=384,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json +147 -0
  469. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  470. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  471. vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  472. vllm/model_executor/layers/fused_moe/configs/E=160,N=768,device_name=NVIDIA_B300_SXM6_AC,dtype=fp8_w8a8.json +147 -0
  473. vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json +147 -0
  474. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  475. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  476. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  477. vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  478. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325X,block_shape=[128,128].json +200 -0
  479. vllm/model_executor/layers/fused_moe/configs/E=256,N=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  480. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  481. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  482. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  483. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  484. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  485. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  486. vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  487. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  488. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  489. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  490. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  491. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  492. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  493. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  494. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  495. vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  496. vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  497. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +200 -0
  498. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  499. vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  500. vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  501. vllm/model_executor/layers/fused_moe/configs/E=32,N=1408,device_name=NVIDIA_B200.json +147 -0
  502. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  503. vllm/model_executor/layers/fused_moe/configs/E=32,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  504. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  505. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  506. vllm/model_executor/layers/fused_moe/configs/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  507. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  508. vllm/model_executor/layers/fused_moe/configs/E=384,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  509. vllm/model_executor/layers/fused_moe/configs/E=40,N=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +147 -0
  510. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  511. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  512. vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  513. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  514. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  515. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_B200.json +146 -0
  516. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +147 -0
  517. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  518. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  519. vllm/model_executor/layers/fused_moe/configs/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  520. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  521. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  522. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  523. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  524. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  525. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  526. vllm/model_executor/layers/fused_moe/configs/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  527. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  528. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_B200.json +146 -0
  529. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_GB200,dtype=fp8_w8a8.json +146 -0
  530. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  531. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H20-3e.json +146 -0
  532. vllm/model_executor/layers/fused_moe/configs/E=512,N=512,device_name=NVIDIA_H200.json +146 -0
  533. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +147 -0
  534. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_B200.json +146 -0
  535. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H20-3e.json +146 -0
  536. vllm/model_executor/layers/fused_moe/configs/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  537. vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=AMD_Instinct_MI300X.json +200 -0
  538. vllm/model_executor/layers/fused_moe/configs/E=60,N=176,device_name=AMD_Instinct_MI300X.json +200 -0
  539. vllm/model_executor/layers/fused_moe/configs/E=60,N=352,device_name=AMD_Instinct_MI300X.json +200 -0
  540. vllm/model_executor/layers/fused_moe/configs/E=60,N=704,device_name=AMD_Instinct_MI300X.json +200 -0
  541. vllm/model_executor/layers/fused_moe/configs/E=62,N=128,device_name=AMD_Instinct_MI300X.json +200 -0
  542. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=AMD_Instinct_MI300X.json +200 -0
  543. vllm/model_executor/layers/fused_moe/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  544. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=AMD_Instinct_MI300X.json +200 -0
  545. vllm/model_executor/layers/fused_moe/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  546. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  547. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  548. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  549. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  550. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  551. vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  552. vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_B200.json +147 -0
  553. vllm/model_executor/layers/fused_moe/configs/E=64,N=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  554. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  555. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  556. vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  557. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  558. vllm/model_executor/layers/fused_moe/configs/E=64,N=3072,device_name=NVIDIA_H20.json +146 -0
  559. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  560. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  561. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  562. vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  563. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  564. vllm/model_executor/layers/fused_moe/configs/E=64,N=384,device_name=NVIDIA_H20.json +146 -0
  565. vllm/model_executor/layers/fused_moe/configs/E=64,N=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  566. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  567. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  568. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  569. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  570. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  571. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  572. vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  573. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H100_PCIe,dtype=fp8_w8a8,block_shape=[128,128].json +147 -0
  574. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8.json +146 -0
  575. vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=NVIDIA_H20.json +146 -0
  576. vllm/model_executor/layers/fused_moe/configs/E=64,N=896,device_name=NVIDIA_H20.json +146 -0
  577. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=bf16.json +82 -0
  578. vllm/model_executor/layers/fused_moe/configs/E=64,N=8960,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +82 -0
  579. vllm/model_executor/layers/fused_moe/configs/E=72,N=192,device_name=AMD_Instinct_MI300X.json +200 -0
  580. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=AMD_Instinct_MI300X.json +200 -0
  581. vllm/model_executor/layers/fused_moe/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  582. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=AMD_Instinct_MI300X.json +200 -0
  583. vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  584. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  585. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
  586. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  587. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  588. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
  589. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  590. vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  591. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  592. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json +200 -0
  593. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  594. vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI325X.json +200 -0
  595. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  596. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
  597. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  598. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  599. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  600. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  601. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  602. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  603. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  604. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  605. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json +200 -0
  606. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  607. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI325X.json +200 -0
  608. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  609. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  610. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  611. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +154 -0
  612. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  613. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  614. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  615. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
  616. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  617. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  618. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  619. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  620. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
  621. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  622. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  623. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  624. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  625. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
  626. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  627. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json +200 -0
  628. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  629. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X.json +200 -0
  630. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  631. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  632. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  633. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  634. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  635. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  636. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
  637. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  638. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  639. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  640. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  641. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  642. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  643. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  644. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +164 -0
  645. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json +200 -0
  646. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +164 -0
  647. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X.json +200 -0
  648. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
  649. vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  650. vllm/model_executor/layers/fused_moe/configs/README +12 -0
  651. vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +444 -0
  652. vllm/model_executor/layers/fused_moe/cutlass_moe.py +1086 -0
  653. vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +364 -0
  654. vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +427 -0
  655. vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +420 -0
  656. vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +436 -0
  657. vllm/model_executor/layers/fused_moe/fallback.py +127 -0
  658. vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py +338 -0
  659. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +310 -0
  660. vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +371 -0
  661. vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +192 -0
  662. vllm/model_executor/layers/fused_moe/fused_batched_moe.py +1018 -0
  663. vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +824 -0
  664. vllm/model_executor/layers/fused_moe/fused_moe.py +2638 -0
  665. vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +119 -0
  666. vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +117 -0
  667. vllm/model_executor/layers/fused_moe/fused_moe_router.py +40 -0
  668. vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +531 -0
  669. vllm/model_executor/layers/fused_moe/layer.py +2169 -0
  670. vllm/model_executor/layers/fused_moe/modular_kernel.py +1251 -0
  671. vllm/model_executor/layers/fused_moe/moe_align_block_size.py +192 -0
  672. vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +229 -0
  673. vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +60 -0
  674. vllm/model_executor/layers/fused_moe/oracle/__init__.py +2 -0
  675. vllm/model_executor/layers/fused_moe/oracle/fp8.py +358 -0
  676. vllm/model_executor/layers/fused_moe/oracle/nvfp4.py +280 -0
  677. vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +362 -0
  678. vllm/model_executor/layers/fused_moe/prepare_finalize.py +87 -0
  679. vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +347 -0
  680. vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +324 -0
  681. vllm/model_executor/layers/fused_moe/routing_simulator.py +310 -0
  682. vllm/model_executor/layers/fused_moe/shared_fused_moe.py +96 -0
  683. vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py +171 -0
  684. vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py +78 -0
  685. vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +75 -0
  686. vllm/model_executor/layers/fused_moe/trtllm_moe.py +144 -0
  687. vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +403 -0
  688. vllm/model_executor/layers/fused_moe/utils.py +382 -0
  689. vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py +189 -0
  690. vllm/model_executor/layers/kda.py +442 -0
  691. vllm/model_executor/layers/layernorm.py +451 -0
  692. vllm/model_executor/layers/lightning_attn.py +735 -0
  693. vllm/model_executor/layers/linear.py +1478 -0
  694. vllm/model_executor/layers/logits_processor.py +109 -0
  695. vllm/model_executor/layers/mamba/__init__.py +0 -0
  696. vllm/model_executor/layers/mamba/abstract.py +68 -0
  697. vllm/model_executor/layers/mamba/linear_attn.py +410 -0
  698. vllm/model_executor/layers/mamba/mamba_mixer.py +541 -0
  699. vllm/model_executor/layers/mamba/mamba_mixer2.py +936 -0
  700. vllm/model_executor/layers/mamba/mamba_utils.py +225 -0
  701. vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
  702. vllm/model_executor/layers/mamba/ops/causal_conv1d.py +1240 -0
  703. vllm/model_executor/layers/mamba/ops/layernorm_gated.py +172 -0
  704. vllm/model_executor/layers/mamba/ops/mamba_ssm.py +586 -0
  705. vllm/model_executor/layers/mamba/ops/ssd_bmm.py +211 -0
  706. vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +456 -0
  707. vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +700 -0
  708. vllm/model_executor/layers/mamba/ops/ssd_combined.py +230 -0
  709. vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +157 -0
  710. vllm/model_executor/layers/mamba/short_conv.py +254 -0
  711. vllm/model_executor/layers/mla.py +179 -0
  712. vllm/model_executor/layers/pooler/__init__.py +5 -0
  713. vllm/model_executor/layers/pooler/abstract.py +39 -0
  714. vllm/model_executor/layers/pooler/activations.py +162 -0
  715. vllm/model_executor/layers/pooler/common.py +32 -0
  716. vllm/model_executor/layers/pooler/seqwise/__init__.py +45 -0
  717. vllm/model_executor/layers/pooler/seqwise/heads.py +151 -0
  718. vllm/model_executor/layers/pooler/seqwise/methods.py +93 -0
  719. vllm/model_executor/layers/pooler/seqwise/poolers.py +127 -0
  720. vllm/model_executor/layers/pooler/special.py +128 -0
  721. vllm/model_executor/layers/pooler/tokwise/__init__.py +39 -0
  722. vllm/model_executor/layers/pooler/tokwise/heads.py +133 -0
  723. vllm/model_executor/layers/pooler/tokwise/methods.py +122 -0
  724. vllm/model_executor/layers/pooler/tokwise/poolers.py +127 -0
  725. vllm/model_executor/layers/quantization/__init__.py +195 -0
  726. vllm/model_executor/layers/quantization/auto_round.py +454 -0
  727. vllm/model_executor/layers/quantization/awq.py +277 -0
  728. vllm/model_executor/layers/quantization/awq_marlin.py +795 -0
  729. vllm/model_executor/layers/quantization/awq_triton.py +337 -0
  730. vllm/model_executor/layers/quantization/base_config.py +170 -0
  731. vllm/model_executor/layers/quantization/bitblas.py +502 -0
  732. vllm/model_executor/layers/quantization/bitsandbytes.py +631 -0
  733. vllm/model_executor/layers/quantization/compressed_tensors/__init__.py +3 -0
  734. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +982 -0
  735. vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2368 -0
  736. vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +37 -0
  737. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +392 -0
  738. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +55 -0
  739. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +176 -0
  740. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_mxfp4.py +106 -0
  741. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +124 -0
  742. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +218 -0
  743. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +176 -0
  744. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py +153 -0
  745. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +138 -0
  746. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +203 -0
  747. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +125 -0
  748. vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +230 -0
  749. vllm/model_executor/layers/quantization/compressed_tensors/transform/__init__.py +0 -0
  750. vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py +260 -0
  751. vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +173 -0
  752. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py +0 -0
  753. vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py +64 -0
  754. vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py +13 -0
  755. vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +224 -0
  756. vllm/model_executor/layers/quantization/compressed_tensors/utils.py +216 -0
  757. vllm/model_executor/layers/quantization/cpu_wna16.py +299 -0
  758. vllm/model_executor/layers/quantization/deepspeedfp.py +218 -0
  759. vllm/model_executor/layers/quantization/experts_int8.py +209 -0
  760. vllm/model_executor/layers/quantization/fbgemm_fp8.py +195 -0
  761. vllm/model_executor/layers/quantization/fp8.py +1224 -0
  762. vllm/model_executor/layers/quantization/fp_quant.py +420 -0
  763. vllm/model_executor/layers/quantization/gguf.py +682 -0
  764. vllm/model_executor/layers/quantization/gptq.py +393 -0
  765. vllm/model_executor/layers/quantization/gptq_bitblas.py +482 -0
  766. vllm/model_executor/layers/quantization/gptq_marlin.py +934 -0
  767. vllm/model_executor/layers/quantization/gptq_marlin_24.py +320 -0
  768. vllm/model_executor/layers/quantization/hqq_marlin.py +372 -0
  769. vllm/model_executor/layers/quantization/inc.py +65 -0
  770. vllm/model_executor/layers/quantization/input_quant_fp8.py +212 -0
  771. vllm/model_executor/layers/quantization/ipex_quant.py +403 -0
  772. vllm/model_executor/layers/quantization/kernels/__init__.py +0 -0
  773. vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +94 -0
  774. vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +113 -0
  775. vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +115 -0
  776. vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +323 -0
  777. vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py +98 -0
  778. vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py +126 -0
  779. vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +130 -0
  780. vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py +111 -0
  781. vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +168 -0
  782. vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +159 -0
  783. vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +200 -0
  784. vllm/model_executor/layers/quantization/kernels/mixed_precision/xpu.py +97 -0
  785. vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +76 -0
  786. vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +77 -0
  787. vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +128 -0
  788. vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +220 -0
  789. vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +147 -0
  790. vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +88 -0
  791. vllm/model_executor/layers/quantization/kv_cache.py +153 -0
  792. vllm/model_executor/layers/quantization/modelopt.py +1665 -0
  793. vllm/model_executor/layers/quantization/moe_wna16.py +518 -0
  794. vllm/model_executor/layers/quantization/mxfp4.py +1145 -0
  795. vllm/model_executor/layers/quantization/petit.py +319 -0
  796. vllm/model_executor/layers/quantization/ptpc_fp8.py +140 -0
  797. vllm/model_executor/layers/quantization/quark/__init__.py +0 -0
  798. vllm/model_executor/layers/quantization/quark/quark.py +570 -0
  799. vllm/model_executor/layers/quantization/quark/quark_moe.py +797 -0
  800. vllm/model_executor/layers/quantization/quark/schemes/__init__.py +9 -0
  801. vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py +343 -0
  802. vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  803. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +179 -0
  804. vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +139 -0
  805. vllm/model_executor/layers/quantization/quark/utils.py +105 -0
  806. vllm/model_executor/layers/quantization/qutlass_utils.py +185 -0
  807. vllm/model_executor/layers/quantization/rtn.py +626 -0
  808. vllm/model_executor/layers/quantization/schema.py +90 -0
  809. vllm/model_executor/layers/quantization/torchao.py +380 -0
  810. vllm/model_executor/layers/quantization/utils/__init__.py +6 -0
  811. vllm/model_executor/layers/quantization/utils/allspark_utils.py +67 -0
  812. vllm/model_executor/layers/quantization/utils/bitblas_utils.py +229 -0
  813. vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  814. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  815. vllm/model_executor/layers/quantization/utils/configs/N=12288,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  816. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  817. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  818. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  819. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  820. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  821. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  822. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  823. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  824. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  825. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  826. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  827. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  828. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  829. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  830. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  831. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  832. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  833. vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  834. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  835. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  836. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  837. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  838. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  839. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  840. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  841. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  842. vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  843. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  844. vllm/model_executor/layers/quantization/utils/configs/N=2112,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  845. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  846. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  847. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  848. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  849. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  850. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  851. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  852. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  853. vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  854. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  855. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  856. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  857. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  858. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  859. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  860. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  861. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  862. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  863. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  864. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  865. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  866. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  867. vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  868. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  869. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  870. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  871. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  872. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  873. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  874. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  875. vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  876. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  877. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  878. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  879. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  880. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  881. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  882. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  883. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  884. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  885. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  886. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  887. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  888. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  889. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  890. vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  891. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  892. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  893. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  894. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  895. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  896. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  897. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  898. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  899. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  900. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  901. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  902. vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  903. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  904. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  905. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  906. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  907. vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  908. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  909. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  910. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  911. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  912. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  913. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  914. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  915. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  916. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  917. vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  918. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  919. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  920. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  921. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  922. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  923. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  924. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  925. vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  926. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  927. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  928. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  929. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  930. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  931. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  932. vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  933. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  934. vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  935. vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  936. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  937. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  938. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  939. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  940. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  941. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  942. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  943. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  944. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  945. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  946. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +18 -0
  947. vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  948. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  949. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  950. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  951. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  952. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  953. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  954. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  955. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  956. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  957. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  958. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  959. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  960. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  961. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  962. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  963. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  964. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  965. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  966. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  967. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  968. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  969. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  970. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  971. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  972. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  973. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  974. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  975. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  976. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  977. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  978. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  979. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  980. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  981. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  982. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  983. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  984. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  985. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  986. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  987. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  988. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  989. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  990. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  991. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  992. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  993. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  994. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  995. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  996. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  997. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  998. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  999. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1000. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1001. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1002. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1003. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  1004. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1005. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  1006. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1007. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1008. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1009. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1010. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1011. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  1012. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1013. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  1014. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1015. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1016. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1017. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1018. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json +146 -0
  1019. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
  1020. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_L20,dtype=fp8_w8a8,block_shape=[128,128].json +26 -0
  1021. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1022. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1023. vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1024. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1025. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1026. vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI325_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +164 -0
  1027. vllm/model_executor/layers/quantization/utils/configs/README.md +3 -0
  1028. vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +514 -0
  1029. vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +370 -0
  1030. vllm/model_executor/layers/quantization/utils/fp8_utils.py +1658 -0
  1031. vllm/model_executor/layers/quantization/utils/gptq_utils.py +158 -0
  1032. vllm/model_executor/layers/quantization/utils/int8_utils.py +477 -0
  1033. vllm/model_executor/layers/quantization/utils/layer_utils.py +41 -0
  1034. vllm/model_executor/layers/quantization/utils/machete_utils.py +56 -0
  1035. vllm/model_executor/layers/quantization/utils/marlin_utils.py +720 -0
  1036. vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +565 -0
  1037. vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +378 -0
  1038. vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +219 -0
  1039. vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +467 -0
  1040. vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +189 -0
  1041. vllm/model_executor/layers/quantization/utils/mxfp6_utils.py +142 -0
  1042. vllm/model_executor/layers/quantization/utils/mxfp8_utils.py +24 -0
  1043. vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +142 -0
  1044. vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +67 -0
  1045. vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py +51 -0
  1046. vllm/model_executor/layers/quantization/utils/petit_utils.py +124 -0
  1047. vllm/model_executor/layers/quantization/utils/quant_utils.py +767 -0
  1048. vllm/model_executor/layers/quantization/utils/w8a8_utils.py +519 -0
  1049. vllm/model_executor/layers/resampler.py +283 -0
  1050. vllm/model_executor/layers/rotary_embedding/__init__.py +291 -0
  1051. vllm/model_executor/layers/rotary_embedding/base.py +282 -0
  1052. vllm/model_executor/layers/rotary_embedding/common.py +289 -0
  1053. vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +184 -0
  1054. vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +218 -0
  1055. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_alpha_rope.py +43 -0
  1056. vllm/model_executor/layers/rotary_embedding/dynamic_ntk_scaling_rope.py +68 -0
  1057. vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py +82 -0
  1058. vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py +115 -0
  1059. vllm/model_executor/layers/rotary_embedding/llama3_rope.py +54 -0
  1060. vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +83 -0
  1061. vllm/model_executor/layers/rotary_embedding/mrope.py +412 -0
  1062. vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py +47 -0
  1063. vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py +159 -0
  1064. vllm/model_executor/layers/rotary_embedding/xdrope.py +160 -0
  1065. vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +84 -0
  1066. vllm/model_executor/layers/utils.py +251 -0
  1067. vllm/model_executor/layers/vocab_parallel_embedding.py +564 -0
  1068. vllm/model_executor/model_loader/__init__.py +150 -0
  1069. vllm/model_executor/model_loader/base_loader.py +71 -0
  1070. vllm/model_executor/model_loader/bitsandbytes_loader.py +821 -0
  1071. vllm/model_executor/model_loader/default_loader.py +304 -0
  1072. vllm/model_executor/model_loader/dummy_loader.py +28 -0
  1073. vllm/model_executor/model_loader/gguf_loader.py +371 -0
  1074. vllm/model_executor/model_loader/online_quantization.py +275 -0
  1075. vllm/model_executor/model_loader/runai_streamer_loader.py +115 -0
  1076. vllm/model_executor/model_loader/sharded_state_loader.py +214 -0
  1077. vllm/model_executor/model_loader/tensorizer.py +793 -0
  1078. vllm/model_executor/model_loader/tensorizer_loader.py +151 -0
  1079. vllm/model_executor/model_loader/utils.py +299 -0
  1080. vllm/model_executor/model_loader/weight_utils.py +1183 -0
  1081. vllm/model_executor/models/__init__.py +44 -0
  1082. vllm/model_executor/models/adapters.py +592 -0
  1083. vllm/model_executor/models/afmoe.py +697 -0
  1084. vllm/model_executor/models/aimv2.py +248 -0
  1085. vllm/model_executor/models/apertus.py +567 -0
  1086. vllm/model_executor/models/arcee.py +428 -0
  1087. vllm/model_executor/models/arctic.py +633 -0
  1088. vllm/model_executor/models/aria.py +663 -0
  1089. vllm/model_executor/models/audioflamingo3.py +639 -0
  1090. vllm/model_executor/models/aya_vision.py +448 -0
  1091. vllm/model_executor/models/bagel.py +591 -0
  1092. vllm/model_executor/models/baichuan.py +493 -0
  1093. vllm/model_executor/models/bailing_moe.py +643 -0
  1094. vllm/model_executor/models/bamba.py +511 -0
  1095. vllm/model_executor/models/bee.py +157 -0
  1096. vllm/model_executor/models/bert.py +911 -0
  1097. vllm/model_executor/models/bert_with_rope.py +729 -0
  1098. vllm/model_executor/models/blip.py +350 -0
  1099. vllm/model_executor/models/blip2.py +736 -0
  1100. vllm/model_executor/models/bloom.py +390 -0
  1101. vllm/model_executor/models/chameleon.py +1095 -0
  1102. vllm/model_executor/models/chatglm.py +502 -0
  1103. vllm/model_executor/models/clip.py +1045 -0
  1104. vllm/model_executor/models/cohere2_vision.py +470 -0
  1105. vllm/model_executor/models/commandr.py +469 -0
  1106. vllm/model_executor/models/config.py +571 -0
  1107. vllm/model_executor/models/dbrx.py +484 -0
  1108. vllm/model_executor/models/deepencoder.py +679 -0
  1109. vllm/model_executor/models/deepseek_eagle.py +253 -0
  1110. vllm/model_executor/models/deepseek_mtp.py +447 -0
  1111. vllm/model_executor/models/deepseek_ocr.py +601 -0
  1112. vllm/model_executor/models/deepseek_v2.py +1727 -0
  1113. vllm/model_executor/models/deepseek_vl2.py +642 -0
  1114. vllm/model_executor/models/dots1.py +566 -0
  1115. vllm/model_executor/models/dots_ocr.py +830 -0
  1116. vllm/model_executor/models/ernie45.py +53 -0
  1117. vllm/model_executor/models/ernie45_moe.py +755 -0
  1118. vllm/model_executor/models/ernie45_vl.py +1702 -0
  1119. vllm/model_executor/models/ernie45_vl_moe.py +801 -0
  1120. vllm/model_executor/models/ernie_mtp.py +278 -0
  1121. vllm/model_executor/models/exaone.py +524 -0
  1122. vllm/model_executor/models/exaone4.py +518 -0
  1123. vllm/model_executor/models/exaone_moe.py +579 -0
  1124. vllm/model_executor/models/exaone_moe_mtp.py +255 -0
  1125. vllm/model_executor/models/fairseq2_llama.py +154 -0
  1126. vllm/model_executor/models/falcon.py +543 -0
  1127. vllm/model_executor/models/falcon_h1.py +675 -0
  1128. vllm/model_executor/models/flex_olmo.py +155 -0
  1129. vllm/model_executor/models/fuyu.py +371 -0
  1130. vllm/model_executor/models/gemma.py +425 -0
  1131. vllm/model_executor/models/gemma2.py +435 -0
  1132. vllm/model_executor/models/gemma3.py +520 -0
  1133. vllm/model_executor/models/gemma3_mm.py +664 -0
  1134. vllm/model_executor/models/gemma3n.py +1166 -0
  1135. vllm/model_executor/models/gemma3n_audio_utils.py +57 -0
  1136. vllm/model_executor/models/gemma3n_mm.py +820 -0
  1137. vllm/model_executor/models/glm.py +24 -0
  1138. vllm/model_executor/models/glm4.py +295 -0
  1139. vllm/model_executor/models/glm4_1v.py +1823 -0
  1140. vllm/model_executor/models/glm4_moe.py +725 -0
  1141. vllm/model_executor/models/glm4_moe_mtp.py +365 -0
  1142. vllm/model_executor/models/glm4v.py +783 -0
  1143. vllm/model_executor/models/glmasr.py +1154 -0
  1144. vllm/model_executor/models/glmasr_utils.py +188 -0
  1145. vllm/model_executor/models/gpt2.py +385 -0
  1146. vllm/model_executor/models/gpt_bigcode.py +339 -0
  1147. vllm/model_executor/models/gpt_j.py +346 -0
  1148. vllm/model_executor/models/gpt_neox.py +340 -0
  1149. vllm/model_executor/models/gpt_oss.py +745 -0
  1150. vllm/model_executor/models/granite.py +475 -0
  1151. vllm/model_executor/models/granite_speech.py +919 -0
  1152. vllm/model_executor/models/granitemoe.py +561 -0
  1153. vllm/model_executor/models/granitemoehybrid.py +703 -0
  1154. vllm/model_executor/models/granitemoeshared.py +328 -0
  1155. vllm/model_executor/models/gritlm.py +242 -0
  1156. vllm/model_executor/models/grok1.py +803 -0
  1157. vllm/model_executor/models/h2ovl.py +554 -0
  1158. vllm/model_executor/models/hunyuan_v1.py +1042 -0
  1159. vllm/model_executor/models/hunyuan_vision.py +1034 -0
  1160. vllm/model_executor/models/hyperclovax_vision.py +1163 -0
  1161. vllm/model_executor/models/idefics2_vision_model.py +427 -0
  1162. vllm/model_executor/models/idefics3.py +734 -0
  1163. vllm/model_executor/models/interfaces.py +1180 -0
  1164. vllm/model_executor/models/interfaces_base.py +252 -0
  1165. vllm/model_executor/models/intern_vit.py +454 -0
  1166. vllm/model_executor/models/internlm2.py +451 -0
  1167. vllm/model_executor/models/internlm2_ve.py +139 -0
  1168. vllm/model_executor/models/interns1.py +828 -0
  1169. vllm/model_executor/models/interns1_vit.py +433 -0
  1170. vllm/model_executor/models/internvl.py +1436 -0
  1171. vllm/model_executor/models/iquest_loopcoder.py +595 -0
  1172. vllm/model_executor/models/isaac.py +1503 -0
  1173. vllm/model_executor/models/jais.py +397 -0
  1174. vllm/model_executor/models/jais2.py +508 -0
  1175. vllm/model_executor/models/jamba.py +599 -0
  1176. vllm/model_executor/models/jina_vl.py +145 -0
  1177. vllm/model_executor/models/kanana_v.py +756 -0
  1178. vllm/model_executor/models/keye.py +1709 -0
  1179. vllm/model_executor/models/keye_vl1_5.py +726 -0
  1180. vllm/model_executor/models/kimi_linear.py +659 -0
  1181. vllm/model_executor/models/kimi_vl.py +577 -0
  1182. vllm/model_executor/models/lfm2.py +515 -0
  1183. vllm/model_executor/models/lfm2_moe.py +746 -0
  1184. vllm/model_executor/models/lfm2_vl.py +732 -0
  1185. vllm/model_executor/models/lightonocr.py +197 -0
  1186. vllm/model_executor/models/llama.py +724 -0
  1187. vllm/model_executor/models/llama4.py +860 -0
  1188. vllm/model_executor/models/llama4_eagle.py +225 -0
  1189. vllm/model_executor/models/llama_eagle.py +213 -0
  1190. vllm/model_executor/models/llama_eagle3.py +375 -0
  1191. vllm/model_executor/models/llava.py +879 -0
  1192. vllm/model_executor/models/llava_next.py +583 -0
  1193. vllm/model_executor/models/llava_next_video.py +467 -0
  1194. vllm/model_executor/models/llava_onevision.py +922 -0
  1195. vllm/model_executor/models/longcat_flash.py +767 -0
  1196. vllm/model_executor/models/longcat_flash_mtp.py +348 -0
  1197. vllm/model_executor/models/mamba.py +276 -0
  1198. vllm/model_executor/models/mamba2.py +288 -0
  1199. vllm/model_executor/models/medusa.py +179 -0
  1200. vllm/model_executor/models/midashenglm.py +826 -0
  1201. vllm/model_executor/models/mimo.py +188 -0
  1202. vllm/model_executor/models/mimo_mtp.py +294 -0
  1203. vllm/model_executor/models/mimo_v2_flash.py +718 -0
  1204. vllm/model_executor/models/minicpm.py +660 -0
  1205. vllm/model_executor/models/minicpm3.py +233 -0
  1206. vllm/model_executor/models/minicpm_eagle.py +386 -0
  1207. vllm/model_executor/models/minicpmo.py +768 -0
  1208. vllm/model_executor/models/minicpmv.py +1742 -0
  1209. vllm/model_executor/models/minimax_m2.py +552 -0
  1210. vllm/model_executor/models/minimax_text_01.py +1008 -0
  1211. vllm/model_executor/models/minimax_vl_01.py +395 -0
  1212. vllm/model_executor/models/mistral3.py +638 -0
  1213. vllm/model_executor/models/mistral_large_3.py +63 -0
  1214. vllm/model_executor/models/mistral_large_3_eagle.py +137 -0
  1215. vllm/model_executor/models/mixtral.py +599 -0
  1216. vllm/model_executor/models/mllama4.py +1170 -0
  1217. vllm/model_executor/models/mlp_speculator.py +235 -0
  1218. vllm/model_executor/models/modernbert.py +458 -0
  1219. vllm/model_executor/models/module_mapping.py +74 -0
  1220. vllm/model_executor/models/molmo.py +1592 -0
  1221. vllm/model_executor/models/moonvit.py +601 -0
  1222. vllm/model_executor/models/mpt.py +335 -0
  1223. vllm/model_executor/models/nano_nemotron_vl.py +1725 -0
  1224. vllm/model_executor/models/nemotron.py +499 -0
  1225. vllm/model_executor/models/nemotron_h.py +902 -0
  1226. vllm/model_executor/models/nemotron_nas.py +474 -0
  1227. vllm/model_executor/models/nemotron_parse.py +958 -0
  1228. vllm/model_executor/models/nemotron_vl.py +651 -0
  1229. vllm/model_executor/models/nvlm_d.py +216 -0
  1230. vllm/model_executor/models/olmo.py +412 -0
  1231. vllm/model_executor/models/olmo2.py +454 -0
  1232. vllm/model_executor/models/olmoe.py +498 -0
  1233. vllm/model_executor/models/opencua.py +262 -0
  1234. vllm/model_executor/models/openpangu.py +1378 -0
  1235. vllm/model_executor/models/openpangu_mtp.py +265 -0
  1236. vllm/model_executor/models/opt.py +426 -0
  1237. vllm/model_executor/models/orion.py +365 -0
  1238. vllm/model_executor/models/ouro.py +507 -0
  1239. vllm/model_executor/models/ovis.py +557 -0
  1240. vllm/model_executor/models/ovis2_5.py +661 -0
  1241. vllm/model_executor/models/paddleocr_vl.py +1261 -0
  1242. vllm/model_executor/models/paligemma.py +429 -0
  1243. vllm/model_executor/models/persimmon.py +373 -0
  1244. vllm/model_executor/models/phi.py +363 -0
  1245. vllm/model_executor/models/phi3.py +18 -0
  1246. vllm/model_executor/models/phi3v.py +729 -0
  1247. vllm/model_executor/models/phi4mm.py +1250 -0
  1248. vllm/model_executor/models/phi4mm_audio.py +1296 -0
  1249. vllm/model_executor/models/phi4mm_utils.py +1907 -0
  1250. vllm/model_executor/models/phimoe.py +671 -0
  1251. vllm/model_executor/models/pixtral.py +1437 -0
  1252. vllm/model_executor/models/plamo2.py +993 -0
  1253. vllm/model_executor/models/plamo3.py +437 -0
  1254. vllm/model_executor/models/qwen.py +377 -0
  1255. vllm/model_executor/models/qwen2.py +600 -0
  1256. vllm/model_executor/models/qwen2_5_omni_thinker.py +1200 -0
  1257. vllm/model_executor/models/qwen2_5_vl.py +1598 -0
  1258. vllm/model_executor/models/qwen2_audio.py +478 -0
  1259. vllm/model_executor/models/qwen2_moe.py +604 -0
  1260. vllm/model_executor/models/qwen2_rm.py +120 -0
  1261. vllm/model_executor/models/qwen2_vl.py +1588 -0
  1262. vllm/model_executor/models/qwen3.py +331 -0
  1263. vllm/model_executor/models/qwen3_moe.py +752 -0
  1264. vllm/model_executor/models/qwen3_next.py +1410 -0
  1265. vllm/model_executor/models/qwen3_next_mtp.py +293 -0
  1266. vllm/model_executor/models/qwen3_omni_moe_thinker.py +1814 -0
  1267. vllm/model_executor/models/qwen3_vl.py +2120 -0
  1268. vllm/model_executor/models/qwen3_vl_moe.py +474 -0
  1269. vllm/model_executor/models/qwen_vl.py +821 -0
  1270. vllm/model_executor/models/radio.py +573 -0
  1271. vllm/model_executor/models/registry.py +1218 -0
  1272. vllm/model_executor/models/roberta.py +239 -0
  1273. vllm/model_executor/models/rvl.py +107 -0
  1274. vllm/model_executor/models/seed_oss.py +492 -0
  1275. vllm/model_executor/models/siglip.py +1259 -0
  1276. vllm/model_executor/models/siglip2.py +495 -0
  1277. vllm/model_executor/models/siglip2navit.py +660 -0
  1278. vllm/model_executor/models/skyworkr1v.py +951 -0
  1279. vllm/model_executor/models/smolvlm.py +38 -0
  1280. vllm/model_executor/models/solar.py +484 -0
  1281. vllm/model_executor/models/stablelm.py +354 -0
  1282. vllm/model_executor/models/starcoder2.py +365 -0
  1283. vllm/model_executor/models/step3_text.py +554 -0
  1284. vllm/model_executor/models/step3_vl.py +1147 -0
  1285. vllm/model_executor/models/swin.py +500 -0
  1286. vllm/model_executor/models/tarsier.py +624 -0
  1287. vllm/model_executor/models/telechat2.py +153 -0
  1288. vllm/model_executor/models/teleflm.py +78 -0
  1289. vllm/model_executor/models/terratorch.py +318 -0
  1290. vllm/model_executor/models/transformers/__init__.py +127 -0
  1291. vllm/model_executor/models/transformers/base.py +523 -0
  1292. vllm/model_executor/models/transformers/causal.py +65 -0
  1293. vllm/model_executor/models/transformers/legacy.py +90 -0
  1294. vllm/model_executor/models/transformers/moe.py +329 -0
  1295. vllm/model_executor/models/transformers/multimodal.py +441 -0
  1296. vllm/model_executor/models/transformers/pooling.py +102 -0
  1297. vllm/model_executor/models/transformers/utils.py +253 -0
  1298. vllm/model_executor/models/ultravox.py +786 -0
  1299. vllm/model_executor/models/utils.py +832 -0
  1300. vllm/model_executor/models/vision.py +546 -0
  1301. vllm/model_executor/models/voxtral.py +867 -0
  1302. vllm/model_executor/models/voxtral_streaming.py +304 -0
  1303. vllm/model_executor/models/whisper.py +993 -0
  1304. vllm/model_executor/models/whisper_utils.py +299 -0
  1305. vllm/model_executor/models/zamba2.py +986 -0
  1306. vllm/model_executor/parameter.py +642 -0
  1307. vllm/model_executor/utils.py +113 -0
  1308. vllm/model_executor/warmup/__init__.py +0 -0
  1309. vllm/model_executor/warmup/deep_gemm_warmup.py +371 -0
  1310. vllm/model_executor/warmup/kernel_warmup.py +97 -0
  1311. vllm/model_inspection.py +136 -0
  1312. vllm/multimodal/__init__.py +38 -0
  1313. vllm/multimodal/audio.py +287 -0
  1314. vllm/multimodal/base.py +60 -0
  1315. vllm/multimodal/cache.py +829 -0
  1316. vllm/multimodal/evs.py +294 -0
  1317. vllm/multimodal/hasher.py +123 -0
  1318. vllm/multimodal/image.py +155 -0
  1319. vllm/multimodal/inputs.py +1027 -0
  1320. vllm/multimodal/parse.py +674 -0
  1321. vllm/multimodal/processing.py +2469 -0
  1322. vllm/multimodal/profiling.py +351 -0
  1323. vllm/multimodal/registry.py +375 -0
  1324. vllm/multimodal/utils.py +550 -0
  1325. vllm/multimodal/video.py +512 -0
  1326. vllm/outputs.py +347 -0
  1327. vllm/platforms/__init__.py +277 -0
  1328. vllm/platforms/cpu.py +423 -0
  1329. vllm/platforms/cuda.py +618 -0
  1330. vllm/platforms/interface.py +707 -0
  1331. vllm/platforms/rocm.py +586 -0
  1332. vllm/platforms/tpu.py +20 -0
  1333. vllm/platforms/xpu.py +262 -0
  1334. vllm/plugins/__init__.py +81 -0
  1335. vllm/plugins/io_processors/__init__.py +68 -0
  1336. vllm/plugins/io_processors/interface.py +77 -0
  1337. vllm/plugins/lora_resolvers/__init__.py +0 -0
  1338. vllm/plugins/lora_resolvers/filesystem_resolver.py +52 -0
  1339. vllm/pooling_params.py +229 -0
  1340. vllm/profiler/__init__.py +0 -0
  1341. vllm/profiler/layerwise_profile.py +392 -0
  1342. vllm/profiler/utils.py +151 -0
  1343. vllm/profiler/wrapper.py +241 -0
  1344. vllm/py.typed +2 -0
  1345. vllm/ray/__init__.py +0 -0
  1346. vllm/ray/lazy_utils.py +30 -0
  1347. vllm/ray/ray_env.py +79 -0
  1348. vllm/reasoning/__init__.py +96 -0
  1349. vllm/reasoning/abs_reasoning_parsers.py +318 -0
  1350. vllm/reasoning/basic_parsers.py +175 -0
  1351. vllm/reasoning/deepseek_r1_reasoning_parser.py +67 -0
  1352. vllm/reasoning/deepseek_v3_reasoning_parser.py +69 -0
  1353. vllm/reasoning/ernie45_reasoning_parser.py +165 -0
  1354. vllm/reasoning/glm4_moe_reasoning_parser.py +13 -0
  1355. vllm/reasoning/gptoss_reasoning_parser.py +173 -0
  1356. vllm/reasoning/granite_reasoning_parser.py +363 -0
  1357. vllm/reasoning/holo2_reasoning_parser.py +89 -0
  1358. vllm/reasoning/hunyuan_a13b_reasoning_parser.py +237 -0
  1359. vllm/reasoning/identity_reasoning_parser.py +63 -0
  1360. vllm/reasoning/minimax_m2_reasoning_parser.py +110 -0
  1361. vllm/reasoning/mistral_reasoning_parser.py +154 -0
  1362. vllm/reasoning/olmo3_reasoning_parser.py +302 -0
  1363. vllm/reasoning/qwen3_reasoning_parser.py +67 -0
  1364. vllm/reasoning/seedoss_reasoning_parser.py +27 -0
  1365. vllm/reasoning/step3_reasoning_parser.py +113 -0
  1366. vllm/sampling_params.py +629 -0
  1367. vllm/scalar_type.py +355 -0
  1368. vllm/scripts.py +17 -0
  1369. vllm/sequence.py +64 -0
  1370. vllm/tasks.py +13 -0
  1371. vllm/third_party/__init__.py +0 -0
  1372. vllm/third_party/pynvml.py +6140 -0
  1373. vllm/tokenizers/__init__.py +18 -0
  1374. vllm/tokenizers/deepseek_v32.py +187 -0
  1375. vllm/tokenizers/deepseek_v32_encoding.py +463 -0
  1376. vllm/tokenizers/detokenizer_utils.py +198 -0
  1377. vllm/tokenizers/grok2.py +443 -0
  1378. vllm/tokenizers/hf.py +119 -0
  1379. vllm/tokenizers/mistral.py +543 -0
  1380. vllm/tokenizers/protocol.py +123 -0
  1381. vllm/tokenizers/registry.py +238 -0
  1382. vllm/tool_parsers/__init__.py +158 -0
  1383. vllm/tool_parsers/abstract_tool_parser.py +274 -0
  1384. vllm/tool_parsers/deepseekv31_tool_parser.py +388 -0
  1385. vllm/tool_parsers/deepseekv32_tool_parser.py +591 -0
  1386. vllm/tool_parsers/deepseekv3_tool_parser.py +390 -0
  1387. vllm/tool_parsers/ernie45_tool_parser.py +210 -0
  1388. vllm/tool_parsers/functiongemma_tool_parser.py +321 -0
  1389. vllm/tool_parsers/gigachat3_tool_parser.py +190 -0
  1390. vllm/tool_parsers/glm47_moe_tool_parser.py +23 -0
  1391. vllm/tool_parsers/glm4_moe_tool_parser.py +215 -0
  1392. vllm/tool_parsers/granite_20b_fc_tool_parser.py +273 -0
  1393. vllm/tool_parsers/granite_tool_parser.py +253 -0
  1394. vllm/tool_parsers/hermes_tool_parser.py +495 -0
  1395. vllm/tool_parsers/hunyuan_a13b_tool_parser.py +420 -0
  1396. vllm/tool_parsers/internlm2_tool_parser.py +227 -0
  1397. vllm/tool_parsers/jamba_tool_parser.py +323 -0
  1398. vllm/tool_parsers/kimi_k2_tool_parser.py +598 -0
  1399. vllm/tool_parsers/llama4_pythonic_tool_parser.py +341 -0
  1400. vllm/tool_parsers/llama_tool_parser.py +324 -0
  1401. vllm/tool_parsers/longcat_tool_parser.py +37 -0
  1402. vllm/tool_parsers/minimax_m2_tool_parser.py +776 -0
  1403. vllm/tool_parsers/minimax_tool_parser.py +849 -0
  1404. vllm/tool_parsers/mistral_tool_parser.py +612 -0
  1405. vllm/tool_parsers/olmo3_tool_parser.py +366 -0
  1406. vllm/tool_parsers/openai_tool_parser.py +111 -0
  1407. vllm/tool_parsers/phi4mini_tool_parser.py +120 -0
  1408. vllm/tool_parsers/pythonic_tool_parser.py +332 -0
  1409. vllm/tool_parsers/qwen3coder_tool_parser.py +781 -0
  1410. vllm/tool_parsers/qwen3xml_tool_parser.py +1316 -0
  1411. vllm/tool_parsers/seed_oss_tool_parser.py +744 -0
  1412. vllm/tool_parsers/step3_tool_parser.py +303 -0
  1413. vllm/tool_parsers/utils.py +229 -0
  1414. vllm/tool_parsers/xlam_tool_parser.py +556 -0
  1415. vllm/tracing.py +135 -0
  1416. vllm/transformers_utils/__init__.py +26 -0
  1417. vllm/transformers_utils/chat_templates/__init__.py +5 -0
  1418. vllm/transformers_utils/chat_templates/registry.py +73 -0
  1419. vllm/transformers_utils/chat_templates/template_basic.jinja +3 -0
  1420. vllm/transformers_utils/chat_templates/template_blip2.jinja +11 -0
  1421. vllm/transformers_utils/chat_templates/template_chatml.jinja +10 -0
  1422. vllm/transformers_utils/chat_templates/template_deepseek_ocr.jinja +14 -0
  1423. vllm/transformers_utils/chat_templates/template_deepseek_vl2.jinja +23 -0
  1424. vllm/transformers_utils/chat_templates/template_fuyu.jinja +3 -0
  1425. vllm/transformers_utils/chat_templates/template_minicpmv45.jinja +93 -0
  1426. vllm/transformers_utils/config.py +1169 -0
  1427. vllm/transformers_utils/config_parser_base.py +20 -0
  1428. vllm/transformers_utils/configs/__init__.py +106 -0
  1429. vllm/transformers_utils/configs/afmoe.py +87 -0
  1430. vllm/transformers_utils/configs/arctic.py +216 -0
  1431. vllm/transformers_utils/configs/bagel.py +53 -0
  1432. vllm/transformers_utils/configs/chatglm.py +75 -0
  1433. vllm/transformers_utils/configs/deepseek_vl2.py +126 -0
  1434. vllm/transformers_utils/configs/dotsocr.py +71 -0
  1435. vllm/transformers_utils/configs/eagle.py +90 -0
  1436. vllm/transformers_utils/configs/falcon.py +89 -0
  1437. vllm/transformers_utils/configs/flex_olmo.py +82 -0
  1438. vllm/transformers_utils/configs/hunyuan_vl.py +322 -0
  1439. vllm/transformers_utils/configs/isaac.py +100 -0
  1440. vllm/transformers_utils/configs/jais.py +243 -0
  1441. vllm/transformers_utils/configs/kimi_linear.py +148 -0
  1442. vllm/transformers_utils/configs/kimi_vl.py +38 -0
  1443. vllm/transformers_utils/configs/lfm2_moe.py +163 -0
  1444. vllm/transformers_utils/configs/medusa.py +65 -0
  1445. vllm/transformers_utils/configs/midashenglm.py +103 -0
  1446. vllm/transformers_utils/configs/mistral.py +263 -0
  1447. vllm/transformers_utils/configs/mlp_speculator.py +69 -0
  1448. vllm/transformers_utils/configs/moonvit.py +33 -0
  1449. vllm/transformers_utils/configs/nemotron.py +220 -0
  1450. vllm/transformers_utils/configs/nemotron_h.py +284 -0
  1451. vllm/transformers_utils/configs/olmo3.py +83 -0
  1452. vllm/transformers_utils/configs/ovis.py +182 -0
  1453. vllm/transformers_utils/configs/qwen3_next.py +277 -0
  1454. vllm/transformers_utils/configs/radio.py +98 -0
  1455. vllm/transformers_utils/configs/speculators/__init__.py +2 -0
  1456. vllm/transformers_utils/configs/speculators/algos.py +38 -0
  1457. vllm/transformers_utils/configs/speculators/base.py +114 -0
  1458. vllm/transformers_utils/configs/step3_vl.py +178 -0
  1459. vllm/transformers_utils/configs/tarsier2.py +24 -0
  1460. vllm/transformers_utils/configs/ultravox.py +120 -0
  1461. vllm/transformers_utils/dynamic_module.py +70 -0
  1462. vllm/transformers_utils/gguf_utils.py +280 -0
  1463. vllm/transformers_utils/model_arch_config_convertor.py +402 -0
  1464. vllm/transformers_utils/processor.py +424 -0
  1465. vllm/transformers_utils/processors/__init__.py +25 -0
  1466. vllm/transformers_utils/processors/bagel.py +78 -0
  1467. vllm/transformers_utils/processors/deepseek_ocr.py +438 -0
  1468. vllm/transformers_utils/processors/deepseek_vl2.py +406 -0
  1469. vllm/transformers_utils/processors/hunyuan_vl.py +233 -0
  1470. vllm/transformers_utils/processors/hunyuan_vl_image.py +477 -0
  1471. vllm/transformers_utils/processors/ovis.py +453 -0
  1472. vllm/transformers_utils/processors/ovis2_5.py +468 -0
  1473. vllm/transformers_utils/repo_utils.py +287 -0
  1474. vllm/transformers_utils/runai_utils.py +102 -0
  1475. vllm/transformers_utils/s3_utils.py +95 -0
  1476. vllm/transformers_utils/tokenizer.py +19 -0
  1477. vllm/transformers_utils/utils.py +112 -0
  1478. vllm/triton_utils/__init__.py +20 -0
  1479. vllm/triton_utils/importing.py +103 -0
  1480. vllm/usage/__init__.py +0 -0
  1481. vllm/usage/usage_lib.py +278 -0
  1482. vllm/utils/__init__.py +36 -0
  1483. vllm/utils/argparse_utils.py +491 -0
  1484. vllm/utils/async_utils.py +310 -0
  1485. vllm/utils/cache.py +214 -0
  1486. vllm/utils/collection_utils.py +112 -0
  1487. vllm/utils/counter.py +45 -0
  1488. vllm/utils/deep_gemm.py +424 -0
  1489. vllm/utils/flashinfer.py +602 -0
  1490. vllm/utils/func_utils.py +236 -0
  1491. vllm/utils/gc_utils.py +151 -0
  1492. vllm/utils/hashing.py +117 -0
  1493. vllm/utils/import_utils.py +438 -0
  1494. vllm/utils/jsontree.py +158 -0
  1495. vllm/utils/math_utils.py +32 -0
  1496. vllm/utils/mem_constants.py +13 -0
  1497. vllm/utils/mem_utils.py +285 -0
  1498. vllm/utils/nccl.py +64 -0
  1499. vllm/utils/network_utils.py +331 -0
  1500. vllm/utils/nvtx_pytorch_hooks.py +286 -0
  1501. vllm/utils/platform_utils.py +59 -0
  1502. vllm/utils/profiling.py +56 -0
  1503. vllm/utils/registry.py +51 -0
  1504. vllm/utils/serial_utils.py +214 -0
  1505. vllm/utils/system_utils.py +296 -0
  1506. vllm/utils/tensor_schema.py +255 -0
  1507. vllm/utils/torch_utils.py +781 -0
  1508. vllm/v1/__init__.py +0 -0
  1509. vllm/v1/attention/__init__.py +0 -0
  1510. vllm/v1/attention/backend.py +736 -0
  1511. vllm/v1/attention/backends/__init__.py +0 -0
  1512. vllm/v1/attention/backends/cpu_attn.py +501 -0
  1513. vllm/v1/attention/backends/fa_utils.py +126 -0
  1514. vllm/v1/attention/backends/flash_attn.py +1092 -0
  1515. vllm/v1/attention/backends/flash_attn_diffkv.py +277 -0
  1516. vllm/v1/attention/backends/flashinfer.py +1713 -0
  1517. vllm/v1/attention/backends/flex_attention.py +1024 -0
  1518. vllm/v1/attention/backends/gdn_attn.py +382 -0
  1519. vllm/v1/attention/backends/linear_attn.py +77 -0
  1520. vllm/v1/attention/backends/mamba1_attn.py +28 -0
  1521. vllm/v1/attention/backends/mamba2_attn.py +256 -0
  1522. vllm/v1/attention/backends/mamba_attn.py +313 -0
  1523. vllm/v1/attention/backends/mla/__init__.py +0 -0
  1524. vllm/v1/attention/backends/mla/aiter_triton_mla.py +66 -0
  1525. vllm/v1/attention/backends/mla/common.py +2156 -0
  1526. vllm/v1/attention/backends/mla/cutlass_mla.py +278 -0
  1527. vllm/v1/attention/backends/mla/flashattn_mla.py +348 -0
  1528. vllm/v1/attention/backends/mla/flashinfer_mla.py +175 -0
  1529. vllm/v1/attention/backends/mla/flashmla.py +321 -0
  1530. vllm/v1/attention/backends/mla/flashmla_sparse.py +1021 -0
  1531. vllm/v1/attention/backends/mla/indexer.py +345 -0
  1532. vllm/v1/attention/backends/mla/rocm_aiter_mla.py +284 -0
  1533. vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py +321 -0
  1534. vllm/v1/attention/backends/mla/triton_mla.py +171 -0
  1535. vllm/v1/attention/backends/registry.py +258 -0
  1536. vllm/v1/attention/backends/rocm_aiter_fa.py +1000 -0
  1537. vllm/v1/attention/backends/rocm_aiter_unified_attn.py +206 -0
  1538. vllm/v1/attention/backends/rocm_attn.py +405 -0
  1539. vllm/v1/attention/backends/short_conv_attn.py +26 -0
  1540. vllm/v1/attention/backends/tree_attn.py +430 -0
  1541. vllm/v1/attention/backends/triton_attn.py +578 -0
  1542. vllm/v1/attention/backends/utils.py +978 -0
  1543. vllm/v1/attention/ops/__init__.py +0 -0
  1544. vllm/v1/attention/ops/chunked_prefill_paged_decode.py +459 -0
  1545. vllm/v1/attention/ops/common.py +469 -0
  1546. vllm/v1/attention/ops/flashmla.py +254 -0
  1547. vllm/v1/attention/ops/merge_attn_states.py +47 -0
  1548. vllm/v1/attention/ops/paged_attn.py +51 -0
  1549. vllm/v1/attention/ops/pallas_kv_cache_update.py +130 -0
  1550. vllm/v1/attention/ops/prefix_prefill.py +862 -0
  1551. vllm/v1/attention/ops/rocm_aiter_mla_sparse.py +210 -0
  1552. vllm/v1/attention/ops/triton_decode_attention.py +709 -0
  1553. vllm/v1/attention/ops/triton_merge_attn_states.py +116 -0
  1554. vllm/v1/attention/ops/triton_prefill_attention.py +272 -0
  1555. vllm/v1/attention/ops/triton_reshape_and_cache_flash.py +395 -0
  1556. vllm/v1/attention/ops/triton_unified_attention.py +1088 -0
  1557. vllm/v1/attention/ops/vit_attn_wrappers.py +185 -0
  1558. vllm/v1/attention/selector.py +145 -0
  1559. vllm/v1/core/__init__.py +0 -0
  1560. vllm/v1/core/block_pool.py +489 -0
  1561. vllm/v1/core/encoder_cache_manager.py +402 -0
  1562. vllm/v1/core/kv_cache_coordinator.py +560 -0
  1563. vllm/v1/core/kv_cache_manager.py +485 -0
  1564. vllm/v1/core/kv_cache_metrics.py +96 -0
  1565. vllm/v1/core/kv_cache_utils.py +1642 -0
  1566. vllm/v1/core/sched/__init__.py +0 -0
  1567. vllm/v1/core/sched/async_scheduler.py +66 -0
  1568. vllm/v1/core/sched/interface.py +205 -0
  1569. vllm/v1/core/sched/output.py +261 -0
  1570. vllm/v1/core/sched/request_queue.py +208 -0
  1571. vllm/v1/core/sched/scheduler.py +1936 -0
  1572. vllm/v1/core/sched/utils.py +64 -0
  1573. vllm/v1/core/single_type_kv_cache_manager.py +926 -0
  1574. vllm/v1/cudagraph_dispatcher.py +183 -0
  1575. vllm/v1/engine/__init__.py +224 -0
  1576. vllm/v1/engine/async_llm.py +874 -0
  1577. vllm/v1/engine/coordinator.py +396 -0
  1578. vllm/v1/engine/core.py +1614 -0
  1579. vllm/v1/engine/core_client.py +1422 -0
  1580. vllm/v1/engine/detokenizer.py +351 -0
  1581. vllm/v1/engine/exceptions.py +18 -0
  1582. vllm/v1/engine/input_processor.py +713 -0
  1583. vllm/v1/engine/llm_engine.py +415 -0
  1584. vllm/v1/engine/logprobs.py +245 -0
  1585. vllm/v1/engine/output_processor.py +715 -0
  1586. vllm/v1/engine/parallel_sampling.py +150 -0
  1587. vllm/v1/engine/utils.py +1086 -0
  1588. vllm/v1/executor/__init__.py +6 -0
  1589. vllm/v1/executor/abstract.py +352 -0
  1590. vllm/v1/executor/multiproc_executor.py +888 -0
  1591. vllm/v1/executor/ray_distributed_executor.py +8 -0
  1592. vllm/v1/executor/ray_executor.py +623 -0
  1593. vllm/v1/executor/ray_utils.py +468 -0
  1594. vllm/v1/executor/uniproc_executor.py +186 -0
  1595. vllm/v1/kv_cache_interface.py +485 -0
  1596. vllm/v1/kv_offload/__init__.py +0 -0
  1597. vllm/v1/kv_offload/abstract.py +161 -0
  1598. vllm/v1/kv_offload/arc_manager.py +237 -0
  1599. vllm/v1/kv_offload/backend.py +97 -0
  1600. vllm/v1/kv_offload/backends/__init__.py +0 -0
  1601. vllm/v1/kv_offload/backends/cpu.py +62 -0
  1602. vllm/v1/kv_offload/cpu.py +109 -0
  1603. vllm/v1/kv_offload/factory.py +58 -0
  1604. vllm/v1/kv_offload/lru_manager.py +139 -0
  1605. vllm/v1/kv_offload/mediums.py +39 -0
  1606. vllm/v1/kv_offload/spec.py +70 -0
  1607. vllm/v1/kv_offload/worker/__init__.py +0 -0
  1608. vllm/v1/kv_offload/worker/cpu_gpu.py +287 -0
  1609. vllm/v1/kv_offload/worker/worker.py +163 -0
  1610. vllm/v1/metrics/__init__.py +0 -0
  1611. vllm/v1/metrics/loggers.py +1320 -0
  1612. vllm/v1/metrics/perf.py +1244 -0
  1613. vllm/v1/metrics/prometheus.py +82 -0
  1614. vllm/v1/metrics/ray_wrappers.py +194 -0
  1615. vllm/v1/metrics/reader.py +257 -0
  1616. vllm/v1/metrics/stats.py +440 -0
  1617. vllm/v1/outputs.py +242 -0
  1618. vllm/v1/pool/__init__.py +0 -0
  1619. vllm/v1/pool/metadata.py +124 -0
  1620. vllm/v1/request.py +281 -0
  1621. vllm/v1/sample/__init__.py +0 -0
  1622. vllm/v1/sample/logits_processor/__init__.py +352 -0
  1623. vllm/v1/sample/logits_processor/builtin.py +278 -0
  1624. vllm/v1/sample/logits_processor/interface.py +106 -0
  1625. vllm/v1/sample/logits_processor/state.py +165 -0
  1626. vllm/v1/sample/metadata.py +44 -0
  1627. vllm/v1/sample/ops/__init__.py +0 -0
  1628. vllm/v1/sample/ops/bad_words.py +57 -0
  1629. vllm/v1/sample/ops/logprobs.py +25 -0
  1630. vllm/v1/sample/ops/penalties.py +57 -0
  1631. vllm/v1/sample/ops/topk_topp_sampler.py +388 -0
  1632. vllm/v1/sample/rejection_sampler.py +822 -0
  1633. vllm/v1/sample/sampler.py +319 -0
  1634. vllm/v1/sample/tpu/__init__.py +0 -0
  1635. vllm/v1/sample/tpu/metadata.py +120 -0
  1636. vllm/v1/sample/tpu/sampler.py +215 -0
  1637. vllm/v1/serial_utils.py +514 -0
  1638. vllm/v1/spec_decode/__init__.py +0 -0
  1639. vllm/v1/spec_decode/eagle.py +1346 -0
  1640. vllm/v1/spec_decode/medusa.py +73 -0
  1641. vllm/v1/spec_decode/metadata.py +66 -0
  1642. vllm/v1/spec_decode/metrics.py +225 -0
  1643. vllm/v1/spec_decode/ngram_proposer.py +281 -0
  1644. vllm/v1/spec_decode/suffix_decoding.py +95 -0
  1645. vllm/v1/spec_decode/utils.py +109 -0
  1646. vllm/v1/structured_output/__init__.py +337 -0
  1647. vllm/v1/structured_output/backend_guidance.py +291 -0
  1648. vllm/v1/structured_output/backend_lm_format_enforcer.py +177 -0
  1649. vllm/v1/structured_output/backend_outlines.py +324 -0
  1650. vllm/v1/structured_output/backend_types.py +136 -0
  1651. vllm/v1/structured_output/backend_xgrammar.py +378 -0
  1652. vllm/v1/structured_output/request.py +91 -0
  1653. vllm/v1/structured_output/utils.py +457 -0
  1654. vllm/v1/utils.py +466 -0
  1655. vllm/v1/worker/__init__.py +0 -0
  1656. vllm/v1/worker/block_table.py +343 -0
  1657. vllm/v1/worker/cp_utils.py +42 -0
  1658. vllm/v1/worker/cpu_model_runner.py +122 -0
  1659. vllm/v1/worker/cpu_worker.py +192 -0
  1660. vllm/v1/worker/dp_utils.py +240 -0
  1661. vllm/v1/worker/ec_connector_model_runner_mixin.py +85 -0
  1662. vllm/v1/worker/gpu/README.md +4 -0
  1663. vllm/v1/worker/gpu/__init__.py +0 -0
  1664. vllm/v1/worker/gpu/async_utils.py +98 -0
  1665. vllm/v1/worker/gpu/attn_utils.py +183 -0
  1666. vllm/v1/worker/gpu/block_table.py +222 -0
  1667. vllm/v1/worker/gpu/buffer_utils.py +224 -0
  1668. vllm/v1/worker/gpu/cudagraph_utils.py +264 -0
  1669. vllm/v1/worker/gpu/dp_utils.py +31 -0
  1670. vllm/v1/worker/gpu/input_batch.py +526 -0
  1671. vllm/v1/worker/gpu/metrics/__init__.py +0 -0
  1672. vllm/v1/worker/gpu/metrics/logits.py +42 -0
  1673. vllm/v1/worker/gpu/mm/__init__.py +0 -0
  1674. vllm/v1/worker/gpu/mm/mrope_utils.py +127 -0
  1675. vllm/v1/worker/gpu/model_runner.py +1005 -0
  1676. vllm/v1/worker/gpu/sample/__init__.py +0 -0
  1677. vllm/v1/worker/gpu/sample/gumbel.py +106 -0
  1678. vllm/v1/worker/gpu/sample/logit_bias.py +270 -0
  1679. vllm/v1/worker/gpu/sample/logprob.py +167 -0
  1680. vllm/v1/worker/gpu/sample/metadata.py +79 -0
  1681. vllm/v1/worker/gpu/sample/min_p.py +58 -0
  1682. vllm/v1/worker/gpu/sample/output.py +14 -0
  1683. vllm/v1/worker/gpu/sample/penalties.py +155 -0
  1684. vllm/v1/worker/gpu/sample/sampler.py +88 -0
  1685. vllm/v1/worker/gpu/spec_decode/__init__.py +18 -0
  1686. vllm/v1/worker/gpu/spec_decode/eagle.py +566 -0
  1687. vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py +115 -0
  1688. vllm/v1/worker/gpu/spec_decode/rejection_sample.py +71 -0
  1689. vllm/v1/worker/gpu/states.py +282 -0
  1690. vllm/v1/worker/gpu/structured_outputs.py +100 -0
  1691. vllm/v1/worker/gpu_input_batch.py +1030 -0
  1692. vllm/v1/worker/gpu_model_runner.py +5761 -0
  1693. vllm/v1/worker/gpu_ubatch_wrapper.py +475 -0
  1694. vllm/v1/worker/gpu_worker.py +968 -0
  1695. vllm/v1/worker/kv_connector_model_runner_mixin.py +300 -0
  1696. vllm/v1/worker/lora_model_runner_mixin.py +225 -0
  1697. vllm/v1/worker/tpu_input_batch.py +574 -0
  1698. vllm/v1/worker/tpu_worker.py +18 -0
  1699. vllm/v1/worker/ubatch_utils.py +112 -0
  1700. vllm/v1/worker/ubatching.py +242 -0
  1701. vllm/v1/worker/utils.py +400 -0
  1702. vllm/v1/worker/worker_base.py +372 -0
  1703. vllm/v1/worker/workspace.py +253 -0
  1704. vllm/v1/worker/xpu_model_runner.py +48 -0
  1705. vllm/v1/worker/xpu_worker.py +174 -0
  1706. vllm/version.py +39 -0
  1707. vllm/vllm_flash_attn/.gitkeep +0 -0
  1708. vllm_cpu_avx512bf16-0.14.0.dist-info/METADATA +348 -0
  1709. vllm_cpu_avx512bf16-0.14.0.dist-info/RECORD +1712 -0
  1710. vllm_cpu_avx512bf16-0.14.0.dist-info/WHEEL +5 -0
  1711. vllm_cpu_avx512bf16-0.14.0.dist-info/entry_points.txt +5 -0
  1712. vllm_cpu_avx512bf16-0.14.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,3277 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+ """
4
+ This module defines a framework for sampling benchmark requests from various
5
+ datasets. Each dataset subclass of BenchmarkDataset must implement sample
6
+ generation. Supported dataset types include:
7
+ - ShareGPT
8
+ - Random (synthetic)
9
+ - Sonnet
10
+ - BurstGPT
11
+ - HuggingFace
12
+ - VisionArena
13
+ """
14
+
15
+ import argparse
16
+ import ast
17
+ import base64
18
+ import io
19
+ import json
20
+ import logging
21
+ import math
22
+ import random
23
+ from abc import ABC, abstractmethod
24
+ from collections.abc import Callable, Iterator, Mapping
25
+ from contextlib import suppress
26
+ from copy import deepcopy
27
+ from dataclasses import dataclass
28
+ from functools import cache
29
+ from io import BytesIO
30
+ from tempfile import NamedTemporaryFile
31
+ from typing import Any, cast
32
+
33
+ import numpy as np
34
+ from PIL import Image
35
+ from typing_extensions import deprecated
36
+
37
+ from vllm.lora.request import LoRARequest
38
+ from vllm.lora.utils import get_adapter_absolute_path
39
+ from vllm.multimodal import MultiModalDataDict
40
+ from vllm.multimodal.image import convert_image_mode
41
+ from vllm.tokenizers import TokenizerLike
42
+ from vllm.utils.import_utils import PlaceholderModule
43
+
44
+ try:
45
+ from datasets import load_dataset
46
+ except ImportError:
47
+ datasets = PlaceholderModule("datasets")
48
+ load_dataset = datasets.placeholder_attr("load_dataset")
49
+
50
+ try:
51
+ import pandas as pd
52
+ except ImportError:
53
+ pd = PlaceholderModule("pandas")
54
+
55
+ try:
56
+ import librosa
57
+ except ImportError:
58
+ librosa = PlaceholderModule("librosa")
59
+
60
+ try:
61
+ from vllm.utils.argparse_utils import FlexibleArgumentParser
62
+ except ImportError:
63
+ from argparse import ArgumentParser as FlexibleArgumentParser
64
+
65
+ logger = logging.getLogger(__name__)
66
+
67
+ # -----------------------------------------------------------------------------
68
+ # Data Classes
69
+ # -----------------------------------------------------------------------------
70
+
71
+
72
+ @dataclass
73
+ class SampleRequest:
74
+ """
75
+ Represents a single inference request for benchmarking.
76
+ """
77
+
78
+ prompt: str | list[str]
79
+ prompt_len: int
80
+ expected_output_len: int
81
+ multi_modal_data: MultiModalDataDict | dict | list[dict] | None = None
82
+ lora_request: LoRARequest | None = None
83
+ request_id: str | None = None
84
+
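For illustration, a minimal sketch of constructing a SampleRequest by hand; the tokenizer and model name here are assumptions, not part of this module:

```python
# Hypothetical usage sketch; AutoTokenizer and "gpt2" are assumptions.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
prompt = "Summarize the following paragraph."
request = SampleRequest(
    prompt=prompt,
    prompt_len=len(tokenizer.encode(prompt)),
    expected_output_len=64,
    request_id="example-0",
)
```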
85
+
86
+ # -----------------------------------------------------------------------------
87
+ # Benchmark Dataset Base Class
88
+ # -----------------------------------------------------------------------------
89
+
90
+
91
+ class BenchmarkDataset(ABC):
92
+ DEFAULT_SEED = 0
93
+ IS_MULTIMODAL = False
94
+
95
+ def __init__(
96
+ self,
97
+ dataset_path: str | None = None,
98
+ random_seed: int = DEFAULT_SEED,
99
+ disable_shuffle: bool = False,
100
+ **kwargs,
101
+ ) -> None:
102
+ """
103
+ Initialize the BenchmarkDataset with an optional dataset path and random
104
+ seed.
105
+
106
+ Args:
107
+ dataset_path (Optional[str]): Path to the dataset. If None, it
108
+ indicates that a default or random dataset might be used.
109
+ random_seed (int): Seed value for reproducible shuffling or
110
+ sampling. Defaults to DEFAULT_SEED.
111
+ """
112
+ self.dataset_path = dataset_path
113
+ # Set the random seed, ensuring that a None value is replaced with the
114
+ # default seed.
115
+ self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
116
+ self.disable_shuffle = disable_shuffle
117
+ self.data = None
118
+
119
+ def apply_multimodal_chat_transformation(
120
+ self,
121
+ prompt: str,
122
+ mm_content: MultiModalDataDict | dict | list[dict] | None = None,
123
+ ) -> list[dict]:
124
+ """
125
+ Transform a prompt and optional multimodal content into a chat format.
126
+ This method is used for chat models that expect a specific conversation
127
+ format.
128
+ """
129
+ content = [{"text": prompt, "type": "text"}]
130
+ if mm_content is not None:
131
+ if isinstance(mm_content, list):
132
+ content.extend(cast(list[dict[str, Any]], mm_content))
133
+ elif isinstance(mm_content, dict):
134
+ content.append(mm_content)
135
+ else:
136
+ raise TypeError(
137
+ "Could not process multimodal content of type: "
138
+ + f"{type(mm_content)}"
139
+ )
140
+ return [{"role": "user", "content": content}]
141
+
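A rough sketch of what this transformation produces; the image URL is a hypothetical placeholder and RandomDataset stands in for any concrete subclass:

```python
# Sketch: a prompt plus one image content dict, rendered into chat format.
dataset = RandomDataset()
messages = dataset.apply_multimodal_chat_transformation(
    "Describe this image",
    {"type": "image_url", "image_url": {"url": "file:///tmp/cat.jpg"}},
)
# messages == [{"role": "user", "content": [
#     {"text": "Describe this image", "type": "text"},
#     {"type": "image_url", "image_url": {"url": "file:///tmp/cat.jpg"}},
# ]}]
```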
142
+ def load_data(self) -> None:
143
+ """
144
+ Load data from the dataset path into self.data.
145
+
146
+ This method must be overridden by subclasses since the method to load
147
+ data will vary depending on the dataset format and source.
148
+
149
+ Raises:
150
+ NotImplementedError: If a subclass does not implement this method.
151
+ """
152
+ # TODO (jenniferzhao): add support for downloading data
153
+ raise NotImplementedError("load_data must be implemented in subclasses.")
154
+
155
+ def get_random_lora_request(
156
+ self,
157
+ max_loras: int | None = None,
158
+ lora_path: str | None = None,
159
+ ) -> LoRARequest | None:
160
+ """
161
+ Optionally select a random LoRA request.
162
+
163
+ This method is used when LoRA parameters are provided. It randomly
164
+ selects a LoRA based on max_loras.
165
+
166
+ Args:
167
+ max_loras (Optional[int]): The maximum number of LoRAs available.
168
+ If `None`, LoRA is not used.
169
+ lora_path (Optional[str]): Path to the LoRA parameters on disk.
170
+ If `None`, LoRA is not used.
171
+
172
+ Returns:
173
+ A new [`LoRARequest`][vllm.lora.request.LoRARequest]
174
+ (or `None` if not applicable).
175
+ """
176
+ if max_loras is None or lora_path is None:
177
+ return None
178
+
179
+ # Generate a random LoRA ID in the range [1, max_loras].
180
+ lora_id = random.randint(1, max_loras)
181
+ lora_request = LoRARequest(
182
+ lora_name=str(lora_id),
183
+ lora_int_id=lora_id,
184
+ lora_path=lora_path_on_disk(lora_path),
185
+ )
186
+ return lora_request
187
+
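A sketch of how this is typically called during sampling; the adapter path is a placeholder, and RandomDataset stands in for any BenchmarkDataset subclass:

```python
# Sketch: with max_loras=4 the LoRA ID is drawn uniformly from [1, 4].
dataset = RandomDataset()
lora_req = dataset.get_random_lora_request(
    max_loras=4,
    lora_path="/path/to/adapter",  # hypothetical local adapter directory
)
if lora_req is not None:
    print(lora_req.lora_name, lora_req.lora_int_id)
```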
188
+ @abstractmethod
189
+ def sample(
190
+ self,
191
+ tokenizer: TokenizerLike,
192
+ num_requests: int,
193
+ request_id_prefix: str = "",
194
+ no_oversample: bool = False,
195
+ ) -> list[SampleRequest]:
196
+ """
197
+ Abstract method to generate sample requests from the dataset.
198
+
199
+ Subclasses must override this method to implement dataset-specific logic
200
+ for generating a list of SampleRequest objects.
201
+
202
+ Args:
203
+ tokenizer (TokenizerLike): The tokenizer to be used
204
+ for processing the dataset's text.
205
+ num_requests (int): The number of sample requests to generate.
206
+ request_id_prefix (str): The prefix applied to generated request
+ identifiers.
+ no_oversample (bool): If True, skip oversampling when fewer than
+ num_requests samples are available.
207
+
208
+ Returns:
209
+ list[SampleRequest]: A list of sample requests generated from the
210
+ dataset.
211
+ """
212
+ raise NotImplementedError("sample must be implemented in subclasses.")
213
+
214
+ def maybe_oversample_requests(
215
+ self,
216
+ requests: list[SampleRequest],
217
+ num_requests: int,
218
+ request_id_prefix: str = "",
219
+ no_oversample: bool = False,
220
+ ) -> None:
221
+ """
222
+ Oversamples the list of requests if its size is less than the desired
223
+ number.
224
+
225
+ Args:
226
+ requests (List[SampleRequest]): The current list of sampled
227
+ requests.
228
+ num_requests (int): The target number of requests.
229
+ request_id_prefix (str): The prefix applied to generated request
230
+ identifiers.
+ no_oversample (bool): If True, skip oversampling even when fewer
+ than num_requests samples were drawn.
231
+
232
+ """
233
+ if no_oversample:
234
+ logger.info("Skipping oversampling. Total samples: %d.", len(requests))
235
+ return
236
+
237
+ if len(requests) < num_requests:
238
+ random.seed(self.random_seed)
239
+ needed = num_requests - len(requests)
240
+ additional = []
241
+ for i in range(needed):
242
+ req = deepcopy(random.choice(requests))
243
+ req.request_id = request_id_prefix + str(len(requests) + i)
244
+ additional.append(req)
245
+ requests.extend(additional)
246
+ logger.info("Oversampled requests to reach %d total samples.", num_requests)
247
+
248
+ ids = [req.request_id for req in requests]
249
+ if len(ids) != len(set(ids)):
250
+ raise ValueError(
251
+ "Duplicate request_id found in the sampled "
252
+ "requests. Please ensure that each request_id "
253
+ "is unique."
254
+ )
255
+
256
+
257
+ # -----------------------------------------------------------------------------
258
+ # Utility Functions and Global Caches
259
+ # -----------------------------------------------------------------------------
260
+
261
+
262
+ def is_valid_sequence(
263
+ prompt_len: int,
264
+ output_len: int,
265
+ min_len: int = 4,
266
+ max_prompt_len: int = 1024,
267
+ max_total_len: int = 2048,
268
+ skip_min_output_len_check: bool = False,
269
+ ) -> bool:
270
+ """
271
+ Validate a sequence based on prompt and output lengths.
272
+
273
+ Default pruning criteria are copied from the original `sample_hf_requests`
274
+ and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
275
+ from `sample_requests` in benchmark_throughput.py.
276
+ """
277
+ # Check for invalid conditions
278
+ prompt_too_short = prompt_len < min_len
279
+ output_too_short = (not skip_min_output_len_check) and (output_len < min_len)
280
+ prompt_too_long = prompt_len > max_prompt_len
281
+ combined_too_long = (prompt_len + output_len) > max_total_len
282
+
283
+ # Return True if none of the invalid conditions are met
284
+ return not (
285
+ prompt_too_short or output_too_short or prompt_too_long or combined_too_long
286
+ )
287
+
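A quick sketch of the default pruning criteria in action:

```python
# Sketch: pruning (prompt_len, output_len) pairs with the defaults above.
pairs = [(2, 100), (512, 128), (2000, 10), (1000, 1500)]
kept = [p for p in pairs if is_valid_sequence(*p)]
# Only (512, 128) survives: the others are too short, too long, or exceed
# the combined 2048-token budget.
```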
288
+
289
+ @cache
290
+ def lora_path_on_disk(lora_path: str) -> str:
291
+ return get_adapter_absolute_path(lora_path)
292
+
293
+
294
+ # Global cache for LoRA tokenizers.
295
+ lora_tokenizer_cache: dict[int, TokenizerLike] = {}
296
+
297
+
298
+ def process_image(image: Any) -> Mapping[str, Any]:
299
+ """
300
+ Process a single image input and return a multimedia content dictionary.
301
+
302
+ Supports the following input types:
303
+
304
+ 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
305
+ containing raw image data. - Loads the bytes as a PIL.Image.Image.
306
+
307
+ 2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
308
+ a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
309
+ a dictionary with the image as a base64 data URL.
310
+
311
+ 3. String input: - Treats the string as a URL or local file path. -
312
+ Prepends "file://" if the string doesn't start with "http://" or
313
+ "file://". - Returns a dictionary with the image URL.
314
+
315
+ Raises:
316
+ ValueError: If the input is not a supported type.
317
+ """
318
+ if isinstance(image, dict) and "bytes" in image:
319
+ image = Image.open(BytesIO(image["bytes"]))
320
+ if isinstance(image, Image.Image):
321
+ image = convert_image_mode(image, "RGB")
322
+ with io.BytesIO() as image_data:
323
+ image.save(image_data, format="JPEG")
324
+ image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
325
+ return {
326
+ "type": "image_url",
327
+ "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
328
+ }
329
+
330
+ if isinstance(image, str):
331
+ image_url = (
332
+ image
333
+ if image.startswith(("http://", "https://", "file://"))
334
+ else f"file://{image}"
335
+ )
336
+ return {"type": "image_url", "image_url": {"url": image_url}}
337
+
338
+ raise ValueError(
339
+ f"Invalid image input {image}. Must be a PIL.Image.Image"
340
+ " or str or dictionary with raw image bytes."
341
+ )
342
+
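A sketch of the three supported input forms; the file path is hypothetical:

```python
# Sketch: PIL image, path string, and raw-bytes dict inputs.
from PIL import Image

content_from_pil = process_image(Image.new("RGB", (64, 64)))  # base64 data URL
content_from_path = process_image("/tmp/cat.jpg")             # file:// URL
with open("/tmp/cat.jpg", "rb") as f:                         # raw-bytes dict
    content_from_bytes = process_image({"bytes": f.read()})
```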
343
+
344
+ def process_video(video: Any) -> Mapping[str, Any]:
345
+ """
346
+ Process a single video input and return a multimedia content dictionary.
347
+
348
+ Supports the following input types:
349
+
350
+ 1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key
351
+ containing raw video data.
352
+
353
+ 2. String input: - Treats the string as a URL or local file path. -
354
+ Prepends "file://" if the string doesn't start with "http://" or
355
+ "file://". - Returns a dictionary with the image URL.
356
+
357
+ Raises:
358
+ ValueError: If the input is not a supported type.
359
+ """
360
+ if isinstance(video, dict) and "bytes" in video:
361
+ video_bytes = video["bytes"]
362
+ video_base64 = base64.b64encode(video_bytes).decode("utf-8")
363
+ return {
364
+ "type": "video_url",
365
+ "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
366
+ }
367
+
368
+ if isinstance(video, str):
369
+ video_url = (
370
+ video
371
+ if video.startswith(("http://", "https://", "file://"))
372
+ else f"file://{video}"
373
+ )
374
+ return {"type": "video_url", "video_url": {"url": video_url}}
375
+
376
+ raise ValueError(
377
+ f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`." # noqa: E501
378
+ )
379
+
380
+
381
+ def gen_prompt_decode_to_target_len(
382
+ tokenizer: TokenizerLike,
383
+ token_sequence: list[int],
384
+ target_token_len: int,
385
+ max_retry: int = 10,
386
+ add_special_tokens: bool = False,
387
+ rng: np.random.Generator | None = None,
388
+ ) -> tuple[str, list[int], int]:
389
+ """
390
+ Ensure decoded-then-encoded prompt length matches the target token length.
391
+
392
+ This function decodes an initial token sequence to text and re-encodes
393
+ it, iteratively adjusting the token sequence length to match a target.
394
+ This is necessary because some tokenizers do not guarantee a 1:1 mapping
395
+ between consecutive tokens and the decoded-then-encoded sequence length.
396
+ For example, for GPT2Tokenizer:
397
+ [6880, 6881] -> ['Ġcalls', 'here'] ->
398
+ [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
399
+
400
+ Returns a tuple of the final prompt string, the adjusted token
+ sequence, and the signed token-count mismatch vs. the target length.
401
+ """
402
+ remain_num_try = max_retry
403
+ token_mismatch = 0
404
+ while True:
405
+ prompt = tokenizer.decode(token_sequence)
406
+ token_sequence = tokenizer.encode(prompt, add_special_tokens=add_special_tokens)
407
+ if remain_num_try <= 0:
408
+ if len(token_sequence) != target_token_len:
409
+ token_mismatch = len(token_sequence) - target_token_len
410
+ break
411
+
412
+ if len(token_sequence) == target_token_len:
413
+ break
414
+ elif len(token_sequence) < target_token_len:
415
+ if rng is not None:
416
+ extra_tokens = rng.integers(
417
+ 0,
418
+ tokenizer.vocab_size,
419
+ size=target_token_len - len(token_sequence),
420
+ ).tolist()
421
+ else:
422
+ extra_tokens = np.random.randint(
423
+ 0,
424
+ tokenizer.vocab_size,
425
+ size=target_token_len - len(token_sequence),
426
+ ).tolist()
427
+ token_sequence.extend(extra_tokens)
428
+ elif len(token_sequence) > target_token_len:
429
+ token_sequence = token_sequence[:target_token_len]
430
+
431
+ remain_num_try -= 1
432
+
433
+ return prompt, token_sequence, token_mismatch
434
+
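A sketch of the round-trip drift this helper corrects, assuming the GPT-2 tokenizer from Hugging Face (an assumption; any TokenizerLike works):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
# [6880, 6881] decodes to a string that re-encodes into three tokens, so
# the helper retries until the round-trip length matches the target.
prompt, seq, mismatch = gen_prompt_decode_to_target_len(
    tokenizer=tok,
    token_sequence=[6880, 6881],
    target_token_len=2,
)
assert len(seq) == 2 + mismatch  # mismatch is 0 when the retry loop converges
```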
435
+
436
+ # -----------------------------------------------------------------------------
437
+ # Random Dataset Implementation (Synthetic Data)
438
+ # -----------------------------------------------------------------------------
439
+
440
+
441
+ class RandomDataset(BenchmarkDataset):
442
+ """
443
+ Synthetic text-only dataset for serving/throughput benchmarks.
444
+
445
+ Strategy:
446
+ - Sample input/output token lengths per request from integer-uniform ranges
447
+ around configured means (controlled by range_ratio).
448
+ - Prepend a fixed random prefix of length prefix_len.
449
+ - Generate the remaining tokens as a reproducible sequence:
450
+ (offset + index + arange(input_len)) % vocab_size.
451
+ - Decode then re-encode/truncate to ensure prompt token counts match.
452
+ - Uses numpy.default_rng seeded with random_seed for reproducible sampling.
453
+ """
454
+
455
+ # Default values copied from benchmark_serving.py for the random dataset.
456
+ DEFAULT_PREFIX_LEN = 0
457
+ DEFAULT_RANGE_RATIO = 0.0
458
+ DEFAULT_INPUT_LEN = 1024
459
+ DEFAULT_OUTPUT_LEN = 128
460
+
461
+ def __init__(self, **kwargs) -> None:
462
+ super().__init__(**kwargs)
463
+ # Use numpy's default_rng for deterministic sampling
464
+ # Do not use random.seed() or np.random.seed() elsewhere in this class.
465
+ # This ensures that the RNG is isolated from global RNG state.
466
+ self._rng = np.random.default_rng(self.random_seed)
467
+
468
+ def sample(
469
+ self,
470
+ tokenizer: TokenizerLike,
471
+ num_requests: int,
472
+ request_id_prefix: str = "",
473
+ no_oversample: bool = False,
474
+ prefix_len: int = DEFAULT_PREFIX_LEN,
475
+ range_ratio: float = DEFAULT_RANGE_RATIO,
476
+ input_len: int = DEFAULT_INPUT_LEN,
477
+ output_len: int = DEFAULT_OUTPUT_LEN,
478
+ batchsize: int = 1,
479
+ **kwargs,
480
+ ) -> list[SampleRequest]:
481
+ # validate total input tokens (prefix + sampled) is at least 1.
482
+ num_special = int(tokenizer.num_special_tokens_to_add())
483
+ real_input_len = max(0, int(input_len) - num_special)
484
+ min_sampled_input = math.floor(real_input_len * (1.0 - float(range_ratio)))
485
+ min_total_input = int(prefix_len) + min_sampled_input
486
+ if min_total_input < 1:
487
+ raise ValueError(
488
+ "--random-input-len is too small: with tokenizer special "
489
+ f"tokens {num_special} and --random-range-ratio {range_ratio}, "
490
+ "the minimum possible total input tokens (prefix + sampled) is "
491
+ f"{min_total_input}. Increase --random-input-len and/or "
492
+ "--random-prefix-len, or decrease --random-range-ratio so that "
493
+ "prefix_len + floor(max(0, random_input_len - num_special)) "
494
+ "* (1 - range_ratio) >= 1."
495
+ )
496
+
497
+ input_lens, output_lens, offsets = self.get_sampling_params(
498
+ num_requests, range_ratio, input_len, output_len, tokenizer
499
+ )
500
+
501
+ vocab_size = tokenizer.vocab_size
502
+ prohibited_tokens = tokenizer.all_special_ids
503
+ all_tokens = np.arange(vocab_size)
504
+ allowed_tokens = np.array(list(set(all_tokens) - set(prohibited_tokens)))
505
+
506
+ # Generate prefix once
507
+ prefix_token_ids = self.get_prefix(allowed_tokens, prefix_len)
508
+
509
+ requests = []
510
+ token_mismatch_total = 0
511
+ for i in range(num_requests):
512
+ prompt, total_input_len, token_mismatch = self.generate_token_sequence( # noqa: E501
513
+ tokenizer=tokenizer,
514
+ prefix_token_ids=prefix_token_ids,
515
+ prefix_len=prefix_len,
516
+ vocab_size=vocab_size,
517
+ input_len=int(input_lens[i]),
518
+ offset=int(offsets[i]),
519
+ index=i,
520
+ allowed_tokens=allowed_tokens,
521
+ )
522
+ token_mismatch_total += token_mismatch
523
+ requests.append(
524
+ SampleRequest(
525
+ prompt=prompt,
526
+ prompt_len=total_input_len,
527
+ expected_output_len=int(output_lens[i]),
528
+ request_id=request_id_prefix + str(i),
529
+ )
530
+ )
531
+ # only used for embeddings benchmark.
532
+ if batchsize > 1:
533
+ batch_requests = []
534
+ # Create batched requests
535
+ for i in range(0, num_requests, batchsize):
536
+ batch = requests[i : i + batchsize]
537
+ batch_requests.append(
538
+ SampleRequest(
539
+ prompt=[req.prompt for req in batch],
540
+ prompt_len=sum(req.prompt_len for req in batch),
541
+ expected_output_len=0,
542
+ request_id=request_id_prefix + str(i // batchsize),
543
+ )
544
+ )
545
+ requests = batch_requests
546
+
547
+ if token_mismatch_total != 0:
548
+ sign = "more" if token_mismatch_total > 0 else "fewer"
549
+ logger.warning(
550
+ "Across all generated prompts, there were %d %s tokens "
551
+ "than expected after decoding and re-encoding. This is "
552
+ "expected due to the imperfect nature of the sampling "
553
+ "procedure.",
554
+ abs(token_mismatch_total),
555
+ sign,
556
+ )
557
+
558
+ return requests
559
+
560
+ def get_prefix(
561
+ self,
562
+ allowed_tokens: np.ndarray,
563
+ prefix_len: int,
564
+ ) -> list[int]:
565
+ """
566
+ Get the prefix for the dataset.
567
+ """
568
+ return (
569
+ allowed_tokens[
570
+ self._rng.integers(0, len(allowed_tokens), size=prefix_len)
571
+ ].tolist()
572
+ if prefix_len > 0
573
+ else []
574
+ )
575
+
576
+ def get_sampling_params(
577
+ self,
578
+ num_requests: int,
579
+ range_ratio: float,
580
+ input_len: int,
581
+ output_len: int,
582
+ tokenizer: TokenizerLike,
583
+ ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
584
+ """
585
+ Get the sampling parameters for the dataset.
586
+ """
587
+ # Enforce range_ratio < 1
588
+ if not (0.0 <= range_ratio < 1.0):
589
+ raise ValueError("range_ratio must be in [0, 1).")
590
+ num_special_tokens = int(tokenizer.num_special_tokens_to_add())
591
+ real_input_len = max(0, int(input_len) - num_special_tokens)
592
+ # Bounds use floor for low and ceil for high
593
+ input_low = math.floor(real_input_len * (1 - range_ratio))
594
+ input_high = math.ceil(real_input_len * (1 + range_ratio))
595
+ output_low = math.floor(output_len * (1 - range_ratio))
596
+ output_high = math.ceil(output_len * (1 + range_ratio))
597
+ # Ensure the lower bound for output length is at least 1 to
598
+ # prevent sampling 0 tokens.
599
+ output_low = max(output_low, 1)
600
+ output_high = max(output_high, 1)
601
+
602
+ if input_low > input_high:
603
+ raise ValueError(
604
+ f"Invalid input sampling interval: low={input_low} > high={input_high}"
605
+ )
606
+ if output_low > output_high:
607
+ raise ValueError(
608
+ "Invalid output sampling interval: "
609
+ f"low={output_low} > high={output_high}"
610
+ )
611
+
612
+ logger.info(
613
+ "Sampling input_len from [%s, %s] and output_len from [%s, %s]",
614
+ input_low,
615
+ input_high,
616
+ output_low,
617
+ output_high,
618
+ )
619
+
620
+ input_lens = self._rng.integers(input_low, input_high + 1, size=num_requests)
621
+ output_lens = self._rng.integers(output_low, output_high + 1, size=num_requests)
622
+ offsets = self._rng.integers(0, tokenizer.vocab_size, size=num_requests)
623
+ return input_lens, output_lens, offsets
624
+
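A worked example of the sampling bounds (numbers are illustrative):

```python
import math

input_len, num_special, range_ratio = 1024, 2, 0.1
real_input_len = max(0, input_len - num_special)            # 1022
input_low = math.floor(real_input_len * (1 - range_ratio))  # 919
input_high = math.ceil(real_input_len * (1 + range_ratio))  # 1125
assert (input_low, input_high) == (919, 1125)
```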
625
+ def generate_token_sequence(
626
+ self,
627
+ *,
628
+ tokenizer: TokenizerLike,
629
+ prefix_token_ids: list[int],
630
+ prefix_len: int,
631
+ vocab_size: int,
632
+ input_len: int,
633
+ offset: int,
634
+ index: int,
635
+ allowed_tokens: np.ndarray,
636
+ ) -> tuple[str, int, int]:
637
+ """
638
+ Returns (prompt, total_input_len, token_mismatch).
639
+
640
+ NOTE: After decoding the prompt we have to encode and decode it again.
641
+ This is done because in some cases N consecutive tokens
642
+ give a string tokenized into != N number of tokens.
643
+ For example for GPT2Tokenizer:
644
+ [6880, 6881] -> ['Ġcalls', 'here'] ->
645
+ [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
646
+ To avoid uncontrolled change of the prompt length,
647
+ the encoded sequence is truncated before being decoded again.
648
+ """
649
+ # Build the inner sequence by sampling
650
+ # sequentially from the allowed tokens
651
+ inner_seq = allowed_tokens[
652
+ (offset + index + np.arange(input_len)) % len(allowed_tokens)
653
+ ].tolist()
654
+ token_sequence = prefix_token_ids + inner_seq
655
+
656
+ # Decode, then re-encode and truncate to preserve token count invariants
657
+ total_input_len = prefix_len + int(input_len)
658
+ prompt, adjusted_token_sequence, token_mismatch = (
659
+ gen_prompt_decode_to_target_len(
660
+ tokenizer=tokenizer,
661
+ token_sequence=token_sequence,
662
+ target_token_len=total_input_len,
663
+ add_special_tokens=False,
664
+ rng=self._rng,
665
+ )
666
+ )
667
+ total_input_len = len(adjusted_token_sequence)
668
+ return prompt, total_input_len, token_mismatch
669
+
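A toy sketch of the deterministic inner-sequence construction: consecutive requests get shifted windows over the allowed-token table.

```python
import numpy as np

allowed = np.arange(100, 110)  # toy vocabulary of 10 allowed token IDs
offset, index, input_len = 3, 0, 5
inner = allowed[(offset + index + np.arange(input_len)) % len(allowed)]
# inner == array([103, 104, 105, 106, 107])
```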
670
+
671
+ # -----------------------------------------------------------------------------
672
+ # Random Dataset for Reranking Implementation (Synthetic Data)
673
+ # -----------------------------------------------------------------------------
674
+
675
+
676
+ class RandomDatasetForReranking(RandomDataset):
677
+ """
678
+ Random dataset specialized for the needs of scoring:
679
+ - Batches of inputs
680
+ - Inputs composed of pairs
681
+ """
682
+
683
+ def __init__(self, **kwargs) -> None:
684
+ super().__init__(**kwargs)
685
+
686
+ def sample(
687
+ self,
688
+ tokenizer: TokenizerLike,
689
+ num_requests: int,
690
+ request_id_prefix: str = "",
691
+ range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO,
692
+ input_len: int = RandomDataset.DEFAULT_INPUT_LEN,
693
+ batchsize: int = 1,
694
+ is_reranker: bool = True,
695
+ **kwargs,
696
+ ) -> list[SampleRequest]:
697
+ n_sep_tokens = int(is_reranker)
698
+
699
+ query_len_param = (input_len // 2) - n_sep_tokens if is_reranker else input_len
700
+
701
+ query_lens, _, query_offsets = self.get_sampling_params(
702
+ 1, range_ratio, query_len_param, 0, tokenizer
703
+ )
704
+
705
+ query_len = int(query_lens[0])
706
+
707
+ if not is_reranker:
708
+ assert num_requests > 1 and batchsize > 1
709
+ num_requests -= 1
710
+ batchsize -= 1
711
+ doc_len_param = input_len
712
+ else:
713
+ doc_len_param = input_len - query_len - n_sep_tokens
714
+
715
+ doc_lens, _, doc_offsets = self.get_sampling_params(
716
+ num_requests, range_ratio, doc_len_param, 0, tokenizer
717
+ )
718
+
719
+ vocab_size = tokenizer.vocab_size
720
+ prohibited_tokens = tokenizer.all_special_ids
721
+ all_tokens = np.arange(vocab_size)
722
+ allowed_tokens = np.array(list(set(all_tokens) - set(prohibited_tokens)))
723
+
724
+ query_prompt, query_input_len, token_mismatch_total = (
725
+ self.generate_token_sequence(
726
+ tokenizer=tokenizer,
727
+ prefix_token_ids=[],
728
+ prefix_len=0,
729
+ vocab_size=vocab_size,
730
+ input_len=query_len,
731
+ offset=int(query_offsets[0]),
732
+ index=0,
733
+ allowed_tokens=allowed_tokens,
734
+ )
735
+ )
736
+
737
+ requests = []
738
+ for i in range(num_requests):
739
+ prompt, total_input_len, token_mismatch = self.generate_token_sequence( # noqa: E501
740
+ tokenizer=tokenizer,
741
+ prefix_token_ids=[],
742
+ prefix_len=0,
743
+ vocab_size=vocab_size,
744
+ input_len=int(doc_lens[i]),
745
+ offset=int(doc_offsets[i]),
746
+ index=i + 1,
747
+ allowed_tokens=allowed_tokens,
748
+ )
749
+ token_mismatch_total += token_mismatch
750
+ requests.append((prompt, total_input_len))
751
+
752
+ batch_requests = []
753
+ # Create batched requests
754
+ for i in range(0, num_requests, batchsize):
755
+ batch = requests[i : i + batchsize]
756
+ query_contrib = (
757
+ (query_input_len + n_sep_tokens) * len(batch)
758
+ if is_reranker
759
+ else query_input_len
760
+ )
761
+ batch_requests.append(
762
+ SampleRequest(
763
+ prompt=[query_prompt] + [req[0] for req in batch],
764
+ prompt_len=query_contrib + sum(req[1] for req in batch),
765
+ expected_output_len=0,
766
+ request_id=request_id_prefix + str(i // batchsize),
767
+ )
768
+ )
769
+
770
+ if token_mismatch_total != 0:
771
+ logger.warning(
772
+ "Across all generated prompts, there were %d %s tokens "
773
+ "than expected after decoding and re-encoding. This is "
774
+ "expected due to the imperfect nature of the sampling "
775
+ "procedure.",
776
+ abs(token_mismatch_total),
777
+ "more" if token_mismatch_total > 0 else "fewer",
778
+ )
779
+
780
+ return batch_requests
781
+
782
+
783
+ # -----------------------------------------------------------------------------
784
+ # MultiModalDataset Implementation
785
+ # -----------------------------------------------------------------------------
786
+
787
+
788
+ class RandomMultiModalDataset(RandomDataset):
789
+ """
790
+ Synthetic multimodal dataset (text + images) that extends RandomDataset.
791
+
792
+ Status:
793
+ - Images: supported via synthetic RGB data.
794
+ - Video: supported via synthetic RGB data.
795
+ - Audio: not yet supported.
796
+
797
+ Sampling overview:
798
+ 1) Number of items per request is sampled uniformly from the integer range
799
+ [floor(n·(1−r)), ceil(n·(1+r))], where n is the base count and r is
800
+ `num_mm_items_range_ratio` in [0, 1]. r=0 keeps it fixed; r=1 allows 0.
801
+ The maximum is further clamped to the sum of per-modality limits.
802
+ 2) Each item’s modality and shape is sampled from `bucket_config`, a dict
803
+ mapping (height, width, num_frames) → probability. We treat
804
+ `num_frames`=1 as image and `num_frames` > 1 as video.
805
+ Entries with zero probability are removed and the rest are renormalized
806
+ to sum to 1.
807
+ 3) Per-modality hard caps are enforced via `limit_mm_per_prompt`.
808
+ When a modality reaches its cap, all of its buckets are excluded and the
809
+ remaining probabilities are renormalized.
810
+
811
+ Example bucket configuration:
812
+ {(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.1}
813
+ - Two image buckets (`num_frames`=1) and one video bucket
814
+ (`num_frames`=16).
815
+ Note: image and video buckets are supported; audio is not yet.
816
+ """
817
+
818
+ IS_MULTIMODAL = True
819
+ DEFAULT_LIMIT_MM_PER_PROMPT = {"image": 255, "video": 1}
820
+
821
+ DEFAULT_BASE_ITEMS_PER_REQUEST = 1
822
+ DEFAULT_NUM_MM_ITEMS_RANGE_RATIO = 0.0
823
+ DEFAULT_MM_ITEM_BUCKET_CONFIG = {
824
+ (256, 256, 1): 0.5,
825
+ (720, 1280, 1): 0.5,
826
+ (720, 1280, 16): 0.0,
827
+ }
828
+ DEFAULT_ENABLE_MULTIMODAL_CHAT = False
829
+
830
+ def __init__(self, **kwargs) -> None:
831
+ super().__init__(**kwargs)
832
+
833
+ def generate_synthetic_image(self, width: int, height: int) -> Image.Image:
834
+ """Generate synthetic PIL image with random RGB values.
835
+
836
+ NOTE: iid pixel sampling results in worst-case compression
837
+ (good for stressing I/O), but very unlike real photos.
838
+ We could consider a “low-freq” mode (e.g., noise blur)
839
+ to emulate network realism instead of max stress.
840
+ """
841
+ random_pixels = self._rng.integers(
842
+ 0,
843
+ 256,
844
+ (height, width, 3),
845
+ dtype=np.uint8,
846
+ )
847
+ return Image.fromarray(random_pixels)
848
+
849
+ def generate_synthetic_video(
850
+ self, width: int, height: int, num_frames: int
851
+ ) -> dict:
852
+ """Generate synthetic video with random values.
853
+
854
+ Creates a video with random pixel values, encodes it to MP4 format,
855
+ and returns the content as bytes.
856
+ """
857
+ import cv2
858
+
859
+ random_pixels = self._rng.integers(
860
+ 0,
861
+ 256,
862
+ (num_frames, height, width, 3),
863
+ dtype=np.uint8,
864
+ )
865
+
866
+ # Create a temporary video file in memory
867
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v")
868
+ fps = 30 # frames per second
869
+
870
+ with NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
871
+ temp_path = temp_file.name
872
+
873
+ # Create video writer
874
+ video_writer = cv2.VideoWriter(
875
+ temp_path, fourcc=fourcc, fps=fps, frameSize=(width, height)
876
+ )
877
+
878
+ if not video_writer.isOpened():
879
+ raise RuntimeError("Failed to create video writer")
880
+
881
+ for frame in random_pixels:
882
+ video_writer.write(frame)
883
+
884
+ video_writer.release()
885
+ temp_file.close()
886
+
887
+ # Read the video file content
888
+ with open(temp_path, "rb") as f:
889
+ video_content = f.read()
890
+
891
+ return {"bytes": video_content}
892
+
893
+ def map_config_to_modality(self, config: tuple[int, int, int]) -> str:
894
+ """Map the configuration to the modality."""
895
+ if config[-1] == 1:
896
+ return "image"
897
+ elif config[-1] > 1:
898
+ return "video"
899
+ else:
900
+ raise ValueError(f"Invalid multimodal item configuration: {config}")
901
+
902
+ def normalize_bucket_config(
903
+ self, bucket_config: dict[tuple[int, int, int], float]
904
+ ) -> dict[tuple[int, int, int], float]:
905
+ """
906
+ Remove zero probability entries
907
+ and normalize the bucket config to sum to 1.
908
+ """
909
+ # Raise error if value is negative
910
+ if any(v < 0 for v in bucket_config.values()):
911
+ raise ValueError("Bucket config values must be non-negative.")
912
+ # Remove zero probability entries
913
+ bucket_config = {k: v for k, v in bucket_config.items() if v > 0}
914
+ # if bucket config is empty, raise error
915
+ if not bucket_config:
916
+ raise ValueError(
917
+ "Got invalid bucket config. Bucket config values must be non-zero."
918
+ )
919
+ # Normalize the remaining bucket config to sum to 1
920
+ total = sum(bucket_config.values())
921
+ return {k: v / total for k, v in bucket_config.items()}
922
+
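A worked example: a zero-probability video bucket is dropped and the remaining image buckets are renormalized to sum to 1.

```python
cfg = {(256, 256, 1): 0.5, (720, 1280, 1): 0.3, (720, 1280, 16): 0.0}
normalized = RandomMultiModalDataset().normalize_bucket_config(cfg)
# normalized == {(256, 256, 1): 0.625, (720, 1280, 1): 0.375}
```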
923
+ def generate_mm_item(
924
+ self,
925
+ mm_item_config: tuple[int, int, int],
926
+ ) -> Mapping[str, Any]:
927
+ """
928
+ Create a synthetic image or video for the given item config and
929
+ apply process_image/process_video respectively.
930
+ The returned content dict follows the OpenAI chat completions format:
931
+ https://github.com/openai/openai-python
932
+ """
933
+
934
+ if self.map_config_to_modality(mm_item_config) == "image":
935
+ return process_image(
936
+ self.generate_synthetic_image(mm_item_config[1], mm_item_config[0])
937
+ )
938
+ elif self.map_config_to_modality(mm_item_config) == "video":
939
+ return process_video(
940
+ self.generate_synthetic_video(
941
+ mm_item_config[1], mm_item_config[0], mm_item_config[2]
942
+ )
943
+ )
944
+ else:
945
+ raise ValueError(f"Invalid multimodal item configuration: {mm_item_config}")
946
+
947
+ def get_mm_item_sampling_params(
948
+ self,
949
+ base_items_per_request: int,
950
+ num_mm_items_range_ratio: float,
951
+ limit_mm_per_prompt: dict[str, int],
952
+ bucket_config: dict[tuple[int, int, int], float],
953
+ ) -> tuple[int, int, dict[str, int], dict[tuple[int, int, int], float]]:
954
+ """
955
+ Get the sampling parameters for the multimodal items.
956
+ """
957
+ # Enforce num_mm_items_range_ratio <= 1
958
+ if not (0.0 <= num_mm_items_range_ratio <= 1.0):
959
+ raise ValueError("num_mm_items_range_ratio must be in [0, 1].")
960
+
961
+ # Ensure modalities to sample are in limit_mm_per_prompt
962
+ for k, v in bucket_config.items():
963
+ # get modality from bucket config
964
+ modality = self.map_config_to_modality(k)
965
+ if modality not in limit_mm_per_prompt:
966
+ raise ValueError(
967
+ f"Modality {modality} is not in "
968
+ f"limit_mm_per_prompt: "
969
+ f"{limit_mm_per_prompt.keys()}"
970
+ )
971
+
972
+ # Remove zero probability entries
973
+ # and normalize bucket config to sum to 1
974
+ bucket_config = self.normalize_bucket_config(bucket_config)
975
+ logger.info(
976
+ "Normalized bucket config: %s",
977
+ bucket_config,
978
+ )
979
+ # Only consider limit per prompt for modalities in bucket config
980
+ allowed_modalities = {self.map_config_to_modality(cfg) for cfg in bucket_config}
981
+ limit_mm_per_prompt = {
982
+ k: v for k, v in limit_mm_per_prompt.items() if k in allowed_modalities
983
+ }
984
+ if not limit_mm_per_prompt:
985
+ raise ValueError("No valid limits for modalities present in bucket_config.")
986
+
987
+ logger.info(
988
+ "Updated mm-limit-per-prompt: %s",
989
+ limit_mm_per_prompt,
990
+ )
991
+
992
+ # Get max and min num mm items and ensure
993
+ # it is at most the sum of limit_mm_per_prompt for all modalities
994
+ max_num_mm_items = min(
995
+ sum(limit_mm_per_prompt.values()),
996
+ math.ceil(base_items_per_request * (1 + num_mm_items_range_ratio)),
997
+ )
998
+ # Ensure min num mm items is at least 0
999
+ min_num_mm_items = max(
1000
+ 0, math.floor(base_items_per_request * (1 - num_mm_items_range_ratio))
1001
+ )
1002
+ # Raise error if min num mm items is greater than max num mm items
1003
+ if min_num_mm_items > max_num_mm_items:
1004
+ raise ValueError(
1005
+ f"Min num mm items is greater than max mm items: "
1006
+ f"{min_num_mm_items} > {max_num_mm_items}"
1007
+ )
1008
+
1009
+ logger.info(
1010
+ "Sampling number of multimodal items from [%s, %s]",
1011
+ min_num_mm_items,
1012
+ max_num_mm_items,
1013
+ )
1014
+
1015
+ return (
1016
+ min_num_mm_items,
1017
+ max_num_mm_items,
1018
+ limit_mm_per_prompt,
1019
+ bucket_config,
1020
+ )
1021
+
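The min/max item counts above reduce to simple arithmetic; a standalone sketch with assumed values (base of 2 items, ratio 0.5, limits image=3, video=0):

```
import math

base, ratio = 2, 0.5
limits = {"image": 3, "video": 0}
max_items = min(sum(limits.values()), math.ceil(base * (1 + ratio)))  # min(3, 3) -> 3
min_items = max(0, math.floor(base * (1 - ratio)))                    # floor(1.0) -> 1
assert (min_items, max_items) == (1, 3)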
1022
+ def get_mm_item_iterator(
1023
+ self,
1024
+ min_num_mm_items: int,
1025
+ max_num_mm_items: int,
1026
+ bucket_config: dict[tuple[int, int, int], float],
1027
+ limit_mm_per_prompt: dict[str, int],
1028
+ ) -> Iterator[tuple[int, int, int]]:
1029
+ """
1030
+ Iterator over the multimodal items for each request
1031
+ whose size is between min_num_mm_items and max_num_mm_items.
1032
+
1033
+ Loop over the bucket config and sample a multimodal item.
1034
+ Loop until the number of multimodal items sampled is equal to
1035
+ request_num_mm_items or limit of multimodal items per prompt
1036
+ for all modalities is reached.
1037
+
1038
+ Note:
+ - This function operates on a per-request shallow copy of
+ `bucket_config` (tuple -> float). The original dict passed to
+ `sample` is not mutated; a test exists that will fail if this
+ behavior ever changes.
1043
+ """
1044
+ # Get the number of multimodal items to sample
1045
+ request_num_mm_items = int(
1046
+ self._rng.integers(min_num_mm_items, max_num_mm_items + 1)
1047
+ )
1048
+ # If request_num_mm_items is 0, yield an empty iterator
1049
+ if request_num_mm_items == 0:
1050
+ return
1051
+ # Initialize modality counters
1052
+ modality_counter = {self.map_config_to_modality(k): 0 for k in bucket_config}
1053
+ # Copy the bucket config to avoid modifying the original
1054
+ bucket_config_copy = bucket_config.copy()
1055
+ # Loop over the number of multimodal items to sample
1056
+ while sum(modality_counter.values()) < request_num_mm_items:
1057
+ # Sample a multimodal item config
1058
+ mm_item_config = self._rng.choice(
1059
+ list(bucket_config_copy.keys()), p=list(bucket_config_copy.values())
1060
+ )
1061
+ modality = self.map_config_to_modality(mm_item_config)
1062
+ # Check that modality count is less than limit per prompt
1063
+ if modality_counter[modality] < limit_mm_per_prompt[modality]:
1064
+ modality_counter[modality] += 1
1065
+ yield mm_item_config
1066
+ else:
1067
+ # If the counter is greater than the limit per prompt
1068
+ # set all multimodal items of this modality to 0
1069
+ for k, v in bucket_config_copy.items():
1070
+ if self.map_config_to_modality(k) == modality:
1071
+ bucket_config_copy[k] = 0
1072
+ # If all configs are 0, break the loop
1073
+ # This should not happen as request_num_mm_items is at most
1074
+ # the sum of limit_mm_per_prompt for all modalities
1075
+ if all(v == 0 for v in bucket_config_copy.values()):
1076
+ logger.warning(
1077
+ "Exhausted all multimodal items of modality %s", modality
1078
+ )
1079
+ break
1080
+ # Renormalize the bucket config
1081
+ bucket_config_copy = self.normalize_bucket_config(bucket_config_copy)
1082
+
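A hypothetical usage sketch of the iterator, assuming a constructed instance whose `_rng` is seeded; the per-modality counts can never exceed the per-prompt limits:

```
from collections import Counter

items = list(
    dataset.get_mm_item_iterator(
        min_num_mm_items=1,
        max_num_mm_items=3,
        bucket_config={(256, 256, 1): 1.0},  # images only
        limit_mm_per_prompt={"image": 3},
    )
)
counts = Counter(dataset.map_config_to_modality(cfg) for cfg in items)
assert 1 <= sum(counts.values()) <= 3 and counts["image"] <= 3
```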
1083
+ def sample(
1084
+ self,
1085
+ tokenizer: TokenizerLike,
1086
+ num_requests: int,
1087
+ request_id_prefix: str = "",
1088
+ no_oversample: bool = False,
1089
+ prefix_len: int = RandomDataset.DEFAULT_PREFIX_LEN,
1090
+ range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO,
1091
+ input_len: int = RandomDataset.DEFAULT_INPUT_LEN,
1092
+ output_len: int = RandomDataset.DEFAULT_OUTPUT_LEN,
1093
+ limit_mm_per_prompt: dict[str, int] = DEFAULT_LIMIT_MM_PER_PROMPT,
1094
+ base_items_per_request: int = DEFAULT_BASE_ITEMS_PER_REQUEST,
1095
+ num_mm_items_range_ratio: float = DEFAULT_NUM_MM_ITEMS_RANGE_RATIO,
1096
+ bucket_config: dict[
1097
+ tuple[int, int, int], float
1098
+ ] = DEFAULT_MM_ITEM_BUCKET_CONFIG,
1099
+ enable_multimodal_chat: bool = DEFAULT_ENABLE_MULTIMODAL_CHAT,
1100
+ **kwargs,
1101
+ ) -> list[SampleRequest]:
1102
+ # Get the sampling parameters for the dataset
1103
+ input_lens, output_lens, offsets = self.get_sampling_params(
1104
+ num_requests, range_ratio, input_len, output_len, tokenizer
1105
+ )
1106
+
1107
+ (
1108
+ min_num_mm_items,
1109
+ max_num_mm_items,
1110
+ limit_mm_per_prompt,
1111
+ bucket_config,
1112
+ ) = self.get_mm_item_sampling_params(
1113
+ base_items_per_request,
1114
+ num_mm_items_range_ratio,
1115
+ limit_mm_per_prompt,
1116
+ bucket_config,
1117
+ )
1118
+
1119
+ vocab_size = tokenizer.vocab_size
1120
+ # Can't use tokenizer.all_special_ids, since it returns ONLY
+ # the ids from special_tokens_map.json. We want to exclude
+ # placeholder tokens and all tokens that mark the start/end of
+ # an image, as they may break prompt-replacement logic.
1125
+ prohibited_tokens = list(
1126
+ tok_id
1127
+ for tok_id, token in tokenizer.added_tokens_decoder.items()
1128
+ if token.special
1129
+ )
1130
+ all_tokens = np.arange(vocab_size)
1131
+ allowed_tokens = np.array(list(set(all_tokens) - set(prohibited_tokens)))
1132
+ logger.debug(
1133
+ "Sampling from %d out of %d (vocab size)", len(allowed_tokens), vocab_size
1134
+ )
1135
+ # Generate prefix once
1136
+ prefix_token_ids = self.get_prefix(allowed_tokens, prefix_len)
1137
+ # Add synthetic multimodal items to each request
1138
+ mm_requests = []
1139
+ token_mismatch_total = 0
1140
+ for i in range(num_requests):
1141
+ prompt, total_input_len, token_mismatch = self.generate_token_sequence( # noqa: E501
1142
+ tokenizer=tokenizer,
1143
+ prefix_token_ids=prefix_token_ids,
1144
+ prefix_len=prefix_len,
1145
+ vocab_size=vocab_size,
1146
+ input_len=int(input_lens[i]),
1147
+ offset=int(offsets[i]),
1148
+ index=i,
1149
+ allowed_tokens=allowed_tokens,
1150
+ )
1151
+ token_mismatch_total += token_mismatch
1152
+ # Get multimodal item iterator for a given request
1153
+ mm_item_iterator = self.get_mm_item_iterator(
1154
+ min_num_mm_items,
1155
+ max_num_mm_items,
1156
+ bucket_config,
1157
+ limit_mm_per_prompt,
1158
+ )
1159
+
1160
+ mm_content = cast(
1161
+ list[dict[str, Any]],
1162
+ [
1163
+ self.generate_mm_item(mm_item_config)
1164
+ for mm_item_config in mm_item_iterator
1165
+ ],
1166
+ )
1167
+
1168
+ if enable_multimodal_chat:
1169
+ # NOTE: For now this option is only provided for completeness
1170
+ # given that the serve.py benchmark currently does not use it.
1171
+ mm_chat_prompt: Any = prompt
1172
+ mm_chat_prompt = self.apply_multimodal_chat_transformation(
1173
+ prompt, mm_content
1174
+ )
1175
+ sample_request = SampleRequest(
1176
+ prompt=mm_chat_prompt,
1177
+ prompt_len=total_input_len,
1178
+ expected_output_len=int(output_lens[i]),
1179
+ multi_modal_data=None,
1180
+ request_id=request_id_prefix + str(i),
1181
+ )
1182
+ else:
1183
+ sample_request = SampleRequest(
1184
+ prompt=prompt,
1185
+ prompt_len=total_input_len,
1186
+ expected_output_len=int(output_lens[i]),
1187
+ multi_modal_data=mm_content,
1188
+ request_id=request_id_prefix + str(i),
1189
+ )
1190
+ mm_requests.append(sample_request)
1191
+
1192
+ if token_mismatch_total != 0:
1193
+ sign = "more" if token_mismatch_total > 0 else "fewer"
1194
+ logger.warning(
1195
+ "Across all generated prompts, there were %d %s tokens "
1196
+ "than expected after decoding and re-encoding. This is "
1197
+ "expected due to the imperfect nature of the sampling "
1198
+ "procedure.",
1199
+ abs(token_mismatch_total),
1200
+ sign,
1201
+ )
1202
+
1203
+ return mm_requests
1204
+
1205
+
1206
+ # -----------------------------------------------------------------------------
1207
+ # ShareGPT Dataset Implementation
1208
+ # -----------------------------------------------------------------------------
1209
+
1210
+
1211
+ class ShareGPTDataset(BenchmarkDataset):
1212
+ """
1213
+ Implements the ShareGPT dataset. Loads data from a JSON file and generates
1214
+ sample requests based on conversation turns.
1215
+ """
1216
+
1217
+ def __init__(self, **kwargs) -> None:
1218
+ super().__init__(**kwargs)
1219
+ self.load_data()
1220
+
1221
+ def load_data(self) -> None:
1222
+ if self.dataset_path is None:
1223
+ raise ValueError("dataset_path must be provided for loading data.")
1224
+
1225
+ with open(self.dataset_path, encoding="utf-8") as f:
1226
+ self.data = json.load(f)
1227
+ # Filter entries with at least two conversation turns.
1228
+ self.data = [
1229
+ entry
1230
+ for entry in self.data
1231
+ if "conversations" in entry and len(entry["conversations"]) >= 2
1232
+ ]
1233
+ random.seed(self.random_seed)
1234
+ if not getattr(self, "disable_shuffle", False):
1235
+ random.shuffle(self.data)
1236
+
1237
+ def sample(
1238
+ self,
1239
+ tokenizer: TokenizerLike,
1240
+ num_requests: int,
1241
+ lora_path: str | None = None,
1242
+ max_loras: int | None = None,
1243
+ output_len: int | None = None,
1244
+ enable_multimodal_chat: bool = False,
1245
+ request_id_prefix: str = "",
1246
+ no_oversample: bool = False,
1247
+ **kwargs,
1248
+ ) -> list:
1249
+ samples: list = []
1250
+ ind = 0
1251
+ for entry in self.data:
1252
+ if len(samples) >= num_requests:
1253
+ break
1254
+ prompt, completion = (
1255
+ entry["conversations"][0]["value"],
1256
+ entry["conversations"][1]["value"],
1257
+ )
1258
+
1259
+ lora_request = self.get_random_lora_request(
1260
+ max_loras=max_loras, lora_path=lora_path
1261
+ )
1262
+ prompt_ids = tokenizer(prompt).input_ids
1263
+ completion_ids = tokenizer(completion).input_ids
1264
+ prompt_len = len(prompt_ids)
1265
+ new_output_len = len(completion_ids) if output_len is None else output_len
1266
+ if not is_valid_sequence(
1267
+ prompt_len,
1268
+ new_output_len,
1269
+ skip_min_output_len_check=output_len is not None,
1270
+ ):
1271
+ continue
1272
+ if image_path := entry.get("image"):
1273
+ mm_content = process_image(image_path)
1274
+ elif video_path := entry.get("video"):
1275
+ mm_content = process_video(video_path)
1276
+ else:
1277
+ mm_content = None
1278
+ if enable_multimodal_chat:
1279
+ prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
1280
+ samples.append(
1281
+ SampleRequest(
1282
+ prompt=prompt,
1283
+ prompt_len=prompt_len,
1284
+ expected_output_len=new_output_len,
1285
+ lora_request=lora_request,
1286
+ multi_modal_data=mm_content,
1287
+ request_id=request_id_prefix + str(ind),
1288
+ )
1289
+ )
1290
+ ind += 1
1291
+ self.maybe_oversample_requests(
1292
+ samples, num_requests, request_id_prefix, no_oversample
1293
+ )
1294
+ return samples
1295
+
1296
+
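For reference, a minimal entry shape accepted by `load_data`/`sample` above (a sketch; real ShareGPT files carry additional fields):

```
entry = {
    "conversations": [
        {"from": "human", "value": "What is the capital of France?"},
        {"from": "gpt", "value": "Paris."},
    ],
    # Optional multimodal attachments picked up in sample():
    # "image": "/path/to/image.png",
    # "video": "/path/to/video.mp4",
}
```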
1297
+ class _ValidateDatasetArgs(argparse.Action):
1298
+ """Argparse action to validate dataset name and path compatibility."""
1299
+
1300
+ def __call__(self, parser, namespace, values, option_string=None):
1301
+ setattr(namespace, self.dest, values)
1302
+
1303
+ # Get current values of both dataset_name and dataset_path
1304
+ dataset_name = getattr(namespace, "dataset_name", "random")
1305
+ dataset_path = getattr(namespace, "dataset_path", None)
1306
+
1307
+ # Validate the combination
1308
+ if dataset_name == "random" and dataset_path is not None:
1309
+ parser.error(
1310
+ "Cannot use 'random' dataset with --dataset-path. "
1311
+ "Please specify the appropriate --dataset-name (e.g., "
1312
+ "'sharegpt', 'custom', 'sonnet') for your dataset file: "
1313
+ f"{dataset_path}"
1314
+ )
1315
+
1316
+
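A sketch of how the action behaves with the parser built by `add_dataset_parser` below; the validation fires during argument parsing, before any dataset is loaded:

```
parser = FlexibleArgumentParser()
add_dataset_parser(parser)
# OK: an explicit dataset name for a file-backed dataset.
args = parser.parse_args(
    ["--dataset-name", "sharegpt", "--dataset-path", "data.json"]
)
# Errors out: 'random' (the default) cannot be combined with a path.
# parser.parse_args(["--dataset-path", "data.json"])
```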
1317
+ def add_dataset_parser(parser: FlexibleArgumentParser):
1318
+ parser.add_argument("--seed", type=int, default=0)
1319
+ parser.add_argument(
1320
+ "--num-prompts",
1321
+ type=int,
1322
+ default=1000,
1323
+ help="Number of prompts to process.",
1324
+ )
1325
+ parser.add_argument(
1326
+ "--dataset-name",
1327
+ type=str,
1328
+ default="random",
1329
+ action=_ValidateDatasetArgs,
1330
+ choices=[
1331
+ "sharegpt",
1332
+ "burstgpt",
1333
+ "sonnet",
1334
+ "random",
1335
+ "random-mm",
1336
+ "random-rerank",
1337
+ "hf",
1338
+ "custom",
1339
+ "prefix_repetition",
1340
+ "spec_bench",
1341
+ ],
1342
+ help="Name of the dataset to benchmark on.",
1343
+ )
1344
+ parser.add_argument(
1345
+ "--no-stream",
1346
+ action="store_true",
1347
+ help="Do not load the dataset in streaming mode.",
1348
+ )
1349
+ parser.add_argument(
1350
+ "--dataset-path",
1351
+ type=str,
1352
+ default=None,
1353
+ action=_ValidateDatasetArgs,
1354
+ help="Path to the sharegpt/sonnet dataset. "
1355
+ "Or the huggingface dataset ID if using HF dataset.",
1356
+ )
1357
+ parser.add_argument(
1358
+ "--no-oversample",
1359
+ action="store_true",
1360
+ help="Do not oversample if the dataset has fewer samples than num-prompts.",
1361
+ )
1362
+ parser.add_argument(
1363
+ "--skip-chat-template",
1364
+ action="store_true",
1365
+ help="Skip applying chat template to prompt for datasets that support it.",
1366
+ )
1367
+ parser.add_argument(
1368
+ "--disable-shuffle",
1369
+ action="store_true",
1370
+ help="Disable shuffling of dataset samples for deterministic ordering.",
1371
+ )
1372
+
1373
+ # group for dataset specific arguments
1374
+ custom_group = parser.add_argument_group("custom dataset options")
1375
+ custom_group.add_argument(
1376
+ "--custom-output-len",
1377
+ type=int,
1378
+ default=256,
1379
+ help="Number of output tokens per request. Unless it is set to -1, the "
1380
+ "value overrides potential output length loaded from the dataset. It is "
1381
+ "used only for custom dataset.",
1382
+ )
1383
+
1384
+ spec_bench_group = parser.add_argument_group("spec bench dataset options")
1385
+ spec_bench_group.add_argument(
1386
+ "--spec-bench-output-len",
1387
+ type=int,
1388
+ default=256,
1389
+ help="Num of output tokens per request, used only for spec bench dataset.",
1390
+ )
1391
+ spec_bench_group.add_argument(
1392
+ "--spec-bench-category",
1393
+ type=str,
1394
+ default=None,
1395
+ help="Category for spec bench dataset. If None, use all categories.",
1396
+ )
1397
+
1398
+ sonnet_group = parser.add_argument_group("sonnet dataset options")
1399
+ sonnet_group.add_argument(
1400
+ "--sonnet-input-len",
1401
+ type=int,
1402
+ default=550,
1403
+ help="Number of input tokens per request, used only for sonnet dataset.",
1404
+ )
1405
+ sonnet_group.add_argument(
1406
+ "--sonnet-output-len",
1407
+ type=int,
1408
+ default=150,
1409
+ help="Number of output tokens per request, used only for sonnet dataset.",
1410
+ )
1411
+ sonnet_group.add_argument(
1412
+ "--sonnet-prefix-len",
1413
+ type=int,
1414
+ default=200,
1415
+ help="Number of prefix tokens per request, used only for sonnet dataset.",
1416
+ )
1417
+
1418
+ sharegpt_group = parser.add_argument_group("sharegpt dataset options")
1419
+ sharegpt_group.add_argument(
1420
+ "--sharegpt-output-len",
1421
+ type=int,
1422
+ default=None,
1423
+ help="Output length for each request. Overrides the output length "
1424
+ "from the ShareGPT dataset.",
1425
+ )
1426
+
1427
+ blazedit_group = parser.add_argument_group("blazedit dataset options")
1428
+ blazedit_group.add_argument(
1429
+ "--blazedit-min-distance",
1430
+ type=float,
1431
+ default=0.0,
1432
+ help="Minimum distance for blazedit dataset. Min: 0, Max: 1.0",
1433
+ )
1434
+ blazedit_group.add_argument(
1435
+ "--blazedit-max-distance",
1436
+ type=float,
1437
+ default=1.0,
1438
+ help="Maximum distance for blazedit dataset. Min: 0, Max: 1.0",
1439
+ )
1440
+
1441
+ random_group = parser.add_argument_group("random dataset options")
1442
+ add_random_dataset_base_args(random_group)
1443
+
1444
+ random_mm_group = parser.add_argument_group(
1445
+ "random multimodal dataset options extended from random dataset"
1446
+ )
1447
+ add_random_multimodal_dataset_args(random_mm_group)
1448
+
1449
+ hf_group = parser.add_argument_group("hf dataset options")
1450
+ hf_group.add_argument(
1451
+ "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
1452
+ )
1453
+ hf_group.add_argument(
1454
+ "--hf-split", type=str, default=None, help="Split of the HF dataset."
1455
+ )
1456
+ hf_group.add_argument(
1457
+ "--hf-name",
1458
+ type=str,
1459
+ default=None,
1460
+ help=(
1461
+ "Name of the dataset on HuggingFace "
1462
+ "(e.g., 'lmarena-ai/VisionArena-Chat'). "
1463
+ "Specify this if your dataset-path is a local path."
1464
+ ),
1465
+ )
1466
+ hf_group.add_argument(
1467
+ "--hf-output-len",
1468
+ type=int,
1469
+ default=None,
1470
+ help="Output length for each request. Overrides the output lengths "
1471
+ "from the sampled HF dataset.",
1472
+ )
1473
+
1474
+ prefix_repetition_group = parser.add_argument_group(
1475
+ "prefix repetition dataset options"
1476
+ )
1477
+ prefix_repetition_group.add_argument(
1478
+ "--prefix-repetition-prefix-len",
1479
+ type=int,
1480
+ default=256,
1481
+ help="Number of prefix tokens per request, used only for prefix "
1482
+ "repetition dataset.",
1483
+ )
1484
+ prefix_repetition_group.add_argument(
1485
+ "--prefix-repetition-suffix-len",
1486
+ type=int,
1487
+ default=256,
1488
+ help="Number of suffix tokens per request, used only for prefix "
1489
+ "repetition dataset. Total input length is prefix_len + suffix_len.",
1490
+ )
1491
+ prefix_repetition_group.add_argument(
1492
+ "--prefix-repetition-num-prefixes",
1493
+ type=int,
1494
+ default=10,
1495
+ help="Number of prefixes to generate, used only for prefix repetition "
1496
+ "dataset. Prompts per prefix is num_requests // num_prefixes.",
1497
+ )
1498
+ prefix_repetition_group.add_argument(
1499
+ "--prefix-repetition-output-len",
1500
+ type=int,
1501
+ default=128,
1502
+ help="Number of output tokens per request, used only for prefix "
1503
+ "repetition dataset.",
1504
+ )
1505
+
1506
+
1507
+ def add_random_dataset_base_args(
1508
+ parser_or_group: FlexibleArgumentParser | argparse._ArgumentGroup,
1509
+ ) -> None:
1510
+ """Add CLI arguments for base random dataset options.
1511
+
1512
+ This function adds arguments needed for:
1513
+ - random (random dataset)
1514
+ - random-mm (random multimodal dataset)
1515
+ - random-rerank (random dataset for reranking)
1516
+
1517
+ Args:
1518
+ parser_or_group: Either a parser or an argument group to add arguments to.
1519
+ """
1520
+ parser_or_group.add_argument(
1521
+ "--random-input-len",
1522
+ type=int,
1523
+ default=1024,
1524
+ help="Number of input tokens per request, used only for random sampling.",
1525
+ )
1526
+ parser_or_group.add_argument(
1527
+ "--random-output-len",
1528
+ type=int,
1529
+ default=128,
1530
+ help="Number of output tokens per request, used only for random sampling.",
1531
+ )
1532
+ parser_or_group.add_argument(
1533
+ "--random-range-ratio",
1534
+ type=float,
1535
+ default=0.0,
1536
+ help="Range ratio for sampling input/output length, "
1537
+ "used only for random sampling. Must be in the range [0, 1) to define "
1538
+ "a symmetric sampling range"
1539
+ "[length * (1 - range_ratio), length * (1 + range_ratio)].",
1540
+ )
1541
+ parser_or_group.add_argument(
1542
+ "--random-prefix-len",
1543
+ type=int,
1544
+ default=0,
1545
+ help=(
1546
+ "Number of fixed prefix tokens before the random context "
1547
+ "in a request. "
1548
+ "The total input length is the sum of `random-prefix-len` and "
1549
+ "a random "
1550
+ "context length sampled from [input_len * (1 - range_ratio), "
1551
+ "input_len * (1 + range_ratio)]."
1552
+ ),
1553
+ )
1554
+ parser_or_group.add_argument(
1555
+ "--random-batch-size",
1556
+ type=int,
1557
+ default=1,
1558
+ help=("Batch size for random sampling. Only used for embeddings benchmark."),
1559
+ )
1560
+ parser_or_group.add_argument(
1561
+ "--no-reranker",
1562
+ action="store_true",
1563
+ help=(
1564
+ "Whether the model supports reranking natively."
1565
+ " Only used for reranker benchmark."
1566
+ ),
1567
+ )
1568
+
1569
+
1570
+ def add_random_multimodal_dataset_args(
1571
+ parser_or_group: FlexibleArgumentParser | argparse._ArgumentGroup,
1572
+ ) -> None:
1573
+ """Add CLI arguments for random multimodal dataset options.
1574
+
1575
+ This function adds arguments needed for:
1576
+ - random-mm (random multimodal dataset)
1577
+
1578
+ Args:
1579
+ parser_or_group: Either a parser or an argument group to add arguments to.
1580
+ """
1581
+ parser_or_group.add_argument(
1582
+ "--random-mm-base-items-per-request",
1583
+ type=int,
1584
+ default=RandomMultiModalDataset.DEFAULT_BASE_ITEMS_PER_REQUEST,
1585
+ help=(
1586
+ "Base number of multimodal items per request for random-mm. "
1587
+ "Actual per-request count is sampled around this base using "
1588
+ "--random-mm-num-mm-items-range-ratio."
1589
+ ),
1590
+ )
1591
+ parser_or_group.add_argument(
1592
+ "--random-mm-num-mm-items-range-ratio",
1593
+ type=float,
1594
+ default=RandomMultiModalDataset.DEFAULT_NUM_MM_ITEMS_RANGE_RATIO,
1595
+ help=(
1596
+ "Range ratio r in [0, 1] for sampling items per request. "
1597
+ "We sample uniformly from the closed integer range "
1598
+ "[floor(n*(1-r)), ceil(n*(1+r))] "
1599
+ "where n is the base items per request. "
1600
+ "r=0 keeps it fixed; r=1 allows 0 items. The maximum is clamped "
1601
+ "to the sum of per-modality limits from "
1602
+ "--random-mm-limit-mm-per-prompt. "
1603
+ "An error is raised if the computed min exceeds the max."
1604
+ ),
1605
+ )
1606
+ parser_or_group.add_argument(
1607
+ "--random-mm-limit-mm-per-prompt",
1608
+ type=json.loads,
1609
+ default=RandomMultiModalDataset.DEFAULT_LIMIT_MM_PER_PROMPT,
1610
+ help=(
1611
+ "Per-modality hard caps for items attached per request, e.g. "
1612
+ '\'{"image": 3, "video": 0}\'. The sampled per-request item '
1613
+ "count is clamped to the sum of these limits. When a modality "
1614
+ "reaches its cap, its buckets are excluded and probabilities are "
1615
+ "renormalized."
1616
+ "OBS.: Only image sampling is supported for now."
1617
+ ),
1618
+ )
1619
+
1620
+ def _parse_mm_bucket_config(v: object) -> dict[tuple[int, int, int], float]:
1621
+ # If already a dict (e.g., programmatic call), normalize keys
1622
+ def normalize(d: dict) -> dict[tuple[int, int, int], float]:
1623
+ out: dict[tuple[int, int, int], float] = {}
1624
+ for k, val in d.items():
1625
+ key = k
1626
+ if isinstance(key, str):
1627
+ with suppress(Exception):
1628
+ key = ast.literal_eval(key)
1629
+ if not (
1630
+ isinstance(key, tuple)
1631
+ and len(key) == 3
1632
+ and all(isinstance(x, int) for x in key)
1633
+ ):
1634
+ raise ValueError(
1635
+ f"Invalid bucket key {k!r}. Expected tuple (H, W, T)."
1636
+ )
1637
+ out[(int(key[0]), int(key[1]), int(key[2]))] = float(val)
1638
+ return out
1639
+
1640
+ if isinstance(v, dict):
1641
+ return normalize(v)
1642
+ if isinstance(v, str):
1643
+ # Python literal (supports tuple keys)
1644
+ parsed = ast.literal_eval(v)
1645
+ if not isinstance(parsed, dict):
1646
+ raise ValueError("Bucket config must parse to a dict.")
1647
+ return normalize(parsed)
1648
+ raise ValueError("Unsupported value for --random-mm-bucket-config.")
1649
+
1650
+ parser_or_group.add_argument(
1651
+ "--random-mm-bucket-config",
1652
+ type=_parse_mm_bucket_config,
1653
+ default=RandomMultiModalDataset.DEFAULT_MM_ITEM_BUCKET_CONFIG,
1654
+ help=(
1655
+ "The bucket config is a dictionary mapping a multimodal item"
1656
+ "sampling configuration to a probability."
1657
+ "Currently allows for 2 modalities: images and videos. "
1658
+ "An bucket key is a tuple of (height, width, num_frames)"
1659
+ "The value is the probability of sampling that specific item. "
1660
+ "Example: "
1661
+ "--random-mm-bucket-config "
1662
+ "{(256, 256, 1): 0.5, (720, 1280, 1): 0.4, (720, 1280, 16): 0.10} "
1663
+ "First item: images with resolution 256x256 w.p. 0.5"
1664
+ "Second item: images with resolution 720x1280 w.p. 0.4 "
1665
+ "Third item: videos with resolution 720x1280 and 16 frames w.p. 0.1"
1666
+ "OBS.: If the probabilities do not sum to 1, they are normalized."
1667
+ "OBS bis.: Only image sampling is supported for now."
1668
+ ),
1669
+ )
1670
+
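A sketch of the accepted bucket-config inputs (shown as if `_parse_mm_bucket_config` were importable; in the source it is a helper local to `add_random_multimodal_dataset_args`):

```
# A Python-literal string with tuple keys...
cfg = _parse_mm_bucket_config("{(256, 256, 1): 0.5, (720, 1280, 16): 0.5}")
assert cfg == {(256, 256, 1): 0.5, (720, 1280, 16): 0.5}
# ...or a dict whose stringified keys are normalized back to tuples.
cfg = _parse_mm_bucket_config({"(64, 64, 1)": 1})
assert cfg == {(64, 64, 1): 1.0}
```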
1671
+
1672
+ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
1673
+ if not hasattr(args, "request_id_prefix"):
1674
+ args.request_id_prefix = ""
1675
+
1676
+ if args.dataset_name == "custom":
1677
+ dataset = CustomDataset(
1678
+ dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
1679
+ )
1680
+ input_requests = dataset.sample(
1681
+ num_requests=args.num_prompts,
1682
+ tokenizer=tokenizer,
1683
+ output_len=args.custom_output_len,
1684
+ skip_chat_template=args.skip_chat_template,
1685
+ request_id_prefix=args.request_id_prefix,
1686
+ no_oversample=args.no_oversample,
1687
+ )
1688
+
1689
+ elif args.dataset_name == "sonnet":
1690
+ dataset = SonnetDataset(
1691
+ dataset_path=args.dataset_path, disable_shuffle=args.disable_shuffle
1692
+ )
1693
+ # For the "sonnet" dataset, formatting depends on the backend.
1694
+ if args.backend == "openai-chat":
1695
+ input_requests = dataset.sample(
1696
+ num_requests=args.num_prompts,
1697
+ input_len=args.sonnet_input_len,
1698
+ output_len=args.sonnet_output_len,
1699
+ prefix_len=args.sonnet_prefix_len,
1700
+ tokenizer=tokenizer,
1701
+ return_prompt_formatted=False,
1702
+ request_id_prefix=args.request_id_prefix,
1703
+ no_oversample=args.no_oversample,
1704
+ )
1705
+ else:
1706
+ assert tokenizer.chat_template or tokenizer.default_chat_template, (
1707
+ "Tokenizer/model must have chat template for sonnet dataset."
1708
+ )
1709
+ input_requests = dataset.sample(
1710
+ num_requests=args.num_prompts,
1711
+ input_len=args.sonnet_input_len,
1712
+ output_len=args.sonnet_output_len,
1713
+ prefix_len=args.sonnet_prefix_len,
1714
+ tokenizer=tokenizer,
1715
+ return_prompt_formatted=True,
1716
+ request_id_prefix=args.request_id_prefix,
1717
+ no_oversample=args.no_oversample,
1718
+ )
1719
+
1720
+ elif args.dataset_name == "hf":
1721
+ # All of the following datasets are implemented on top of the
+ # HuggingFaceDataset base class.
1723
+ hf_kwargs = {}
1724
+ if (
1725
+ args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS
1726
+ or args.hf_name in VisionArenaDataset.SUPPORTED_DATASET_PATHS
1727
+ ):
1728
+ dataset_class = VisionArenaDataset
1729
+ args.hf_split = "train"
1730
+ args.hf_subset = None
1731
+ elif (
1732
+ args.dataset_path in MMVUDataset.SUPPORTED_DATASET_PATHS
1733
+ or args.hf_name in MMVUDataset.SUPPORTED_DATASET_PATHS
1734
+ ):
1735
+ dataset_class = MMVUDataset
1736
+ args.hf_split = "validation"
1737
+ args.hf_subset = None
1738
+ elif (
1739
+ args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS
1740
+ or args.hf_name in InstructCoderDataset.SUPPORTED_DATASET_PATHS
1741
+ ):
1742
+ dataset_class = InstructCoderDataset
1743
+ args.hf_split = "train"
1744
+ elif (
1745
+ args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS
1746
+ or args.hf_name in MTBenchDataset.SUPPORTED_DATASET_PATHS
1747
+ ):
1748
+ dataset_class = MTBenchDataset
1749
+ args.hf_split = "train"
1750
+ elif (
1751
+ args.dataset_path in MultiModalConversationDataset.SUPPORTED_DATASET_PATHS
1752
+ or args.hf_name in MultiModalConversationDataset.SUPPORTED_DATASET_PATHS
1753
+ ):
1754
+ dataset_class = MultiModalConversationDataset
1755
+ elif (
1756
+ args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS
1757
+ or args.hf_name in ConversationDataset.SUPPORTED_DATASET_PATHS
1758
+ ):
1759
+ dataset_class = ConversationDataset
1760
+ elif (
1761
+ args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS
1762
+ or args.hf_name in AIMODataset.SUPPORTED_DATASET_PATHS
1763
+ ):
1764
+ dataset_class = AIMODataset
1765
+ args.hf_split = "train"
1766
+ elif (
1767
+ args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS # noqa: E501
1768
+ or args.hf_name in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS
1769
+ ):
1770
+ dataset_class = NextEditPredictionDataset
1771
+ args.hf_split = "train"
1772
+ elif (
1773
+ args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS
1774
+ or args.hf_name in ASRDataset.SUPPORTED_DATASET_PATHS
1775
+ ):
1776
+ dataset_class = ASRDataset
1777
+ args.hf_split = "train"
1778
+ elif args.dataset_path in BlazeditDataset.SUPPORTED_DATASET_PATHS:
1779
+ dataset_class = BlazeditDataset
1780
+ args.hf_split = "train"
1781
+ hf_kwargs = {
1782
+ "min_distance": args.blazedit_min_distance,
1783
+ "max_distance": args.blazedit_max_distance,
1784
+ }
1785
+ elif (
1786
+ args.dataset_path in MLPerfDataset.SUPPORTED_DATASET_PATHS
1787
+ or args.hf_name in MLPerfDataset.SUPPORTED_DATASET_PATHS
1788
+ ):
1789
+ dataset_class = MLPerfDataset
1790
+ args.hf_split = "train"
1791
+ elif (
1792
+ args.dataset_path in MMStarDataset.SUPPORTED_DATASET_PATHS
1793
+ or args.hf_name in MMStarDataset.SUPPORTED_DATASET_PATHS
1794
+ ):
1795
+ dataset_class = MMStarDataset
1796
+ args.hf_split = "val"
1797
+ args.hf_subset = None
1798
+ else:
1799
+ supported_datasets = set(
1800
+ [
1801
+ dataset_name
1802
+ for cls in HuggingFaceDataset.__subclasses__()
1803
+ for dataset_name in cls.SUPPORTED_DATASET_PATHS
1804
+ ]
1805
+ )
1806
+ raise ValueError(
1807
+ f"Unsupported dataset path: {args.dataset_path}. "
1808
+ "Huggingface dataset only supports dataset_path"
1809
+ f" from one of following: {supported_datasets}. "
1810
+ "Please consider contributing if you would "
1811
+ "like to add support for additional dataset formats."
1812
+ )
1813
+
1814
+ if dataset_class.IS_MULTIMODAL and not (
1815
+ args.backend in ("openai-chat", "openai-audio")
1816
+ or "embeddings-" in args.backend
1817
+ ):
1818
+ # multi-modal benchmarks are only available on OpenAI chat/audio
+ # and embeddings endpoint types.
+ raise ValueError(
+ "Multi-modal content is only supported on 'openai-chat', "
+ "'openai-audio', and embeddings backends."
1823
+ )
1824
+ input_requests = dataset_class(
1825
+ dataset_path=args.dataset_path,
1826
+ dataset_subset=args.hf_subset,
1827
+ dataset_split=args.hf_split,
1828
+ random_seed=args.seed,
1829
+ no_stream=args.no_stream,
1830
+ hf_name=args.hf_name,
1831
+ disable_shuffle=args.disable_shuffle,
1832
+ ).sample(
1833
+ num_requests=args.num_prompts,
1834
+ tokenizer=tokenizer,
1835
+ output_len=args.hf_output_len,
1836
+ request_id_prefix=args.request_id_prefix,
1837
+ no_oversample=args.no_oversample,
1838
+ skip_chat_template=args.skip_chat_template,
1839
+ **hf_kwargs,
1840
+ )
1841
+
1842
+ else:
1843
+ # For datasets that follow a similar structure, use a mapping.
1844
+ dataset_mapping = {
1845
+ "spec_bench": lambda: SpecBench(
1846
+ dataset_path=args.dataset_path,
1847
+ category=args.spec_bench_category,
1848
+ disable_shuffle=args.disable_shuffle,
1849
+ ).sample(
1850
+ num_requests=args.num_prompts,
1851
+ tokenizer=tokenizer,
1852
+ output_len=args.spec_bench_output_len,
1853
+ request_id_prefix=args.request_id_prefix,
1854
+ no_oversample=args.no_oversample,
1855
+ ),
1856
+ "sharegpt": lambda: ShareGPTDataset(
1857
+ random_seed=args.seed,
1858
+ dataset_path=args.dataset_path,
1859
+ disable_shuffle=args.disable_shuffle,
1860
+ ).sample(
1861
+ tokenizer=tokenizer,
1862
+ num_requests=args.num_prompts,
1863
+ output_len=args.sharegpt_output_len,
1864
+ request_id_prefix=args.request_id_prefix,
1865
+ no_oversample=args.no_oversample,
1866
+ ),
1867
+ "burstgpt": lambda: BurstGPTDataset(
1868
+ random_seed=args.seed,
1869
+ dataset_path=args.dataset_path,
1870
+ disable_shuffle=args.disable_shuffle,
1871
+ ).sample(
1872
+ tokenizer=tokenizer,
1873
+ num_requests=args.num_prompts,
1874
+ request_id_prefix=args.request_id_prefix,
1875
+ no_oversample=args.no_oversample,
1876
+ ),
1877
+ "random": lambda: RandomDataset(
1878
+ random_seed=args.seed,
1879
+ dataset_path=args.dataset_path,
1880
+ disable_shuffle=args.disable_shuffle,
1881
+ ).sample(
1882
+ tokenizer=tokenizer,
1883
+ num_requests=args.num_prompts,
1884
+ prefix_len=args.random_prefix_len,
1885
+ input_len=args.random_input_len,
1886
+ output_len=args.random_output_len,
1887
+ range_ratio=args.random_range_ratio,
1888
+ request_id_prefix=args.request_id_prefix,
1889
+ batchsize=args.random_batch_size,
1890
+ no_oversample=args.no_oversample,
1891
+ ),
1892
+ "random-mm": lambda: RandomMultiModalDataset(
1893
+ random_seed=args.seed,
1894
+ dataset_path=args.dataset_path,
1895
+ disable_shuffle=args.disable_shuffle,
1896
+ ).sample(
1897
+ tokenizer=tokenizer,
1898
+ num_requests=args.num_prompts,
1899
+ prefix_len=args.random_prefix_len,
1900
+ range_ratio=args.random_range_ratio,
1901
+ input_len=args.random_input_len,
1902
+ output_len=args.random_output_len,
1903
+ base_items_per_request=args.random_mm_base_items_per_request,
1904
+ limit_mm_per_prompt=args.random_mm_limit_mm_per_prompt,
1905
+ num_mm_items_range_ratio=args.random_mm_num_mm_items_range_ratio,
1906
+ bucket_config=args.random_mm_bucket_config,
1907
+ request_id_prefix=args.request_id_prefix,
1908
+ no_oversample=args.no_oversample,
1909
+ ),
1910
+ "random-rerank": lambda: RandomDatasetForReranking(
1911
+ random_seed=args.seed,
1912
+ dataset_path=args.dataset_path,
1913
+ disable_shuffle=args.disable_shuffle,
1914
+ ).sample(
1915
+ tokenizer=tokenizer,
1916
+ num_requests=args.num_prompts,
1917
+ input_len=args.random_input_len,
1918
+ range_ratio=args.random_range_ratio,
1919
+ request_id_prefix=args.request_id_prefix,
1920
+ batchsize=args.random_batch_size,
1921
+ is_reranker=not args.no_reranker,
1922
+ ),
1923
+ "prefix_repetition": lambda: PrefixRepetitionRandomDataset(
1924
+ random_seed=args.seed,
1925
+ dataset_path=args.dataset_path,
1926
+ disable_shuffle=args.disable_shuffle,
1927
+ ).sample(
1928
+ tokenizer=tokenizer,
1929
+ num_requests=args.num_prompts,
1930
+ prefix_len=args.prefix_repetition_prefix_len,
1931
+ suffix_len=args.prefix_repetition_suffix_len,
1932
+ num_prefixes=args.prefix_repetition_num_prefixes,
1933
+ output_len=args.prefix_repetition_output_len,
1934
+ request_id_prefix=args.request_id_prefix,
1935
+ no_oversample=args.no_oversample,
1936
+ ),
1937
+ }
1938
+
1939
+ try:
1940
+ # Enforce endpoint compatibility for multimodal datasets.
1941
+ if args.dataset_name == "random-mm" and args.backend not in ["openai-chat"]:
1942
+ raise ValueError(
1943
+ "Multi-modal content (images) is only supported on "
1944
+ "'openai-chat' backend."
1945
+ )
1946
+ input_requests = dataset_mapping[args.dataset_name]()
1947
+ except KeyError as err:
1948
+ raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
1949
+
1950
+ return input_requests
1951
+
1952
+
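A hypothetical sketch of driving `get_samples` directly from Python instead of the CLI; the namespace fields mirror the arguments registered above, and the tokenizer choice is an assumption:

```
import argparse
from transformers import AutoTokenizer

args = argparse.Namespace(
    dataset_name="random",
    dataset_path=None,
    num_prompts=8,
    seed=0,
    disable_shuffle=False,
    no_oversample=False,
    random_prefix_len=0,
    random_input_len=128,
    random_output_len=32,
    random_range_ratio=0.0,
    random_batch_size=1,
)
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # assumed tokenizer
requests = get_samples(args, tokenizer)  # 8 synthetic SampleRequests
```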
1953
+ # -----------------------------------------------------------------------------
1954
+ # Custom Dataset Implementation
1955
+ # -----------------------------------------------------------------------------
1956
+
1957
+
1958
+ class CustomDataset(BenchmarkDataset):
1959
+ """
1960
+ Implements the Custom dataset. Loads data from a JSONL file and generates
1961
+ sample requests based on conversation turns. E.g.,
1962
+ ```
1963
+ {"prompt": "What is the capital of India?", "output_tokens": 10}
1964
+ {"prompt": "What is the capital of Iran?", "output_tokens": 1520}
1965
+ {"prompt": "What is the capital of China?", "output_tokens": 819}
1966
+ ```
1967
+ Note that the 'output_tokens' column is optional and only has to be
+ provided if the 'custom-output-len' argument is None or -1.
1969
+ """
1970
+
1971
+ def __init__(self, **kwargs) -> None:
1972
+ super().__init__(**kwargs)
1973
+ self.load_data()
1974
+
1975
+ def load_data(self) -> None:
1976
+ if self.dataset_path is None:
1977
+ raise ValueError("dataset_path must be provided for loading data.")
1978
+
1979
+ # self.data will be a list of dictionaries
1980
+ # e.g., [{"prompt": "What is the capital of India?"}, ...]
1981
+ # This will be the standardized format which load_data()
1982
+ # has to convert into depending on the filetype of dataset_path.
1983
+ # sample() will assume this standardized format of self.data
1984
+ self.data = []
1985
+
1986
+ # Load the JSONL file
1987
+ if self.dataset_path.endswith(".jsonl"):
1988
+ jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)
1989
+
1990
+ # check if the JSONL file has a 'prompt' column
1991
+ if "prompt" not in jsonl_data.columns:
1992
+ raise ValueError("JSONL file must contain a 'prompt' column.")
1993
+
1994
+ # Convert each row to a dictionary and append to self.data
1995
+ # This will convert the DataFrame to a list of dictionaries
1996
+ # where each dictionary corresponds to a row in the DataFrame.
1997
+ # This is the standardized format we want for self.data
1998
+ for _, row in jsonl_data.iterrows():
1999
+ self.data.append(row.to_dict())
2000
+ else:
2001
+ raise NotImplementedError(
2002
+ "Only JSONL format is supported for CustomDataset."
2003
+ )
2004
+
2005
+ random.seed(self.random_seed)
2006
+ if not getattr(self, "disable_shuffle", False):
2007
+ random.shuffle(self.data)
2008
+
2009
+ def sample(
2010
+ self,
2011
+ tokenizer: TokenizerLike,
2012
+ num_requests: int,
2013
+ lora_path: str | None = None,
2014
+ max_loras: int | None = None,
2015
+ output_len: int | None = None,
2016
+ enable_multimodal_chat: bool = False,
2017
+ skip_chat_template: bool = False,
2018
+ request_id_prefix: str = "",
2019
+ no_oversample: bool = False,
2020
+ **kwargs,
2021
+ ) -> list:
2022
+ # Fall back to using every available sample when num_requests
+ # is not positive.
2023
+ self.num_available_samples = len(self.data)
2024
+ if num_requests <= 0:
2025
+ num_requests = self.num_available_samples
2026
+ logger.info(
2027
+ "num_requests is set to 0 or negative, "
2028
+ "so using all available samples: %d",
2029
+ num_requests,
2030
+ )
2031
+
2032
+ sampled_requests = []
2033
+ for i, item in enumerate(self.data):
2034
+ if len(sampled_requests) >= num_requests:
2035
+ break
2036
+ prompt = item["prompt"]
2037
+
2038
+ new_output_len = output_len
2039
+ if output_len is None or output_len == -1:
2040
+ # check that the request has an 'output_tokens' field
2041
+ if "output_tokens" not in item:
2042
+ raise ValueError(
2043
+ "If no output length is provided the "
2044
+ "custom dataset must contain an 'output_tokens' field."
2045
+ )
2046
+ # Use number of output tokens from the request data
2047
+ try:
2048
+ new_output_len = int(item["output_tokens"])
2049
+ except (ValueError, TypeError) as e:
2050
+ raise ValueError(
2051
+ f"Invalid value for 'output_tokens' in custom dataset: "
2052
+ f"'{item['output_tokens']}'. Must be an integer."
2053
+ ) from e
2054
+
2055
+ # apply template
2056
+ if not skip_chat_template:
2057
+ prompt = tokenizer.apply_chat_template(
2058
+ [{"role": "user", "content": prompt}],
2059
+ add_generation_prompt=True,
2060
+ tokenize=False,
2061
+ )
2062
+
2063
+ prompt_len = len(tokenizer(prompt).input_ids)
2064
+ sampled_requests.append(
2065
+ SampleRequest(
2066
+ prompt=prompt,
2067
+ prompt_len=prompt_len,
2068
+ expected_output_len=new_output_len,
2069
+ request_id=request_id_prefix + str(i),
2070
+ )
2071
+ )
2072
+ self.maybe_oversample_requests(
2073
+ sampled_requests, num_requests, request_id_prefix, no_oversample
2074
+ )
2075
+
2076
+ return sampled_requests
2077
+
2078
+
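A hypothetical usage sketch: with `output_len=-1`, each JSONL row must carry the `output_tokens` field described in the docstring above:

```
dataset = CustomDataset(dataset_path="prompts.jsonl")  # path is an assumption
requests = dataset.sample(
    tokenizer=tokenizer,  # any HF-style tokenizer
    num_requests=2,
    output_len=-1,  # use per-row 'output_tokens' values
)
```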
2079
+ # -----------------------------------------------------------------------------
2080
+ # Spec Bench Dataset Implementation
2081
+ # -----------------------------------------------------------------------------
2082
+
2083
+
2084
+ class SpecBench(CustomDataset):
2085
+ """
2086
+ Implements the SpecBench dataset: https://github.com/hemingkx/Spec-Bench
2087
+ Download the dataset using:
2088
+ wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
2089
+ """ # noqa: E501
2090
+
2091
+ def __init__(self, **kwargs) -> None:
+ self.category = kwargs.pop("category", None)
+ # CustomDataset.__init__ already calls self.load_data(), which
+ # dispatches to the SpecBench override, so no second call is needed.
+ super().__init__(**kwargs)
2095
+
2096
+ def load_data(self) -> None:
2097
+ if self.dataset_path is None:
2098
+ raise ValueError("dataset_path must be provided for loading data.")
2099
+
2100
+ self.data = []
2101
+
2102
+ # Load the JSONL file
2103
+ jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)
2104
+
2105
+ # check if the JSONL file has a 'turns' column
2106
+ if "turns" not in jsonl_data.columns:
2107
+ raise ValueError("JSONL file must contain a 'turns' column.")
2108
+
2109
+ for _, row in jsonl_data.iterrows():
2110
+ # sample only from a specific category if specified
2111
+ if (not self.category) or (self.category == row["category"]):
2112
+ prompt = row["turns"][0]
2113
+ self.data.append({"prompt": prompt})
2114
+
2115
+ random.seed(self.random_seed)
2116
+ if not getattr(self, "disable_shuffle", False):
2117
+ random.shuffle(self.data)
2118
+
2119
+ def sample(self, **kwargs) -> list:
2120
+ # leverage CustomDataset sample
2121
+ return super().sample(**kwargs)
2122
+
2123
+
2124
+ # -----------------------------------------------------------------------------
2125
+ # Sonnet Dataset Implementation
2126
+ # -----------------------------------------------------------------------------
2127
+
2128
+
2129
+ @deprecated(
2130
+ "SonnetDataset is deprecated and will be removed in a future version.",
2131
+ )
2132
+ class SonnetDataset(BenchmarkDataset):
2133
+ """
2134
+ Simplified implementation of the Sonnet dataset. Loads poem lines from a
2135
+ text file and generates sample requests. Default values here copied from
2136
+ `benchmark_serving.py` for the sonnet dataset.
2137
+ """
2138
+
2139
+ DEFAULT_PREFIX_LEN = 200
2140
+ DEFAULT_INPUT_LEN = 550
2141
+ DEFAULT_OUTPUT_LEN = 150
2142
+
2143
+ def __init__(
2144
+ self,
2145
+ **kwargs,
2146
+ ) -> None:
2147
+ super().__init__(**kwargs)
2148
+ self.load_data()
2149
+
2150
+ def load_data(self) -> None:
2151
+ if not self.dataset_path:
2152
+ raise ValueError("dataset_path must be provided.")
2153
+ with open(self.dataset_path, encoding="utf-8") as f:
2154
+ self.data = f.readlines()
2155
+
2156
+ def sample(
2157
+ self,
2158
+ tokenizer: TokenizerLike,
2159
+ num_requests: int,
2160
+ prefix_len: int = DEFAULT_PREFIX_LEN,
2161
+ input_len: int = DEFAULT_INPUT_LEN,
2162
+ output_len: int = DEFAULT_OUTPUT_LEN,
2163
+ return_prompt_formatted: bool = False,
2164
+ request_id_prefix: str = "",
2165
+ no_oversample: bool = False,
2166
+ **kwargs,
2167
+ ) -> list:
2168
+ # Calculate average token length for a poem line.
2169
+ tokenized_lines = [tokenizer(line).input_ids for line in self.data]
2170
+ avg_len = sum(len(tokens) for tokens in tokenized_lines) / len(tokenized_lines)
2171
+
2172
+ # Build the base prompt.
2173
+ base_prompt = "Pick as many lines as you can from these poem lines:\n"
2174
+ base_msg = [{"role": "user", "content": base_prompt}]
2175
+ base_fmt = tokenizer.apply_chat_template(
2176
+ base_msg, add_generation_prompt=True, tokenize=False
2177
+ )
2178
+ base_offset = len(tokenizer(base_fmt).input_ids)
2179
+ if input_len <= base_offset:
2180
+ raise ValueError(
2181
+ f"'input_len' must be higher than the base prompt length "
2182
+ f"({base_offset})."
2183
+ )
2184
+
2185
+ # Determine how many poem lines to use.
2186
+ num_input_lines = round((input_len - base_offset) / avg_len)
2187
+ num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
2188
+ prefix_lines = self.data[:num_prefix_lines]
2189
+
2190
+ samples = []
2191
+ ind = 0
2192
+ while len(samples) < num_requests:
2193
+ extra_lines = random.choices(
2194
+ self.data, k=num_input_lines - num_prefix_lines
2195
+ )
2196
+ prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
2197
+ msg = [{"role": "user", "content": prompt}]
2198
+ prompt_formatted = tokenizer.apply_chat_template(
2199
+ msg, add_generation_prompt=True, tokenize=False
2200
+ )
2201
+ prompt_len = len(tokenizer(prompt_formatted).input_ids)
2202
+ if prompt_len <= input_len:
2203
+ samples.append(
2204
+ SampleRequest(
2205
+ prompt=prompt_formatted if return_prompt_formatted else prompt,
2206
+ prompt_len=prompt_len,
2207
+ expected_output_len=output_len,
2208
+ request_id=request_id_prefix + str(ind),
2209
+ )
2210
+ )
2211
+ ind += 1
2212
+ return samples
2213
+
2214
+
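The line-count targets above are plain arithmetic; a standalone sketch with assumed values (10 tokens per line on average, a 50-token base prompt):

```
avg_len, base_offset = 10, 50
input_len, prefix_len = 550, 200
num_input_lines = round((input_len - base_offset) / avg_len)            # 50
num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)  # 15
assert (num_input_lines, num_prefix_lines) == (50, 15)
```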
2215
+ # -----------------------------------------------------------------------------
2216
+ # BurstGPT Dataset Implementation
2217
+ # -----------------------------------------------------------------------------
2218
+
2219
+
2220
+ class BurstGPTDataset(BenchmarkDataset):
2221
+ """
2222
+ Implements the BurstGPT dataset. Loads data from a CSV file and generates
+ sample requests with synthetic prompts. Only rows with Model "GPT-4"
+ and a positive response-token count are used.
2225
+ """
2226
+
2227
+ def __init__(self, **kwargs) -> None:
2228
+ super().__init__(**kwargs)
2229
+ self.load_data()
2230
+
2231
+ def load_data(
2232
+ self,
2233
+ ):
2234
+ if self.dataset_path is None:
2235
+ raise ValueError("dataset_path must be provided for loading data.")
2236
+
2237
+ df = pd.read_csv(self.dataset_path)
2238
+ # Filter to keep only GPT-4 rows.
2239
+ gpt4_df = df[df["Model"] == "GPT-4"]
2240
+ # Remove failed requests (where Response tokens is 0 or less).
2241
+ gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
2242
+ # Keep the filtered rows; sampling happens in _sample_loaded_data().
+ self.data = gpt4_df
2244
+
2245
+ def _sample_loaded_data(self, num_requests: int) -> list:
2246
+ if num_requests <= len(self.data):
2247
+ data = self.data.sample(n=num_requests, random_state=self.random_seed)
2248
+ else:
2249
+ data = self.data.sample(
2250
+ n=num_requests,
2251
+ random_state=self.random_seed,
2252
+ replace=True,
2253
+ )
2254
+ # Convert the dataframe to a list of lists.
2255
+ return data.values.tolist()
2256
+
2257
+ def sample(
2258
+ self,
2259
+ tokenizer: TokenizerLike,
2260
+ num_requests: int,
2261
+ max_loras: int | None = None,
2262
+ lora_path: str | None = None,
2263
+ request_id_prefix: str = "",
2264
+ no_oversample: bool = False,
2265
+ **kwargs,
2266
+ ) -> list[SampleRequest]:
2267
+ samples = []
2268
+ data = self._sample_loaded_data(num_requests=num_requests)
2269
+ for i in range(num_requests):
2270
+ input_len = int(data[i][2])
2271
+ output_len = int(data[i][3])
2272
+ lora_req = self.get_random_lora_request(
2273
+ max_loras=max_loras, lora_path=lora_path
2274
+ )
2275
+ vocab_size = tokenizer.vocab_size
2276
+ # Generate a synthetic prompt: a list of token IDs computed as (i +
2277
+ # j) modulo vocab_size.
2278
+ token_ids = [(i + j) % vocab_size for j in range(input_len)]
2279
+ prompt = tokenizer.decode(token_ids)
2280
+ samples.append(
2281
+ SampleRequest(
2282
+ prompt=prompt,
2283
+ prompt_len=input_len,
2284
+ expected_output_len=output_len,
2285
+ lora_request=lora_req,
2286
+ request_id=request_id_prefix + str(i),
2287
+ )
2288
+ )
2289
+ return samples
2290
+
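The synthetic prompt used above is a deterministic token ramp, so any (request index, input length) pair reproduces the same prompt; a minimal sketch:

```
vocab_size = 50_000
i, input_len = 3, 5  # request index and target prompt length
token_ids = [(i + j) % vocab_size for j in range(input_len)]
assert token_ids == [3, 4, 5, 6, 7]
```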
2291
+
2292
+ # -----------------------------------------------------------------------------
2293
+ # HuggingFace Dataset Base Implementation
2294
+ # -----------------------------------------------------------------------------
2295
+ class HuggingFaceDataset(BenchmarkDataset):
2296
+ """Base class for datasets hosted on HuggingFace."""
2297
+
2298
+ SUPPORTED_DATASET_PATHS: set[str] | dict[str, Callable] = set()
2299
+
2300
+ def __init__(
2301
+ self,
2302
+ dataset_path: str,
2303
+ dataset_split: str,
2304
+ no_stream: bool = False,
2305
+ dataset_subset: str | None = None,
2306
+ hf_name: str | None = None,
2307
+ **kwargs,
2308
+ ) -> None:
2309
+ super().__init__(dataset_path=dataset_path, **kwargs)
2310
+
2311
+ self.dataset_split = dataset_split
2312
+ self.dataset_subset = dataset_subset
2313
+ self.load_stream = not no_stream
2314
+ self.hf_name = hf_name or dataset_path
2315
+ self.load_data()
2316
+
2317
+ def load_data(self) -> None:
2318
+ """Load data from HuggingFace datasets."""
2319
+ self.data = load_dataset(
2320
+ self.dataset_path,
2321
+ name=self.dataset_subset,
2322
+ split=self.dataset_split,
2323
+ streaming=self.load_stream,
2324
+ )
2325
+ if not getattr(self, "disable_shuffle", False):
2326
+ self.data = self.data.shuffle(seed=self.random_seed)
2327
+
2328
+
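A minimal sketch of adding a new subclass on top of this base; the dataset id and the `prompt` column are placeholders, not a real integration:

```
class MyPromptDataset(HuggingFaceDataset):
    SUPPORTED_DATASET_PATHS = {"org/my-prompts"}  # hypothetical dataset id

    def sample(self, tokenizer, num_requests, request_id_prefix="", **kwargs):
        requests = []
        for i, item in enumerate(self.data):
            if len(requests) >= num_requests:
                break
            prompt = item["prompt"]  # assumed column name
            requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=len(tokenizer(prompt).input_ids),
                    expected_output_len=128,
                    request_id=request_id_prefix + str(i),
                )
            )
        return requests
```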
2329
+ # -----------------------------------------------------------------------------
2330
+ # Conversation Dataset Implementation
2331
+ # -----------------------------------------------------------------------------
2332
+
2333
+
2334
+ class ConversationDataset(HuggingFaceDataset):
2335
+ """Dataset for text-only conversation data."""
2336
+
2337
+ SUPPORTED_DATASET_PATHS = {
2338
+ "Aeala/ShareGPT_Vicuna_unfiltered",
2339
+ }
2340
+ IS_MULTIMODAL = False
2341
+
2342
+ def sample(
2343
+ self,
2344
+ tokenizer: TokenizerLike,
2345
+ num_requests: int,
2346
+ output_len: int | None = None,
2347
+ enable_multimodal_chat: bool = False,
2348
+ request_id_prefix: str = "",
2349
+ no_oversample: bool = False,
2350
+ **kwargs,
2351
+ ) -> list:
2352
+ # Filter examples with at least 2 conversations
2353
+ filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
2354
+ sampled_requests = []
2355
+ ind = 0
2356
+ dynamic_output = output_len is None
2357
+
2358
+ for item in filtered_data:
2359
+ if len(sampled_requests) >= num_requests:
2360
+ break
2361
+ conv = item["conversations"]
2362
+ prompt, completion = conv[0]["value"], conv[1]["value"]
2363
+
2364
+ prompt_ids = tokenizer(prompt).input_ids
2365
+ completion_ids = tokenizer(completion).input_ids
2366
+ prompt_len = len(prompt_ids)
2367
+ completion_len = len(completion_ids)
2368
+ output_len = completion_len if dynamic_output else output_len
2369
+ assert isinstance(output_len, int) and output_len > 0
2370
+ if dynamic_output and not is_valid_sequence(prompt_len, completion_len):
2371
+ continue
2372
+ mm_content = process_image(item["image"]) if "image" in item else None
2373
+ if enable_multimodal_chat:
2374
+ # Note: when chat is enabled the request prompt_len is no longer
2375
+ # accurate and we will be using request output to count the
2376
+ # actual prompt len and output len
2377
+ prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
2378
+ sampled_requests.append(
2379
+ SampleRequest(
2380
+ prompt=prompt,
2381
+ prompt_len=prompt_len,
2382
+ expected_output_len=output_len,
2383
+ multi_modal_data=mm_content,
2384
+ request_id=request_id_prefix + str(ind),
2385
+ )
2386
+ )
2387
+ ind += 1
2388
+ self.maybe_oversample_requests(
2389
+ sampled_requests, num_requests, request_id_prefix, no_oversample
2390
+ )
2391
+ return sampled_requests
2392
+
2393
+
2394
+ class MultiModalConversationDataset(HuggingFaceDataset):
2395
+ """Dataset for multimodal conversation data."""
2396
+
2397
+ SUPPORTED_DATASET_PATHS = {
2398
+ "lmms-lab/LLaVA-OneVision-Data",
2399
+ }
2400
+ IS_MULTIMODAL = True
2401
+
2402
+ def sample(
2403
+ self,
2404
+ tokenizer: TokenizerLike,
2405
+ num_requests: int,
2406
+ output_len: int | None = None,
2407
+ enable_multimodal_chat: bool = False,
2408
+ request_id_prefix: str = "",
2409
+ no_oversample: bool = False,
2410
+ **kwargs,
2411
+ ) -> list:
2412
+ # Filter examples with at least 2 conversations
2413
+ filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
2414
+ sampled_requests = []
2415
+ ind = 0
2416
+ dynamic_output = output_len is None
2417
+
2418
+ for item in filtered_data:
2419
+ if len(sampled_requests) >= num_requests:
2420
+ break
2421
+ conv = item["conversations"]
2422
+ prompt, completion = conv[0]["value"], conv[1]["value"]
2423
+
2424
+ prompt_ids = tokenizer(prompt).input_ids
2425
+ completion_ids = tokenizer(completion).input_ids
2426
+ prompt_len = len(prompt_ids)
2427
+ completion_len = len(completion_ids)
2428
+ output_len = completion_len if dynamic_output else output_len
2429
+ assert isinstance(output_len, int) and output_len > 0
2430
+ if dynamic_output and not is_valid_sequence(prompt_len, completion_len):
2431
+ continue
2432
+ mm_content = process_image(item["image"]) if "image" in item else None
2433
+ if enable_multimodal_chat:
2434
+ # Note: when chat is enabled the request prompt_len is no longer
2435
+ # accurate and we will be using request output to count the
2436
+ # actual prompt len and output len
2437
+ prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
2438
+ sampled_requests.append(
2439
+ SampleRequest(
2440
+ prompt=prompt,
2441
+ prompt_len=prompt_len,
2442
+ expected_output_len=output_len,
2443
+ multi_modal_data=mm_content,
2444
+ request_id=request_id_prefix + str(ind),
2445
+ )
2446
+ )
2447
+ ind += 1
2448
+ self.maybe_oversample_requests(
2449
+ sampled_requests, num_requests, request_id_prefix, no_oversample
2450
+ )
2451
+ return sampled_requests
2452
+
2453
+
2454
+ # -----------------------------------------------------------------------------
2455
+ # Vision Arena Dataset Implementation
2456
+ # -----------------------------------------------------------------------------
2457
+
2458
+
2459
+ class VisionArenaDataset(HuggingFaceDataset):
2460
+ """
2461
+ Vision Arena Dataset.
2462
+ """
2463
+
2464
+ DEFAULT_OUTPUT_LEN = 128
2465
+ SUPPORTED_DATASET_PATHS = {
2466
+ "lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"],
2467
+ "lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"],
2468
+ }
2469
+ IS_MULTIMODAL = True
2470
+
2471
+ def sample(
2472
+ self,
2473
+ tokenizer: TokenizerLike,
2474
+ num_requests: int,
2475
+ output_len: int | None = None,
2476
+ enable_multimodal_chat: bool = False,
2477
+ request_id_prefix: str = "",
2478
+ no_oversample: bool = False,
2479
+ **kwargs,
2480
+ ) -> list:
2481
+ output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
2482
+ sampled_requests = []
2483
+ for i, item in enumerate(self.data):
2484
+ if len(sampled_requests) >= num_requests:
2485
+ break
2486
+ parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name)
2487
+ if parser_fn is None:
2488
+ raise ValueError(f"Unsupported dataset path: {self.hf_name}")
2489
+ prompt = parser_fn(item)
2490
+ mm_content = process_image(item["images"][0])
2491
+ prompt_len = len(tokenizer(prompt).input_ids)
2492
+ if enable_multimodal_chat:
2493
+ # Note: when chat is enabled the request prompt_len is no longer
2494
+ # accurate and we will be using request output to count the
2495
+ # actual prompt len
2496
+ prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
2497
+ sampled_requests.append(
2498
+ SampleRequest(
2499
+ prompt=prompt,
2500
+ prompt_len=prompt_len,
2501
+ expected_output_len=output_len,
2502
+ multi_modal_data=mm_content,
2503
+ request_id=request_id_prefix + str(i),
2504
+ )
2505
+ )
2506
+ self.maybe_oversample_requests(
2507
+ sampled_requests, num_requests, request_id_prefix, no_oversample
2508
+ )
2509
+ return sampled_requests
+
+
+ class MMVUDataset(HuggingFaceDataset):
+     """
+     MMVU Dataset.
+     https://huggingface.co/datasets/yale-nlp/MMVU
+     """
+
+     DEFAULT_OUTPUT_LEN = 128
+     SUPPORTED_DATASET_PATHS = {
+         "yale-nlp/MMVU": lambda x: x["question"]
+         + " "
+         + (" ".join(f"{k}.{v}" for k, v in x["choices"].items())),
+     }
+
+     def sample(
+         self,
+         tokenizer: TokenizerLike,
+         num_requests: int,
+         output_len: int | None = None,
+         enable_multimodal_chat: bool = False,
+         request_id_prefix: str = "",
+         no_oversample: bool = False,
+         **kwargs,
+     ) -> list:
+         output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
+         sampled_requests = []
+         for i, item in enumerate(self.data):
+             if len(sampled_requests) >= num_requests:
+                 break
+             parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name)
+             if parser_fn is None:
+                 raise ValueError(f"Unsupported dataset path: {self.hf_name}")
+             prompt = parser_fn(item)
+             mm_content = process_video(item["video"])
+             prompt_len = len(tokenizer(prompt).input_ids)
+             if enable_multimodal_chat:
+                 # Note: when chat is enabled the request prompt_len is no longer
+                 # accurate and we will be using the request output to count the
+                 # actual prompt len
+                 prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
+             sampled_requests.append(
+                 SampleRequest(
+                     prompt=prompt,
+                     prompt_len=prompt_len,
+                     expected_output_len=output_len,
+                     multi_modal_data=mm_content,
+                     request_id=request_id_prefix + str(i),
+                 )
+             )
+         self.maybe_oversample_requests(
+             sampled_requests, num_requests, request_id_prefix, no_oversample
+         )
+         return sampled_requests
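+
+
+ # Editor's note: a minimal illustrative sketch, not part of vLLM. It evaluates
+ # the MMVU prompt lambda above on a toy record; the field values are invented.
+ def _demo_mmvu_prompt() -> str:  # hypothetical helper, never called
+     record = {"question": "What is shown?", "choices": {"A": "a cat", "B": "a dog"}}
+     build = MMVUDataset.SUPPORTED_DATASET_PATHS["yale-nlp/MMVU"]
+     return build(record)  # -> "What is shown? A.a cat B.a dog"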
+
+
+ # -----------------------------------------------------------------------------
+ # Instruct Coder Dataset Implementation
+ # -----------------------------------------------------------------------------
+
+
+ class InstructCoderDataset(HuggingFaceDataset):
+     """
+     InstructCoder Dataset.
+     https://huggingface.co/datasets/likaixin/InstructCoder
+
+     InstructCoder is a dataset designed for general code editing. It consists
+     of 114,239 instruction-input-output triplets and covers multiple distinct
+     code editing scenarios.
+     """
+
+     DEFAULT_OUTPUT_LEN = 200  # the average output length, used as the default
+     SUPPORTED_DATASET_PATHS = {
+         "likaixin/InstructCoder",
+     }
+
+     def sample(
+         self,
+         tokenizer: TokenizerLike,
+         num_requests: int,
+         output_len: int | None = None,
+         enable_multimodal_chat: bool = False,
+         skip_chat_template: bool = False,
+         request_id_prefix: str = "",
+         no_oversample: bool = False,
+         **kwargs,
+     ) -> list:
+         output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
+         sampled_requests = []
+         for i, item in enumerate(self.data):
+             if len(sampled_requests) >= num_requests:
+                 break
+             prompt = (
+                 f"{item['input']}\n\n{item['instruction']} Just output "
+                 "the code, do not include any explanation."
+             )
+
+             # apply the chat template unless explicitly skipped
+             if not skip_chat_template:
+                 prompt = tokenizer.apply_chat_template(
+                     [{"role": "user", "content": prompt}],
+                     add_generation_prompt=True,
+                     tokenize=False,
+                 )
+
+             prompt_len = len(tokenizer(prompt).input_ids)
+             sampled_requests.append(
+                 SampleRequest(
+                     prompt=prompt,
+                     prompt_len=prompt_len,
+                     expected_output_len=output_len,
+                     request_id=request_id_prefix + str(i),
+                 )
+             )
+         self.maybe_oversample_requests(
+             sampled_requests, num_requests, request_id_prefix, no_oversample
+         )
+         return sampled_requests
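+
+
+ # Editor's note: a minimal illustrative sketch, not part of vLLM. It builds the
+ # same prompt string as InstructCoderDataset.sample for a toy record, before
+ # any chat template is applied.
+ def _demo_instruct_coder_prompt() -> str:  # hypothetical helper, never called
+     item = {"input": "def add(a, b):\n    return a - b", "instruction": "Fix the bug."}
+     return (
+         f"{item['input']}\n\n{item['instruction']} Just output "
+         "the code, do not include any explanation."
+     )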
+
+
+ # -----------------------------------------------------------------------------
+ # MT-Bench Dataset Implementation
+ # -----------------------------------------------------------------------------
+
+
+ class MTBenchDataset(HuggingFaceDataset):
+     """
+     MT-Bench Dataset.
+     https://huggingface.co/datasets/philschmid/mt-bench
+
+     We create a single-turn dataset for MT-Bench.
+     This is similar to the speculative decoding benchmark setup in vLLM
+     https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
+     """  # noqa: E501
+
+     DEFAULT_OUTPUT_LEN = 256  # avg len used in the SD benchmark in vLLM
+     SUPPORTED_DATASET_PATHS = {
+         "philschmid/mt-bench",
+     }
+
+     def sample(
+         self,
+         tokenizer: TokenizerLike,
+         num_requests: int,
+         output_len: int | None = None,
+         enable_multimodal_chat: bool = False,
+         skip_chat_template: bool = False,
+         request_id_prefix: str = "",
+         no_oversample: bool = False,
+         **kwargs,
+     ) -> list:
+         output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
+         sampled_requests = []
+
+         for i, item in enumerate(self.data):
+             if len(sampled_requests) >= num_requests:
+                 break
+             prompt = item["turns"][0]
+
+             # apply the chat template unless explicitly skipped
+             if not skip_chat_template:
+                 prompt = tokenizer.apply_chat_template(
+                     [{"role": "user", "content": prompt}],
+                     add_generation_prompt=True,
+                     tokenize=False,
+                 )
+
+             prompt_len = len(tokenizer(prompt).input_ids)
+             sampled_requests.append(
+                 SampleRequest(
+                     prompt=prompt,
+                     prompt_len=prompt_len,
+                     expected_output_len=output_len,
+                     request_id=request_id_prefix + str(i),
+                 )
+             )
+         self.maybe_oversample_requests(
+             sampled_requests, num_requests, request_id_prefix, no_oversample
+         )
+         return sampled_requests
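+
+
+ # Editor's note: a minimal illustrative sketch, not part of vLLM. MT-Bench
+ # records are multi-turn; the sampler above keeps only the first turn.
+ def _demo_mt_bench_first_turn() -> str:  # hypothetical helper, never called
+     item = {"turns": ["Write a travel blog post.", "Rewrite it as a haiku."]}
+     return item["turns"][0]  # only the first turn is benchmarked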
+
+
+ # -----------------------------------------------------------------------------
+ # Blazedit Dataset Implementation
+ # -----------------------------------------------------------------------------
+
+
+ class BlazeditDataset(HuggingFaceDataset):
+     """
+     Blazedit Dataset.
+     https://github.com/ise-uiuc/blazedit
+
+     5k char version: vdaita/edit_5k_char
+     10k char version: vdaita/edit_10k_char
+     """  # noqa: E501
+
+     # The 5k char version has outputs of ~5k chars and the 10k char version
+     # outputs of ~10k chars. Assuming 3 chars per token, 10k chars is about
+     # 3333 tokens, so we set the default to 4000 to be safe.
+     DEFAULT_OUTPUT_LEN = 4000
+     SUPPORTED_DATASET_PATHS = {
+         "vdaita/edit_5k_char",
+         "vdaita/edit_10k_char",
+     }
+
+     def sample(
+         self,
+         tokenizer: TokenizerLike,
+         num_requests: int,
+         output_len: int | None = None,
+         skip_chat_template: bool = False,
+         request_id_prefix: str = "",
+         no_oversample: bool = False,
+         min_distance: float = 0.0,
+         max_distance: float = 1.0,
+         **kwargs,
+     ) -> list:
+         output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
+         sampled_requests = []
+
+         for i, item in enumerate(self.data):
+             if len(sampled_requests) >= num_requests:
+                 break
+             code = item["code"]
+             change_request = item["change_request"]
+             norm_distance = item["norm_distance"]
+
+             # filter on the Levenshtein distance normalized by code length
+             if norm_distance < min_distance or norm_distance > max_distance:
+                 continue
+
+             # template copied from
+             # https://github.com/ise-uiuc/blazedit/blob/7765137e656fd62de877422d2e4cf8de51228054/dataset/create_refined_dataset.py#L94-L105  # noqa: E501
+             prompt = f"""Given a code file, please apply the change requests and generate the new file.
+
+ Original file:
+ ```python
+ {code}
+ ```
+
+ Change request:
+ {change_request}
+
+ Please generate the new code file in the "New file" section below."""  # noqa: E501
+
+             # apply the chat template unless explicitly skipped
+             if not skip_chat_template:
+                 prompt = tokenizer.apply_chat_template(
+                     [{"role": "user", "content": prompt}],
+                     add_generation_prompt=True,
+                     tokenize=False,
+                 )
+
+             prompt_len = len(tokenizer(prompt).input_ids)
+
+             sampled_requests.append(
+                 SampleRequest(
+                     prompt=prompt,
+                     prompt_len=prompt_len,
+                     expected_output_len=output_len,
+                     request_id=request_id_prefix + str(i),
+                 )
+             )
+         self.maybe_oversample_requests(
+             sampled_requests, num_requests, request_id_prefix, no_oversample
+         )
+
+         return sampled_requests
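+
+
+ # Editor's note: a minimal illustrative sketch, not part of vLLM. It mirrors
+ # the normalized-distance gate above: a record is kept only when its
+ # norm_distance falls within [min_distance, max_distance].
+ def _demo_blazedit_keep(
+     norm_distance: float, min_distance: float = 0.0, max_distance: float = 1.0
+ ) -> bool:  # hypothetical helper, never called
+     return min_distance <= norm_distance <= max_distance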
+
+
+ # -----------------------------------------------------------------------------
+ # AIMO Dataset Implementation
+ # -----------------------------------------------------------------------------
+
+
+ class AIMODataset(HuggingFaceDataset):
+     """
+     Dataset class for processing an AIMO dataset with reasoning questions.
+     """
+
+     SUPPORTED_DATASET_PATHS = {
+         "AI-MO/aimo-validation-aime",
+         "AI-MO/NuminaMath-1.5",
+         "AI-MO/NuminaMath-CoT",
+     }
+
+     def sample(
+         self,
+         tokenizer: TokenizerLike,
+         num_requests: int,
+         output_len: int | None = None,
+         request_id_prefix: str = "",
+         no_oversample: bool = False,
+         **kwargs,
+     ) -> list:
+         sampled_requests = []
+         ind = 0
+         dynamic_output = output_len is None
+
+         for item in self.data:
+             if len(sampled_requests) >= num_requests:
+                 break
+             prompt, completion = item["problem"], item["solution"]
+
+             prompt_ids = tokenizer(prompt).input_ids
+             completion_ids = tokenizer(completion).input_ids
+             prompt_len = len(prompt_ids)
+             completion_len = len(completion_ids)
+             output_len = completion_len if dynamic_output else output_len
+             assert isinstance(output_len, int) and output_len > 0
+             if dynamic_output and not is_valid_sequence(
+                 prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000
+             ):
+                 continue
+             sampled_requests.append(
+                 SampleRequest(
+                     prompt=prompt,
+                     prompt_len=prompt_len,
+                     expected_output_len=output_len,
+                     multi_modal_data=None,
+                     request_id=request_id_prefix + str(ind),
+                 )
+             )
+             ind += 1
+         self.maybe_oversample_requests(
+             sampled_requests, num_requests, request_id_prefix, no_oversample
+         )
+         return sampled_requests
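+
+
+ # Editor's note: a minimal illustrative sketch, not part of vLLM. When no
+ # output length is supplied, AIMODataset.sample above sizes each request's
+ # expected output to the token count of the reference solution.
+ def _demo_aimo_output_len(output_len: int | None, completion_len: int) -> int:
+     # hypothetical helper, never called; mirrors the dynamic-output logic above
+     return completion_len if output_len is None else output_len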
+
+
+ # -----------------------------------------------------------------------------
+ # Next Edit Prediction Dataset Implementation
+ # -----------------------------------------------------------------------------
+
+
+ zeta_prompt = """### Instruction:
+ You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.
+
+ ### User Edits:
+
+ {}
+
+ ### User Excerpt:
+
+ {}
+
+ ### Response:
+
+ """  # noqa: E501
+
+
+ def _format_zeta_prompt(
+     sample: dict, original_start_marker: str = "<|editable_region_start|>"
+ ) -> dict:
+     """Format the zeta prompt for the Next Edit Prediction (NEP) dataset.
+
+     This function formats examples from the NEP dataset
+     into prompts and expected outputs. It could be
+     further extended to support more NEP datasets.
+
+     Args:
+         sample: The dataset sample containing events,
+             inputs, and outputs.
+         original_start_marker: The marker indicating the
+             start of the editable region. Defaults to
+             "<|editable_region_start|>".
+
+     Returns:
+         A dictionary with the formatted prompts and expected outputs.
+     """
+     events = sample["events"]
+     input = sample["input"]
+     output = sample["output"]
+     prompt = zeta_prompt.format(events, input)
+
+     # following the original implementation, extract the focused region
+     # from the raw output
+     output_start_index = output.find(original_start_marker)
+     output_focused_region = output[output_start_index:]
+     expected_output = output_focused_region
+
+     return {"prompt": prompt, "expected_output": expected_output}
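+
+
+ # Editor's note: a minimal illustrative sketch, not part of vLLM. It runs a toy
+ # NEP record through _format_zeta_prompt; the expected output is the raw
+ # output truncated to start at the editable-region marker.
+ def _demo_zeta_format() -> dict:  # hypothetical helper, never called
+     sample = {
+         "events": "User renamed foo to bar",
+         "input": "<|editable_region_start|>def foo(): ...",
+         "output": "prefix<|editable_region_start|>def bar(): ...",
+     }
+     formatted = _format_zeta_prompt(sample)
+     # formatted["expected_output"] == "<|editable_region_start|>def bar(): ..."
+     return formatted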
+
+
+ class NextEditPredictionDataset(HuggingFaceDataset):
+     """
+     Dataset class for processing a Next Edit Prediction dataset.
+     """
+
+     SUPPORTED_DATASET_PATHS = {
+         "zed-industries/zeta",
+     }
+     MAPPING_PROMPT_FUNCS = {
+         "zed-industries/zeta": _format_zeta_prompt,
+     }
+
+     def sample(
+         self,
+         tokenizer: TokenizerLike,
+         num_requests: int,
+         request_id_prefix: str = "",
+         no_oversample: bool = False,
+         **kwargs,
+     ):
+         formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.hf_name)
+         if formatting_prompt_func is None:
+             raise ValueError(f"Unsupported dataset path: {self.hf_name}")
+         samples = []
+         for i, sample in enumerate(self.data):
+             sample = formatting_prompt_func(sample)
+             samples.append(
+                 SampleRequest(
+                     prompt=sample["prompt"],
+                     prompt_len=len(tokenizer(sample["prompt"]).input_ids),
+                     expected_output_len=len(
+                         tokenizer(sample["expected_output"]).input_ids
+                     ),
+                     request_id=request_id_prefix + str(i),
+                 )
+             )
+             if len(samples) >= num_requests:
+                 break
+         self.maybe_oversample_requests(
+             samples, num_requests, request_id_prefix, no_oversample
+         )
+         return samples
+
+
+ # -----------------------------------------------------------------------------
+ # ASR Dataset Implementation
+ # -----------------------------------------------------------------------------
+
+
+ class ASRDataset(HuggingFaceDataset):
+     """
+     Dataset class for processing an ASR dataset for transcription.
+     Tested on the following set:
+
+     +----------------+----------------------------------------+--------------------------+-----------------------------+
+     | Dataset        | Domain                                 | Speaking Style           | hf-subset                   |
+     +----------------+----------------------------------------+--------------------------+-----------------------------+
+     | TED-LIUM       | TED talks                              | Oratory                  | release1, release2, release3|
+     |                |                                        |                          | release3-speaker-adaptation |
+     | VoxPopuli      | European Parliament                    | Oratory                  | en, de, it, fr, ...         |
+     | LibriSpeech    | Audiobook                              | Narrated                 | "LIUM/tedlium"              |
+     | GigaSpeech     | Audiobook, podcast, YouTube            | Narrated, spontaneous    | xs, s, m, l, xl, dev, test  |
+     | SPGISpeech     | Financial meetings                     | Oratory, spontaneous     | S, M, L, dev, test          |
+     | AMI            | Meetings                               | Spontaneous              | ihm, sdm                    |
+     +----------------+----------------------------------------+--------------------------+-----------------------------+
+
+     """  # noqa: E501
+
+     SUPPORTED_DATASET_PATHS = {
+         "openslr/librispeech_asr",
+         "facebook/voxpopuli",
+         "LIUM/tedlium",
+         "edinburghcstr/ami",
+         "speechcolab/gigaspeech",
+         "kensho/spgispeech",
+     }
+
+     DEFAULT_OUTPUT_LEN = 128
+     IS_MULTIMODAL = True
+
+     # TODO Whisper-specific. Abstract interface when more models are supported.
+     TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
+     skip_long_audios: bool = True
+
+     def sample(
+         self,
+         tokenizer: TokenizerLike,
+         num_requests: int,
+         output_len: int | None = None,
+         request_id_prefix: str = "",
+         no_oversample: bool = False,
+         **kwargs,
+     ) -> list:
+         output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
+         prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
+         prompt_len = len(tokenizer(prompt).input_ids)
+         sampled_requests = []
+         ind = 0
+         skipped = 0
+         for item in self.data:
+             if len(sampled_requests) >= num_requests:
+                 break
+             audio = item["audio"]
+             y, sr = audio["array"], audio["sampling_rate"]
+             duration_s = librosa.get_duration(y=y, sr=sr)
+             # Whisper's maximum supported audio duration is 30 s.
+             if self.skip_long_audios and duration_s > 30:
+                 skipped += 1
+                 continue
+
+             mm_content = {"audio": (y, sr)}
+             sampled_requests.append(
+                 SampleRequest(
+                     prompt=prompt,
+                     prompt_len=prompt_len,
+                     expected_output_len=output_len,
+                     multi_modal_data=mm_content,
+                     request_id=request_id_prefix + str(ind),
+                 )
+             )
+             ind += 1
+         if skipped:
+             logger.warning(
+                 "%d samples discarded from dataset due to"
+                 " their length being greater than"
+                 " what Whisper supports.",
+                 skipped,
+             )
+         self.maybe_oversample_requests(
+             sampled_requests, num_requests, request_id_prefix, no_oversample
+         )
+         return sampled_requests
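+
+
+ # Editor's note: a minimal illustrative sketch, not part of vLLM. It mirrors
+ # the duration gate above: clips longer than Whisper's 30 s limit are skipped.
+ def _demo_asr_keep(y, sr) -> bool:  # hypothetical helper, never called
+     return librosa.get_duration(y=y, sr=sr) <= 30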
+
+
+ # -----------------------------------------------------------------------------
+ # MLPerf Dataset Implementation
+ # -----------------------------------------------------------------------------
+
+
+ class MLPerfDataset(HuggingFaceDataset):
+     """
+     MLPerf Inference Dataset.
+
+     Dataset on HF:
+     https://huggingface.co/datasets/mgoin/mlperf-inference-llama2-data
+     https://huggingface.co/datasets/mgoin/mlperf-inference-llama3.1-data
+
+     Each record contains:
+       - "system_prompt": system role instruction.
+       - "question": user question.
+       - "output": reference answer.
+
+     We combine the system prompt and question into a chat-formatted prompt
+     (using the tokenizer's chat template) and set the expected output length to
+     the tokenized length of the provided reference answer.
+     """
+
+     SUPPORTED_DATASET_PATHS = {
+         "mgoin/mlperf-inference-llama2-data",
+         "mgoin/mlperf-inference-llama3.1-data",
+     }
+
+     def sample(
+         self,
+         tokenizer: TokenizerLike,
+         num_requests: int,
+         output_len: int | None = None,
+         request_id_prefix: str = "",
+         no_oversample: bool = False,
+         **kwargs,
+     ) -> list[SampleRequest]:
+         # Use a dynamic output length based on the reference answer unless an
+         # explicit output_len is given.
+         dynamic_output = output_len is None
+         sampled_requests: list[SampleRequest] = []
+         ind = 0
+
+         for item in self.data:
+             if len(sampled_requests) >= num_requests:
+                 break
+
+             system_prompt = item["system_prompt"]
+             question = item["question"]
+             reference_answer = item["output"]
+
+             # Build a chat-style prompt using the tokenizer's chat template.
+             messages = [
+                 {"role": "system", "content": system_prompt},
+                 {"role": "user", "content": question},
+             ]
+             prompt_formatted = tokenizer.apply_chat_template(
+                 messages, add_generation_prompt=True, tokenize=False
+             )
+             prompt_len = len(tokenizer(prompt_formatted).input_ids)
+
+             # Determine output length from reference answer tokens.
+             ref_out_len = len(
+                 tokenizer(reference_answer, add_special_tokens=False).input_ids
+             )
+             expected_output_len = ref_out_len if dynamic_output else output_len
+
+             # Validate sequence lengths.
+             if not is_valid_sequence(prompt_len, expected_output_len):
+                 continue
+
+             sampled_requests.append(
+                 SampleRequest(
+                     prompt=prompt_formatted,
+                     prompt_len=prompt_len,
+                     expected_output_len=expected_output_len,
+                     request_id=request_id_prefix + str(ind),
+                 )
+             )
+             ind += 1
+
+         self.maybe_oversample_requests(
+             sampled_requests, num_requests, request_id_prefix, no_oversample
+         )
+         return sampled_requests
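+
+
+ # Editor's note: a minimal illustrative sketch, not part of vLLM. It shows the
+ # two-message chat that MLPerfDataset.sample builds before applying the
+ # tokenizer's chat template.
+ def _demo_mlperf_messages(item: dict) -> list[dict]:  # hypothetical, never called
+     return [
+         {"role": "system", "content": item["system_prompt"]},
+         {"role": "user", "content": item["question"]},
+     ]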
+
+
+ # -----------------------------------------------------------------------------
+ # Prefix Repetition Dataset Implementation
+ # -----------------------------------------------------------------------------
+
+
+ class PrefixRepetitionRandomDataset(BenchmarkDataset):
+     # Default values copied from benchmark_serving.py for the repeated prefix
+     # dataset.
+     DEFAULT_PREFIX_LEN = 256
+     DEFAULT_SUFFIX_LEN = 256
+     DEFAULT_NUM_PREFIXES = 10
+     DEFAULT_OUTPUT_LEN = 128
+
+     def __init__(
+         self,
+         **kwargs,
+     ) -> None:
+         super().__init__(**kwargs)
+         random.seed(self.random_seed)
+         np.random.seed(self.random_seed)
+
+     def sample(
+         self,
+         tokenizer: TokenizerLike,
+         num_requests: int,
+         prefix_len: int = DEFAULT_PREFIX_LEN,
+         suffix_len: int = DEFAULT_SUFFIX_LEN,
+         num_prefixes: int = DEFAULT_NUM_PREFIXES,
+         output_len: int = DEFAULT_OUTPUT_LEN,
+         request_id_prefix: str = "",
+         no_oversample: bool = False,
+         **kwargs,
+     ) -> list[SampleRequest]:
+         vocab_size = tokenizer.vocab_size
+         prompts_per_prefix = num_requests // num_prefixes
+         if prompts_per_prefix == 0:
+             raise ValueError(
+                 f"num_requests ({num_requests}) must be greater than or equal "
+                 f"to num_prefixes ({num_prefixes})"
+             )
+
+         def _generate_exact_length_tokens(target_length: int) -> tuple[list[int], int]:
+             """Generate tokens that decode and re-encode to exactly
+             target_length, returning the tokens and the mismatch count."""
+             # Generate random tokens
+             tokens = np.random.randint(0, vocab_size, size=target_length).tolist()
+
+             _, adjusted_tokens, token_mismatch = gen_prompt_decode_to_target_len(  # noqa: E501
+                 tokenizer=tokenizer,
+                 token_sequence=tokens,
+                 target_token_len=target_length,
+                 add_special_tokens=False,
+             )
+             return adjusted_tokens, token_mismatch
+
+         requests = []
+         token_mismatch_total = 0
+         for _ in range(num_prefixes):
+             prefix_tokens, prefix_mismatch = _generate_exact_length_tokens(prefix_len)
+             token_mismatch_total += prefix_mismatch
+
+             for _ in range(prompts_per_prefix):
+                 suffix_tokens, suffix_mismatch = _generate_exact_length_tokens(
+                     suffix_len
+                 )
+                 token_mismatch_total += suffix_mismatch
+                 combined_tokens = prefix_tokens + suffix_tokens
+                 prompt = tokenizer.decode(combined_tokens)
+                 prompt_len = len(combined_tokens)
+                 requests.append(
+                     SampleRequest(
+                         prompt=prompt,
+                         prompt_len=prompt_len,
+                         expected_output_len=output_len,
+                     )
+                 )
+
+         if token_mismatch_total != 0:
+             sign = "more" if token_mismatch_total > 0 else "fewer"
+             logger.warning(
+                 "Across all generated prompts, there were %d %s tokens "
+                 "than expected after decoding and re-encoding. This is "
+                 "expected due to the imperfect nature of the sampling "
+                 "procedure.",
+                 abs(token_mismatch_total),
+                 sign,
+             )
+         if not getattr(self, "disable_shuffle", False):
+             random.shuffle(requests)
+         return requests
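+
+
+ # Editor's note: a minimal illustrative sketch, not part of vLLM. The sampler
+ # above emits num_prefixes groups of num_requests // num_prefixes prompts, and
+ # every prompt in a group shares one random prefix, which makes this dataset
+ # handy for exercising prefix caching.
+ def _demo_prefix_repetition_shape(
+     num_requests: int = 20, num_prefixes: int = 10
+ ) -> tuple[int, int]:  # hypothetical helper, never called
+     # (number of prefix groups, prompts sharing each prefix)
+     return num_prefixes, num_requests // num_prefixes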
+
+
+ # -----------------------------------------------------------------------------
+ # MMStar Dataset Implementation
+ # -----------------------------------------------------------------------------
+
+
+ class MMStarDataset(HuggingFaceDataset):
+     """
+     Lin-Chen/MMStar: https://huggingface.co/datasets/Lin-Chen/MMStar
+     refer to: https://github.com/sgl-project/SpecForge/pull/106
+     """
+
+     DEFAULT_OUTPUT_LEN = 128
+     SUPPORTED_DATASET_PATHS = {"Lin-Chen/MMStar"}
+     IS_MULTIMODAL = True
+
+     def sample(
+         self,
+         tokenizer: TokenizerLike,
+         num_requests: int,
+         output_len: int | None = None,
+         enable_multimodal_chat: bool = False,
+         request_id_prefix: str = "",
+         no_oversample: bool = False,
+         **kwargs,
+     ) -> list[SampleRequest]:
+         # If --hf-output-len is not set, use the default output length.
+         output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
+         sampled_requests: list[SampleRequest] = []
+
+         for ind, item in enumerate(self.data):
+             if len(sampled_requests) >= num_requests:
+                 break
+             # Split the question text from the options
+             # (keep only the part before "Options:").
+             full_q: str = item.get("question", "")
+             question_text = full_q.split("Options:", 1)[0].strip()
+
+             # Multimodal image content.
+             mm_content = process_image(item["image"])
+
+             # Compute the prompt token length (note: this is the plain-text
+             # length if enable_multimodal_chat is False).
+             prompt_len = len(tokenizer(question_text).input_ids)
+
+             if enable_multimodal_chat:
+                 # If multimodal content should be embedded in the chat message,
+                 # convert to [{"role": "user", "content": [...]}].
+                 prompt = self.apply_multimodal_chat_transformation(
+                     question_text, mm_content
+                 )
+                 mm_for_request = None  # Already embedded in chat content.
+             else:
+                 # Default: prompt is plain text,
+                 # image is in mm_content for the bench to assemble.
+                 prompt = question_text
+                 mm_for_request = mm_content
+
+             sampled_requests.append(
+                 SampleRequest(
+                     prompt=prompt,
+                     prompt_len=prompt_len,
+                     expected_output_len=output_len,
+                     multi_modal_data=mm_for_request,
+                     request_id=request_id_prefix + str(ind),
+                 )
+             )
+
+         self.maybe_oversample_requests(
+             sampled_requests, num_requests, request_id_prefix, no_oversample
+         )
+         return sampled_requests
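+
+
+ # Editor's note: a minimal illustrative sketch, not part of vLLM. It mirrors
+ # the question split above: everything from "Options:" onwards is dropped.
+ def _demo_mmstar_question(full_q: str) -> str:  # hypothetical helper, never called
+     return full_q.split("Options:", 1)[0].strip()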