sglang 0.5.1.post2__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (256) hide show
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/bench_one_batch_server.py +89 -54
  3. sglang/bench_serving.py +437 -40
  4. sglang/lang/interpreter.py +1 -1
  5. sglang/profiler.py +0 -1
  6. sglang/srt/configs/__init__.py +4 -0
  7. sglang/srt/configs/internvl.py +6 -0
  8. sglang/srt/configs/longcat_flash.py +104 -0
  9. sglang/srt/configs/model_config.py +37 -7
  10. sglang/srt/configs/qwen3_next.py +326 -0
  11. sglang/srt/connector/__init__.py +1 -1
  12. sglang/srt/connector/base_connector.py +1 -2
  13. sglang/srt/connector/redis.py +2 -2
  14. sglang/srt/connector/serde/__init__.py +1 -1
  15. sglang/srt/connector/serde/safe_serde.py +4 -3
  16. sglang/srt/custom_op.py +11 -1
  17. sglang/srt/debug_utils/dump_comparator.py +81 -44
  18. sglang/srt/debug_utils/dump_loader.py +97 -0
  19. sglang/srt/debug_utils/dumper.py +11 -3
  20. sglang/srt/debug_utils/text_comparator.py +73 -11
  21. sglang/srt/disaggregation/ascend/conn.py +75 -0
  22. sglang/srt/disaggregation/base/conn.py +1 -1
  23. sglang/srt/disaggregation/common/conn.py +15 -12
  24. sglang/srt/disaggregation/decode.py +6 -4
  25. sglang/srt/disaggregation/fake/conn.py +1 -1
  26. sglang/srt/disaggregation/mini_lb.py +6 -420
  27. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  28. sglang/srt/disaggregation/nixl/conn.py +180 -16
  29. sglang/srt/disaggregation/prefill.py +6 -4
  30. sglang/srt/disaggregation/utils.py +5 -50
  31. sglang/srt/distributed/parallel_state.py +94 -58
  32. sglang/srt/entrypoints/engine.py +34 -14
  33. sglang/srt/entrypoints/http_server.py +172 -47
  34. sglang/srt/entrypoints/openai/protocol.py +90 -27
  35. sglang/srt/entrypoints/openai/serving_base.py +6 -2
  36. sglang/srt/entrypoints/openai/serving_chat.py +82 -26
  37. sglang/srt/entrypoints/openai/serving_completions.py +25 -4
  38. sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
  39. sglang/srt/entrypoints/openai/serving_responses.py +7 -4
  40. sglang/srt/eplb/eplb_manager.py +28 -4
  41. sglang/srt/eplb/expert_distribution.py +55 -15
  42. sglang/srt/eplb/expert_location.py +8 -3
  43. sglang/srt/eplb/expert_location_updater.py +1 -1
  44. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  45. sglang/srt/function_call/ebnf_composer.py +11 -9
  46. sglang/srt/function_call/function_call_parser.py +2 -0
  47. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  48. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  49. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  50. sglang/srt/hf_transformers_utils.py +28 -7
  51. sglang/srt/layers/activation.py +44 -9
  52. sglang/srt/layers/attention/aiter_backend.py +93 -68
  53. sglang/srt/layers/attention/ascend_backend.py +381 -136
  54. sglang/srt/layers/attention/fla/chunk.py +242 -0
  55. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  56. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  57. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  58. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  59. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  60. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  61. sglang/srt/layers/attention/fla/index.py +37 -0
  62. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  63. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  64. sglang/srt/layers/attention/fla/op.py +66 -0
  65. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  66. sglang/srt/layers/attention/fla/utils.py +331 -0
  67. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  68. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  69. sglang/srt/layers/attention/flashinfer_backend.py +11 -6
  70. sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -14
  71. sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
  72. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
  73. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  74. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  75. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  76. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  77. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  78. sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
  79. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  80. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  81. sglang/srt/layers/communicator.py +45 -8
  82. sglang/srt/layers/layernorm.py +54 -12
  83. sglang/srt/layers/logits_processor.py +10 -3
  84. sglang/srt/layers/moe/__init__.py +2 -1
  85. sglang/srt/layers/moe/cutlass_moe.py +0 -8
  86. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
  87. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  88. sglang/srt/layers/moe/ep_moe/layer.py +111 -56
  89. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  90. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  91. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  92. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  93. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
  94. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  97. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  98. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  99. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
  100. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  101. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
  102. sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
  103. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  104. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  105. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  106. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  107. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  108. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  109. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  110. sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
  111. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  112. sglang/srt/layers/moe/topk.py +43 -12
  113. sglang/srt/layers/moe/utils.py +6 -5
  114. sglang/srt/layers/quantization/awq.py +19 -7
  115. sglang/srt/layers/quantization/base_config.py +11 -6
  116. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  117. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  118. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  119. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +141 -235
  120. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
  121. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +31 -22
  122. sglang/srt/layers/quantization/fp8.py +78 -48
  123. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  124. sglang/srt/layers/quantization/fp8_utils.py +45 -31
  125. sglang/srt/layers/quantization/gptq.py +25 -17
  126. sglang/srt/layers/quantization/modelopt_quant.py +107 -40
  127. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  128. sglang/srt/layers/quantization/mxfp4.py +93 -68
  129. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  130. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  131. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  132. sglang/srt/layers/quantization/quark/utils.py +97 -0
  133. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  134. sglang/srt/layers/quantization/unquant.py +135 -47
  135. sglang/srt/layers/quantization/utils.py +13 -0
  136. sglang/srt/layers/quantization/w4afp8.py +60 -42
  137. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  138. sglang/srt/layers/quantization/w8a8_int8.py +83 -41
  139. sglang/srt/layers/rocm_linear_utils.py +44 -0
  140. sglang/srt/layers/rotary_embedding.py +28 -19
  141. sglang/srt/layers/sampler.py +29 -5
  142. sglang/srt/layers/utils.py +0 -14
  143. sglang/srt/lora/backend/base_backend.py +50 -8
  144. sglang/srt/lora/backend/triton_backend.py +90 -2
  145. sglang/srt/lora/layers.py +32 -0
  146. sglang/srt/lora/lora.py +4 -1
  147. sglang/srt/lora/lora_manager.py +35 -112
  148. sglang/srt/lora/mem_pool.py +24 -10
  149. sglang/srt/lora/utils.py +18 -9
  150. sglang/srt/managers/cache_controller.py +396 -365
  151. sglang/srt/managers/data_parallel_controller.py +30 -15
  152. sglang/srt/managers/detokenizer_manager.py +18 -2
  153. sglang/srt/managers/disagg_service.py +46 -0
  154. sglang/srt/managers/io_struct.py +190 -11
  155. sglang/srt/managers/mm_utils.py +6 -1
  156. sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
  157. sglang/srt/managers/schedule_batch.py +27 -44
  158. sglang/srt/managers/schedule_policy.py +4 -3
  159. sglang/srt/managers/scheduler.py +148 -122
  160. sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
  161. sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
  162. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  163. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  164. sglang/srt/managers/template_manager.py +3 -3
  165. sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
  166. sglang/srt/managers/tokenizer_manager.py +77 -480
  167. sglang/srt/managers/tp_worker.py +16 -4
  168. sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
  169. sglang/srt/mem_cache/allocator.py +1 -1
  170. sglang/srt/mem_cache/chunk_cache.py +1 -1
  171. sglang/srt/mem_cache/hicache_storage.py +53 -40
  172. sglang/srt/mem_cache/hiradix_cache.py +196 -104
  173. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  174. sglang/srt/mem_cache/memory_pool.py +395 -53
  175. sglang/srt/mem_cache/memory_pool_host.py +27 -19
  176. sglang/srt/mem_cache/radix_cache.py +6 -6
  177. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  178. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  179. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  180. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  181. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +152 -23
  182. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  183. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  184. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +154 -95
  185. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  186. sglang/srt/mem_cache/swa_radix_cache.py +1 -3
  187. sglang/srt/metrics/collector.py +484 -63
  188. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  189. sglang/srt/metrics/utils.py +48 -0
  190. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  191. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  192. sglang/srt/model_executor/forward_batch_info.py +72 -18
  193. sglang/srt/model_executor/model_runner.py +190 -32
  194. sglang/srt/model_loader/__init__.py +9 -3
  195. sglang/srt/model_loader/loader.py +33 -28
  196. sglang/srt/model_loader/utils.py +12 -0
  197. sglang/srt/model_loader/weight_utils.py +2 -1
  198. sglang/srt/models/deepseek_v2.py +323 -53
  199. sglang/srt/models/gemma3n_mm.py +1 -1
  200. sglang/srt/models/glm4_moe.py +10 -1
  201. sglang/srt/models/glm4v.py +4 -2
  202. sglang/srt/models/gpt_oss.py +7 -19
  203. sglang/srt/models/internvl.py +28 -0
  204. sglang/srt/models/llama4.py +9 -0
  205. sglang/srt/models/llama_eagle3.py +17 -0
  206. sglang/srt/models/longcat_flash.py +1026 -0
  207. sglang/srt/models/longcat_flash_nextn.py +699 -0
  208. sglang/srt/models/minicpmv.py +165 -3
  209. sglang/srt/models/mllama4.py +25 -0
  210. sglang/srt/models/opt.py +637 -0
  211. sglang/srt/models/qwen2.py +33 -3
  212. sglang/srt/models/qwen2_5_vl.py +91 -42
  213. sglang/srt/models/qwen2_moe.py +79 -14
  214. sglang/srt/models/qwen3.py +8 -2
  215. sglang/srt/models/qwen3_moe.py +39 -8
  216. sglang/srt/models/qwen3_next.py +1039 -0
  217. sglang/srt/models/qwen3_next_mtp.py +109 -0
  218. sglang/srt/models/torch_native_llama.py +1 -1
  219. sglang/srt/models/transformers.py +1 -1
  220. sglang/srt/multimodal/processors/base_processor.py +4 -2
  221. sglang/srt/multimodal/processors/glm4v.py +9 -9
  222. sglang/srt/multimodal/processors/internvl.py +141 -129
  223. sglang/srt/{conversation.py → parser/conversation.py} +38 -5
  224. sglang/srt/parser/harmony_parser.py +588 -0
  225. sglang/srt/parser/reasoning_parser.py +309 -0
  226. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  227. sglang/srt/sampling/sampling_batch_info.py +18 -15
  228. sglang/srt/server_args.py +307 -80
  229. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  230. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  231. sglang/srt/speculative/eagle_worker.py +216 -120
  232. sglang/srt/speculative/spec_info.py +5 -0
  233. sglang/srt/speculative/standalone_worker.py +109 -0
  234. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  235. sglang/srt/utils.py +96 -7
  236. sglang/srt/weight_sync/utils.py +1 -1
  237. sglang/test/attention/test_trtllm_mla_backend.py +181 -8
  238. sglang/test/few_shot_gsm8k.py +1 -0
  239. sglang/test/runners.py +4 -0
  240. sglang/test/test_cutlass_moe.py +24 -6
  241. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  242. sglang/test/test_disaggregation_utils.py +66 -0
  243. sglang/test/test_utils.py +25 -1
  244. sglang/utils.py +5 -0
  245. sglang/version.py +1 -1
  246. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/METADATA +13 -10
  247. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/RECORD +253 -201
  248. sglang/srt/disaggregation/launch_lb.py +0 -131
  249. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  250. sglang/srt/reasoning_parser.py +0 -553
  251. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  252. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  253. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  254. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
  255. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
  256. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py CHANGED
@@ -61,6 +61,7 @@ from sglang.srt.configs.model_config import ModelConfig
61
61
  from sglang.srt.distributed.parallel_state import destroy_distributed_environment
62
62
  from sglang.srt.entrypoints.engine import _set_envs_and_config
63
63
  from sglang.srt.hf_transformers_utils import get_tokenizer
64
+ from sglang.srt.layers.moe import initialize_moe_config
64
65
  from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
65
66
  from sglang.srt.managers.scheduler import Scheduler
66
67
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
@@ -509,6 +510,8 @@ def latency_test(
509
510
  bench_args,
510
511
  tp_rank,
511
512
  ):
513
+ initialize_moe_config(server_args)
514
+
512
515
  # Set CPU affinity
513
516
  if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
514
517
  set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, tp_rank)
sglang/bench_one_batch_server.py CHANGED
@@ -18,7 +18,7 @@ import json
18
18
  import multiprocessing
19
19
  import os
20
20
  import time
21
- from typing import Tuple
21
+ from typing import List, Tuple
22
22
 
23
23
  import requests
24
24
 
@@ -45,7 +45,9 @@ class BenchArgs:
45
45
  skip_warmup: bool = False
46
46
  show_report: bool = False
47
47
  profile: bool = False
48
+ profile_steps: int = 3
48
49
  profile_by_stage: bool = False
50
+ dataset_path: str = ""
49
51
 
50
52
  @staticmethod
51
53
  def add_cli_args(parser: argparse.ArgumentParser):
@@ -78,7 +80,16 @@ class BenchArgs:
78
80
  parser.add_argument("--skip-warmup", action="store_true")
79
81
  parser.add_argument("--show-report", action="store_true")
80
82
  parser.add_argument("--profile", action="store_true")
83
+ parser.add_argument(
84
+ "--profile-steps", type=int, default=BenchArgs.profile_steps
85
+ )
81
86
  parser.add_argument("--profile-by-stage", action="store_true")
87
+ parser.add_argument(
88
+ "--dataset-path",
89
+ type=str,
90
+ default=BenchArgs.dataset_path,
91
+ help="Path to the dataset.",
92
+ )
82
93
 
83
94
  @classmethod
84
95
  def from_cli_args(cls, args: argparse.Namespace):
@@ -132,7 +143,9 @@ def run_one_case(
132
143
  result_filename: str,
133
144
  tokenizer,
134
145
  profile: bool = False,
146
+ profile_steps: int = 3,
135
147
  profile_by_stage: bool = False,
148
+ dataset_path: str = "",
136
149
  ):
137
150
  requests.post(url + "/flush_cache")
138
151
  input_requests = sample_random_requests(
@@ -141,7 +154,7 @@ def run_one_case(
141
154
  num_prompts=batch_size,
142
155
  range_ratio=1.0,
143
156
  tokenizer=tokenizer,
144
- dataset_path="",
157
+ dataset_path=dataset_path,
145
158
  random_sample=True,
146
159
  return_text=False,
147
160
  )
@@ -162,7 +175,7 @@ def run_one_case(
162
175
  profile_link = None
163
176
  if profile:
164
177
  profile_link: str = run_profile(
165
- url, 3, ["CPU", "GPU"], None, None, profile_by_stage
178
+ url, profile_steps, ["CPU", "GPU"], None, None, profile_by_stage
166
179
  )
167
180
 
168
181
  tic = time.perf_counter()
@@ -247,6 +260,71 @@ def run_one_case(
247
260
  )
248
261
 
249
262
 
263
+ def get_report_summary(
264
+ result: List[Tuple], server_args: ServerArgs, bench_args: BenchArgs
265
+ ):
266
+ import tabulate
267
+
268
+ summary = (
269
+ f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
270
+ )
271
+
272
+ headers = [
273
+ "batch size",
274
+ "latency (s)",
275
+ "input throughput (tok/s)",
276
+ "output throughput (tok/s)",
277
+ "acc length",
278
+ "ITL (ms)",
279
+ "input cost ($/1M)",
280
+ "output cost ($/1M)",
281
+ ]
282
+ if bench_args.profile:
283
+ headers.append("profile")
284
+ rows = []
285
+
286
+ for (
287
+ batch_size,
288
+ latency,
289
+ ttft,
290
+ input_throughput,
291
+ output_throughput,
292
+ _,
293
+ _,
294
+ acc_length,
295
+ trace_link,
296
+ ) in result:
297
+ if is_blackwell():
298
+ hourly_cost_per_gpu = 4 # $4/hour for one B200
299
+ else:
300
+ hourly_cost_per_gpu = 2 # $2/hour for one H100
301
+
302
+ hourly_cost = hourly_cost_per_gpu * server_args.tp_size
303
+ input_util = 0.7
304
+ accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
305
+ itl = 1 / (output_throughput / batch_size) * 1000
306
+ input_cost = 1e6 / (input_throughput * input_util) / 3600 * hourly_cost
307
+ output_cost = 1e6 / output_throughput / 3600 * hourly_cost
308
+ row = [
309
+ batch_size,
310
+ latency,
311
+ input_throughput,
312
+ output_throughput,
313
+ accept_length,
314
+ itl,
315
+ input_cost,
316
+ output_cost,
317
+ ]
318
+ if trace_link:
319
+ row.append(f"[Profile]({trace_link})")
320
+ rows.append(row)
321
+
322
+ summary += tabulate.tabulate(
323
+ rows, headers=headers, tablefmt="github", floatfmt=".2f"
324
+ )
325
+ return summary
326
+
327
+
250
328
  def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
251
329
  if bench_args.base_url:
252
330
  proc, base_url = None, bench_args.base_url
@@ -275,6 +353,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
275
353
  run_name="",
276
354
  result_filename="",
277
355
  tokenizer=tokenizer,
356
+ dataset_path=bench_args.dataset_path,
278
357
  )
279
358
  print("=" * 8 + " Warmup End " + "=" * 8 + "\n")
280
359
 
@@ -321,6 +400,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
321
400
  result_filename=bench_args.result_filename,
322
401
  tokenizer=tokenizer,
323
402
  profile=bench_args.profile,
403
+ profile_steps=bench_args.profile_steps,
324
404
  profile_by_stage=bench_args.profile_by_stage,
325
405
  )[-1],
326
406
  )
@@ -337,63 +417,14 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
337
417
  if not bench_args.show_report:
338
418
  return
339
419
 
340
- summary = (
341
- f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
342
- )
343
- summary += "| batch size | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) |"
344
-
345
- if bench_args.profile:
346
- summary += " profile |"
347
-
348
- summary += "\n"
349
- summary += "| ---------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ |"
350
-
351
- if bench_args.profile:
352
- summary += "-------------|"
353
- summary += "\n"
354
-
355
- for (
356
- batch_size,
357
- latency,
358
- ttft,
359
- input_throughput,
360
- output_throughput,
361
- overall_throughput,
362
- last_gen_throughput,
363
- acc_length,
364
- trace_link,
365
- ) in result:
366
- if is_blackwell():
367
- hourly_cost_per_gpu = 4 # $4/hour for one B200
368
- else:
369
- hourly_cost_per_gpu = 2 # $2/hour for one H100
370
-
371
- hourly_cost = hourly_cost_per_gpu * server_args.tp_size
372
- input_util = 0.7
373
- accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
374
- line = (
375
- f"| {batch_size} | "
376
- f"{latency:.2f} | "
377
- f"{input_throughput:.2f} | "
378
- f"{output_throughput:.2f} | "
379
- f"{accept_length} | "
380
- f"{1 / (output_throughput/batch_size) * 1000:.2f} | "
381
- f"{1e6 / (input_throughput * input_util) / 3600 * hourly_cost:.2f} | "
382
- f"{1e6 / output_throughput / 3600 * hourly_cost:.2f} |"
383
- )
384
- if trace_link:
385
- line += f" [Profile]({trace_link}) |"
386
- line += "\n"
387
- summary += line
388
-
389
- # print metrics table
420
+ summary = get_report_summary(result, server_args, bench_args)
390
421
  print(summary)
391
422
 
392
423
  if is_in_ci():
393
424
  write_github_step_summary(summary)
394
425
 
395
426
 
396
- if __name__ == "__main__":
427
+ def main():
397
428
  parser = argparse.ArgumentParser()
398
429
  ServerArgs.add_cli_args(parser)
399
430
  BenchArgs.add_cli_args(parser)
@@ -402,3 +433,7 @@ if __name__ == "__main__":
402
433
  bench_args = BenchArgs.from_cli_args(args)
403
434
 
404
435
  run_benchmark(server_args, bench_args)
436
+
437
+
438
+ if __name__ == "__main__":
439
+ main()