sglang 0.5.2rc2__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +130 -59
- sglang/srt/entrypoints/openai/protocol.py +112 -4
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +204 -55
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -6
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +190 -55
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +144 -17
- sglang/srt/managers/scheduler.py +502 -209
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +320 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +82 -40
- sglang/srt/model_executor/model_runner.py +432 -157
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +966 -267
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +99 -28
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +433 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/RECORD +375 -245
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py
CHANGED
@@ -60,7 +60,6 @@ import torch.distributed as dist
|
|
60
60
|
from sglang.srt.configs.model_config import ModelConfig
|
61
61
|
from sglang.srt.distributed.parallel_state import destroy_distributed_environment
|
62
62
|
from sglang.srt.entrypoints.engine import _set_envs_and_config
|
63
|
-
from sglang.srt.hf_transformers_utils import get_tokenizer
|
64
63
|
from sglang.srt.layers.moe import initialize_moe_config
|
65
64
|
from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
|
66
65
|
from sglang.srt.managers.scheduler import Scheduler
|
@@ -78,6 +77,7 @@ from sglang.srt.utils import (
|
|
78
77
|
set_gpu_proc_affinity,
|
79
78
|
suppress_other_loggers,
|
80
79
|
)
|
80
|
+
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
81
81
|
|
82
82
|
|
83
83
|
@dataclasses.dataclass
|
@@ -443,11 +443,9 @@ def latency_test_run_once(
|
|
443
443
|
|
444
444
|
if profile:
|
445
445
|
profiler.stop()
|
446
|
-
|
447
|
-
_save_profile_trace_results(profiler, profile_filename)
|
448
|
-
rank_print(
|
449
|
-
f"torch profiler chrome trace for prefill saved to {profile_filename}"
|
450
|
-
)
|
446
|
+
trace_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_prefill.trace.json.gz"
|
447
|
+
_save_profile_trace_results(profiler, trace_filename)
|
448
|
+
rank_print(f"torch profiler chrome trace for prefill saved to {trace_filename}")
|
451
449
|
|
452
450
|
# Decode
|
453
451
|
decode_latencies = []
|
@@ -479,10 +477,10 @@ def latency_test_run_once(
|
|
479
477
|
|
480
478
|
if profile and i == output_len / 2:
|
481
479
|
profiler.stop()
|
482
|
-
|
483
|
-
_save_profile_trace_results(profiler,
|
480
|
+
trace_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_decode.trace.json.gz"
|
481
|
+
_save_profile_trace_results(profiler, trace_filename)
|
484
482
|
rank_print(
|
485
|
-
f"torch profiler chrome trace for decoding 1 token saved to {profile_filename}"
|
483
|
+
f"torch profiler chrome trace for decoding 1 token saved to {trace_filename}"
|
486
484
|
)
|
487
485
|
|
488
486
|
# Record decode timing from 2nd output
|
sglang/bench_one_batch_server.py
CHANGED
@@ -9,6 +9,7 @@ python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --
|
|
9
9
|
|
10
10
|
python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
|
11
11
|
python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --show-report --profile --profile-by-stage
|
12
|
+
python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --output-path results.json --profile
|
12
13
|
"""
|
13
14
|
|
14
15
|
import argparse
|
@@ -17,12 +18,19 @@ import itertools
|
|
17
18
|
import json
|
18
19
|
import multiprocessing
|
19
20
|
import os
|
21
|
+
import random
|
20
22
|
import time
|
21
|
-
from typing import List, Tuple
|
23
|
+
from typing import List, Optional, Tuple
|
22
24
|
|
25
|
+
import numpy as np
|
23
26
|
import requests
|
27
|
+
from pydantic import BaseModel
|
24
28
|
|
25
|
-
from sglang.bench_serving import
|
29
|
+
from sglang.bench_serving import (
|
30
|
+
get_tokenizer,
|
31
|
+
sample_mmmu_requests,
|
32
|
+
sample_random_requests,
|
33
|
+
)
|
26
34
|
from sglang.profiler import run_profile
|
27
35
|
from sglang.srt.entrypoints.http_server import launch_server
|
28
36
|
from sglang.srt.server_args import ServerArgs
|
@@ -30,9 +38,112 @@ from sglang.srt.utils import is_blackwell, kill_process_tree
|
|
30
38
|
from sglang.test.test_utils import is_in_ci, write_github_step_summary
|
31
39
|
|
32
40
|
|
41
|
+
class ProfileLinks(BaseModel):
|
42
|
+
"""Pydantic model for profile trace links."""
|
43
|
+
|
44
|
+
extend: Optional[str] = None
|
45
|
+
decode: Optional[str] = None
|
46
|
+
|
47
|
+
|
48
|
+
class BenchmarkResult(BaseModel):
|
49
|
+
"""Pydantic model for benchmark results table data, for a single isl and osl"""
|
50
|
+
|
51
|
+
model_path: str
|
52
|
+
run_name: str
|
53
|
+
batch_size: int
|
54
|
+
input_len: int
|
55
|
+
output_len: int
|
56
|
+
latency: float
|
57
|
+
ttft: float
|
58
|
+
input_throughput: float
|
59
|
+
output_throughput: float
|
60
|
+
overall_throughput: float
|
61
|
+
last_gen_throughput: float
|
62
|
+
acc_length: Optional[float] = None
|
63
|
+
profile_links: Optional[ProfileLinks] = None
|
64
|
+
|
65
|
+
@staticmethod
|
66
|
+
def help_str() -> str:
|
67
|
+
return f"""
|
68
|
+
Note: To view the traces through perfetto-ui, please:
|
69
|
+
1. open with Google Chrome
|
70
|
+
2. allow popup
|
71
|
+
"""
|
72
|
+
|
73
|
+
def to_markdown_row(
|
74
|
+
self, trace_dir, base_url: str = "", relay_base: str = ""
|
75
|
+
) -> str:
|
76
|
+
"""Convert this benchmark result to a markdown table row."""
|
77
|
+
# Calculate costs (assuming H100 pricing for now)
|
78
|
+
hourly_cost_per_gpu = 2 # $2/hour for one H100
|
79
|
+
hourly_cost = hourly_cost_per_gpu * 1 # Assuming tp_size = 1 for simplicity
|
80
|
+
input_util = 0.7
|
81
|
+
accept_length = (
|
82
|
+
round(self.acc_length, 2) if self.acc_length is not None else "n/a"
|
83
|
+
)
|
84
|
+
itl = 1 / (self.output_throughput / self.batch_size) * 1000
|
85
|
+
input_cost = 1e6 / (self.input_throughput * input_util) / 3600 * hourly_cost
|
86
|
+
output_cost = 1e6 / self.output_throughput / 3600 * hourly_cost
|
87
|
+
|
88
|
+
def get_perfetto_relay_link_from_trace_file(trace_file: str):
|
89
|
+
import os
|
90
|
+
from urllib.parse import quote
|
91
|
+
|
92
|
+
rel_path = os.path.relpath(trace_file, trace_dir)
|
93
|
+
raw_file_link = f"{base_url}/{rel_path}"
|
94
|
+
relay_link = (
|
95
|
+
f"{relay_base}?src={quote(raw_file_link, safe='')}"
|
96
|
+
if relay_base and quote
|
97
|
+
else raw_file_link
|
98
|
+
)
|
99
|
+
return relay_link
|
100
|
+
|
101
|
+
# Handle profile links
|
102
|
+
profile_link = "NA | NA"
|
103
|
+
if self.profile_links:
|
104
|
+
if self.profile_links.extend or self.profile_links.decode:
|
105
|
+
# Create a combined link or use the first available one
|
106
|
+
trace_files = [self.profile_links.extend, self.profile_links.decode]
|
107
|
+
trace_files_relay_links = [
|
108
|
+
f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
|
109
|
+
for trace_file in trace_files
|
110
|
+
]
|
111
|
+
|
112
|
+
profile_link = " | ".join(trace_files_relay_links)
|
113
|
+
|
114
|
+
# Build the row
|
115
|
+
return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n"
|
116
|
+
|
117
|
+
@classmethod
|
118
|
+
def generate_markdown_report(
|
119
|
+
cls, trace_dir, results: List["BenchmarkResult"]
|
120
|
+
) -> str:
|
121
|
+
"""Generate a markdown report from a list of BenchmarkResult object from a single run."""
|
122
|
+
import os
|
123
|
+
|
124
|
+
summary = f"### {results[0].model_path}\n"
|
125
|
+
|
126
|
+
# summary += (
|
127
|
+
# f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
|
128
|
+
# )
|
129
|
+
summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
|
130
|
+
summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
|
131
|
+
|
132
|
+
# all results should share the same isl & osl
|
133
|
+
for result in results:
|
134
|
+
base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/")
|
135
|
+
relay_base = os.getenv("PERFETTO_RELAY_URL", "").rstrip("/")
|
136
|
+
relay_base = "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html"
|
137
|
+
# base_url = "https://github.com/sgl-project/ci-data/traces"
|
138
|
+
summary += result.to_markdown_row(trace_dir, base_url, relay_base)
|
139
|
+
|
140
|
+
return summary
|
141
|
+
|
142
|
+
|
33
143
|
@dataclasses.dataclass
|
34
144
|
class BenchArgs:
|
35
145
|
run_name: str = "default"
|
146
|
+
seed: int = 42
|
36
147
|
batch_size: Tuple[int] = (1,)
|
37
148
|
input_len: Tuple[int] = (1024,)
|
38
149
|
output_len: Tuple[int] = (16,)
|
@@ -47,10 +158,17 @@ class BenchArgs:
|
|
47
158
|
profile: bool = False
|
48
159
|
profile_steps: int = 3
|
49
160
|
profile_by_stage: bool = False
|
161
|
+
profile_filename_prefix: str = None
|
162
|
+
append_to_github_summary: bool = True
|
163
|
+
dataset_path: str = ""
|
164
|
+
parallel_batch: bool = False
|
165
|
+
dataset_name: str = "random"
|
166
|
+
output_path: Optional[str] = None
|
50
167
|
|
51
168
|
@staticmethod
|
52
169
|
def add_cli_args(parser: argparse.ArgumentParser):
|
53
170
|
parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
|
171
|
+
parser.add_argument("--seed", type=int, default=BenchArgs.seed)
|
54
172
|
parser.add_argument(
|
55
173
|
"--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
|
56
174
|
)
|
@@ -61,6 +179,13 @@ class BenchArgs:
|
|
61
179
|
"--output-len", type=int, nargs="+", default=BenchArgs.output_len
|
62
180
|
)
|
63
181
|
parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
|
182
|
+
parser.add_argument(
|
183
|
+
"--dataset-name",
|
184
|
+
type=str,
|
185
|
+
default=BenchArgs.dataset_name,
|
186
|
+
choices=["mmmu", "random"],
|
187
|
+
help="Name of the dataset to benchmark on.",
|
188
|
+
)
|
64
189
|
parser.add_argument("--return-logprob", action="store_true")
|
65
190
|
parser.add_argument(
|
66
191
|
"--client-stream-interval",
|
@@ -83,14 +208,43 @@ class BenchArgs:
|
|
83
208
|
"--profile-steps", type=int, default=BenchArgs.profile_steps
|
84
209
|
)
|
85
210
|
parser.add_argument("--profile-by-stage", action="store_true")
|
211
|
+
parser.add_argument(
|
212
|
+
"--dataset-path",
|
213
|
+
type=str,
|
214
|
+
default=BenchArgs.dataset_path,
|
215
|
+
help="Path to the dataset.",
|
216
|
+
)
|
217
|
+
parser.add_argument("--parallel-batch", action="store_true")
|
218
|
+
parser.add_argument(
|
219
|
+
"--profile-filename-prefix",
|
220
|
+
type=str,
|
221
|
+
default=BenchArgs.profile_filename_prefix,
|
222
|
+
)
|
223
|
+
parser.add_argument(
|
224
|
+
"--no-append-to-github-summary",
|
225
|
+
action="store_false",
|
226
|
+
dest="append_to_github_summary",
|
227
|
+
help="Disable appending the output of this run to github ci summary",
|
228
|
+
)
|
229
|
+
parser.add_argument(
|
230
|
+
"--output-path",
|
231
|
+
type=str,
|
232
|
+
default=BenchArgs.output_path,
|
233
|
+
help="Path to save benchmark results as JSON format. If not specified, results will only be saved to result-filename.",
|
234
|
+
)
|
86
235
|
|
87
236
|
@classmethod
def from_cli_args(cls, args: argparse.Namespace):
    """Build an instance of ``cls`` from a parsed argparse namespace.

    For every dataclass field, the attribute of the same name is read from
    ``args`` and converted by calling the type of the field's default value.
    Fields whose default is ``None`` carry no usable type, so their value is
    forwarded unchanged.
    """
    none_type = type(None)
    init_values = {}
    for field in dataclasses.fields(cls):
        # The default value's type doubles as the converter for the CLI value.
        caster = type(field.default)
        raw = getattr(args, field.name)
        init_values[field.name] = raw if caster is none_type else caster(raw)
    return cls(**init_values)
|
94
248
|
|
95
249
|
|
96
250
|
def launch_server_internal(server_args):
|
@@ -135,21 +289,35 @@ def run_one_case(
|
|
135
289
|
run_name: str,
|
136
290
|
result_filename: str,
|
137
291
|
tokenizer,
|
292
|
+
dataset_name="",
|
138
293
|
profile: bool = False,
|
139
294
|
profile_steps: int = 3,
|
140
295
|
profile_by_stage: bool = False,
|
296
|
+
profile_filename_prefix: str = None,
|
297
|
+
dataset_path: str = "",
|
298
|
+
parallel_batch: bool = False,
|
141
299
|
):
|
142
300
|
requests.post(url + "/flush_cache")
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
301
|
+
# TODO: reuse bench_serving.get_dataset ?
|
302
|
+
if dataset_name == "mmmu":
|
303
|
+
input_requests = sample_mmmu_requests(
|
304
|
+
num_requests=batch_size,
|
305
|
+
tokenizer=tokenizer,
|
306
|
+
fixed_output_len=output_len,
|
307
|
+
apply_chat_template=True,
|
308
|
+
random_sample=False,
|
309
|
+
)
|
310
|
+
elif dataset_name == "random":
|
311
|
+
input_requests = sample_random_requests(
|
312
|
+
input_len=input_len,
|
313
|
+
output_len=output_len,
|
314
|
+
num_prompts=batch_size,
|
315
|
+
range_ratio=1.0,
|
316
|
+
tokenizer=tokenizer,
|
317
|
+
dataset_path=dataset_path,
|
318
|
+
random_sample=True,
|
319
|
+
return_text=False,
|
320
|
+
)
|
153
321
|
|
154
322
|
use_structured_outputs = False
|
155
323
|
if use_structured_outputs:
|
@@ -166,25 +334,48 @@ def run_one_case(
|
|
166
334
|
|
167
335
|
profile_link = None
|
168
336
|
if profile:
|
337
|
+
output_dir, profile_name = None, None
|
338
|
+
if profile_filename_prefix:
|
339
|
+
output_dir = os.path.dirname(profile_filename_prefix)
|
340
|
+
profile_name = os.path.basename(profile_filename_prefix)
|
169
341
|
profile_link: str = run_profile(
|
170
|
-
url,
|
342
|
+
url,
|
343
|
+
profile_steps,
|
344
|
+
["CPU", "GPU"],
|
345
|
+
output_dir,
|
346
|
+
profile_name,
|
347
|
+
profile_by_stage,
|
171
348
|
)
|
172
349
|
|
173
350
|
tic = time.perf_counter()
|
351
|
+
|
352
|
+
payload = {
|
353
|
+
"sampling_params": {
|
354
|
+
"temperature": temperature,
|
355
|
+
"max_new_tokens": output_len,
|
356
|
+
"ignore_eos": True,
|
357
|
+
"json_schema": json_schema,
|
358
|
+
"stream_interval": stream_interval,
|
359
|
+
},
|
360
|
+
"return_logprob": return_logprob,
|
361
|
+
"stream": True,
|
362
|
+
**({"parallel_batch": parallel_batch} if parallel_batch else {}),
|
363
|
+
}
|
364
|
+
if dataset_name == "mmmu":
|
365
|
+
# vlm
|
366
|
+
input_ids = []
|
367
|
+
for input_req in input_requests:
|
368
|
+
input_ids += [tokenizer.encode(input_req.prompt)]
|
369
|
+
payload["image_data"] = [req.image_data for req in input_requests]
|
370
|
+
|
371
|
+
else:
|
372
|
+
input_ids = [req.prompt for req in input_requests]
|
373
|
+
|
374
|
+
payload["input_ids"] = input_ids
|
375
|
+
|
174
376
|
response = requests.post(
|
175
377
|
url + "/generate",
|
176
|
-
json=
|
177
|
-
"input_ids": [req.prompt for req in input_requests],
|
178
|
-
"sampling_params": {
|
179
|
-
"temperature": temperature,
|
180
|
-
"max_new_tokens": output_len,
|
181
|
-
"ignore_eos": True,
|
182
|
-
"json_schema": json_schema,
|
183
|
-
"stream_interval": stream_interval,
|
184
|
-
},
|
185
|
-
"return_logprob": return_logprob,
|
186
|
-
"stream": True,
|
187
|
-
},
|
378
|
+
json=payload,
|
188
379
|
stream=True,
|
189
380
|
)
|
190
381
|
|
@@ -248,10 +439,100 @@ def run_one_case(
|
|
248
439
|
overall_throughput,
|
249
440
|
last_gen_throughput,
|
250
441
|
acc_length,
|
251
|
-
profile_link
|
442
|
+
profile_link,
|
252
443
|
)
|
253
444
|
|
254
445
|
|
446
|
+
def save_results_as_json(result: List[Tuple], bench_args: BenchArgs, model: str):
    """Serialize the benchmark result rows to ``bench_args.output_path`` as JSON.

    Each row of ``result`` is paired by position with the matching entry of the
    (batch_size, input_len, output_len) grid built from ``bench_args``, wrapped
    in a ``BenchmarkResult`` model and dumped to a plain dict before writing.
    """
    # Same grid order as the result rows are matched against, by index.
    grid = list(
        itertools.product(
            bench_args.batch_size, bench_args.input_len, bench_args.output_len
        )
    )

    records = []
    for idx, row in enumerate(result):
        (
            batch_size,
            latency,
            ttft,
            input_throughput,
            output_throughput,
            overall_throughput,
            last_gen_throughput,
            acc_length,
            profile_link,
        ) = row

        # The batch size comes from the row itself; only the lengths are
        # taken from the grid entry.
        _, input_len, output_len = grid[idx]

        # Resolve trace files only when the run produced a profile location.
        profile_links = (
            parse_profile_links(profile_link, batch_size, input_len, output_len)
            if profile_link
            else None
        )

        record = BenchmarkResult(
            model_path=model,
            run_name=bench_args.run_name,
            batch_size=batch_size,
            input_len=input_len,
            output_len=output_len,
            latency=latency,
            ttft=ttft,
            input_throughput=input_throughput,
            output_throughput=output_throughput,
            overall_throughput=overall_throughput,
            last_gen_throughput=last_gen_throughput,
            acc_length=acc_length,
            profile_links=profile_links,
        )
        records.append(record.model_dump())

    with open(bench_args.output_path, "w", encoding="utf-8") as out_file:
        json.dump(records, out_file, indent=2, ensure_ascii=False)

    print(f"Results saved as JSON to {bench_args.output_path}")
|
500
|
+
|
501
|
+
|
502
|
+
def parse_profile_links(
    profile_dir: str, batch_size: int, input_len: int, output_len: int
) -> "Optional[ProfileLinks]":
    """Locate the extend and decode trace files inside a profile directory.

    Only files ending in ``.trace.json.gz`` or ``.trace.json`` are considered.
    A file counts as an extend trace when its lower-cased name contains
    ``extend`` or ``prefill``, and as a decode trace when it contains
    ``decode``. For each stage, a file whose name carries the
    ``_batch{batch_size}_input{input_len}_output{output_len}_`` tag of this
    run is preferred over an untagged one.

    Args:
        profile_dir: Directory to scan for trace files.
        batch_size: Batch size of the run, used to build the file-name tag.
        input_len: Input length of the run, used to build the file-name tag.
        output_len: Output length of the run, used to build the file-name tag.

    Returns:
        A ``ProfileLinks`` with the paths that were found (either may be
        ``None``), or ``None`` when ``profile_dir`` is empty, is not a
        directory, or holds no matching trace file.
    """
    # isdir rather than exists: a path to a regular file would otherwise reach
    # os.listdir and raise NotADirectoryError.
    if not profile_dir or not os.path.isdir(profile_dir):
        return None

    run_tag = f"_batch{batch_size}_input{input_len}_output{output_len}_"
    tagged = {}  # stage -> first trace whose name carries this run's tag
    generic = {}  # stage -> first trace of that stage without the tag

    # os.listdir order is arbitrary; sorting makes the pick reproducible when
    # a stage has several trace files.
    for name in sorted(os.listdir(profile_dir)):
        if not name.endswith((".trace.json.gz", ".trace.json")):
            continue

        lowered = name.lower()
        if "extend" in lowered or "prefill" in lowered:
            stage = "extend"
        elif "decode" in lowered:
            stage = "decode"
        else:
            continue

        bucket = tagged if run_tag in name else generic
        bucket.setdefault(stage, os.path.join(profile_dir, name))

    extend_link = tagged.get("extend") or generic.get("extend")
    decode_link = tagged.get("decode") or generic.get("decode")

    if extend_link or decode_link:
        return ProfileLinks(extend=extend_link, decode=decode_link)

    return None
|
534
|
+
|
535
|
+
|
255
536
|
def get_report_summary(
|
256
537
|
result: List[Tuple], server_args: ServerArgs, bench_args: BenchArgs
|
257
538
|
):
|
@@ -342,9 +623,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
|
|
342
623
|
return_logprob=bench_args.return_logprob,
|
343
624
|
stream_interval=bench_args.client_stream_interval,
|
344
625
|
input_len_step_percentage=bench_args.input_len_step_percentage,
|
626
|
+
dataset_name=bench_args.dataset_name,
|
345
627
|
run_name="",
|
346
628
|
result_filename="",
|
347
629
|
tokenizer=tokenizer,
|
630
|
+
dataset_path=bench_args.dataset_path,
|
631
|
+
parallel_batch=bench_args.parallel_batch,
|
348
632
|
)
|
349
633
|
print("=" * 8 + " Warmup End " + "=" * 8 + "\n")
|
350
634
|
|
@@ -366,8 +650,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
|
|
366
650
|
stream_interval=bench_args.client_stream_interval,
|
367
651
|
input_len_step_percentage=bench_args.input_len_step_percentage,
|
368
652
|
run_name=bench_args.run_name,
|
653
|
+
dataset_name=bench_args.dataset_name,
|
369
654
|
result_filename=bench_args.result_filename,
|
370
655
|
tokenizer=tokenizer,
|
656
|
+
dataset_path=bench_args.dataset_path,
|
657
|
+
parallel_batch=bench_args.parallel_batch,
|
658
|
+
profile_filename_prefix=bench_args.profile_filename_prefix,
|
371
659
|
)
|
372
660
|
)
|
373
661
|
|
@@ -390,9 +678,13 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
|
|
390
678
|
run_name=bench_args.run_name,
|
391
679
|
result_filename=bench_args.result_filename,
|
392
680
|
tokenizer=tokenizer,
|
681
|
+
dataset_name=bench_args.dataset_name,
|
393
682
|
profile=bench_args.profile,
|
394
683
|
profile_steps=bench_args.profile_steps,
|
395
684
|
profile_by_stage=bench_args.profile_by_stage,
|
685
|
+
dataset_path=bench_args.dataset_path,
|
686
|
+
parallel_batch=bench_args.parallel_batch,
|
687
|
+
profile_filename_prefix=bench_args.profile_filename_prefix,
|
396
688
|
)[-1],
|
397
689
|
)
|
398
690
|
)
|
@@ -405,13 +697,16 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
|
|
405
697
|
|
406
698
|
print(f"\nResults are saved to {bench_args.result_filename}")
|
407
699
|
|
700
|
+
# Save results as JSON if output_path is specified
|
701
|
+
if bench_args.output_path:
|
702
|
+
save_results_as_json(result, bench_args, model=server_args.model_path)
|
703
|
+
|
408
704
|
if not bench_args.show_report:
|
409
705
|
return
|
410
706
|
|
411
707
|
summary = get_report_summary(result, server_args, bench_args)
|
412
|
-
print(summary)
|
413
708
|
|
414
|
-
if is_in_ci():
|
709
|
+
if is_in_ci() and bench_args.append_to_github_summary:
|
415
710
|
write_github_step_summary(summary)
|
416
711
|
|
417
712
|
|
@@ -420,6 +715,10 @@ def main():
|
|
420
715
|
ServerArgs.add_cli_args(parser)
|
421
716
|
BenchArgs.add_cli_args(parser)
|
422
717
|
args = parser.parse_args()
|
718
|
+
|
719
|
+
random.seed(args.seed)
|
720
|
+
np.random.seed(args.seed)
|
721
|
+
|
423
722
|
server_args = ServerArgs.from_cli_args(args)
|
424
723
|
bench_args = BenchArgs.from_cli_args(args)
|
425
724
|
|