sglang 0.5.2rc2__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the changes between two package versions as they were publicly released to their registry. It is provided for informational purposes only.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +130 -59
- sglang/srt/entrypoints/openai/protocol.py +112 -4
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +204 -55
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -6
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +190 -55
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +144 -17
- sglang/srt/managers/scheduler.py +502 -209
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +320 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +82 -40
- sglang/srt/model_executor/model_runner.py +432 -157
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +966 -267
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +99 -28
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +433 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/RECORD +375 -245
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py
CHANGED
@@ -75,6 +75,7 @@ class RequestFuncInput:
     lora_name: str
     image_data: Optional[List[str]]
     extra_request_body: Dict[str, Any]
+    timestamp: Optional[float] = None


 @dataclass
@@ -104,10 +105,13 @@ def remove_suffix(text: str, suffix: str) -> str:


 def get_auth_headers() -> Dict[str, str]:
-    api_key = os.environ.get("OPENAI_API_KEY")
-    if api_key:
-        return {"Authorization": f"Bearer {api_key}"}
+    openai_api_key = os.environ.get("OPENAI_API_KEY")
+    if openai_api_key:
+        return {"Authorization": f"Bearer {openai_api_key}"}
     else:
+        api_key = os.environ.get("API_KEY")
+        if api_key:
+            return {"Authorization": f"{api_key}"}
        return {}

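
For orientation, a small usage sketch of the new fallback ordering (the credential strings are made up): `OPENAI_API_KEY` is still sent as a Bearer token, while the new `API_KEY` variable is sent verbatim, so it can carry a custom scheme.

```python
import os
from sglang.bench_serving import get_auth_headers

os.environ.pop("OPENAI_API_KEY", None)
os.environ["API_KEY"] = "Basic dXNlcjpwYXNz"  # illustrative credential
print(get_auth_headers())  # {'Authorization': 'Basic dXNlcjpwYXNz'}

os.environ["OPENAI_API_KEY"] = "sk-example"  # takes precedence when set
print(get_auth_headers())  # {'Authorization': 'Bearer sk-example'}
```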
@@ -204,6 +208,10 @@ async def async_request_openai_completions(
         "ignore_eos": not args.disable_ignore_eos,
         **request_func_input.extra_request_body,
     }
+
+    if request_func_input.image_data:
+        payload.update({"image_data": request_func_input.image_data})
+
     headers = get_auth_headers()

     output = RequestFuncOutput.init_new(request_func_input)
@@ -627,7 +635,7 @@ def get_tokenizer(
     if pretrained_model_name_or_path.endswith(
         ".json"
     ) or pretrained_model_name_or_path.endswith(".model"):
-        from sglang.srt.hf_transformers_utils import get_tokenizer
+        from sglang.srt.utils.hf_transformers_utils import get_tokenizer

         return get_tokenizer(pretrained_model_name_or_path)

@@ -696,6 +704,24 @@ def get_dataset(args, tokenizer):
             apply_chat_template=args.apply_chat_template,
             random_sample=True,
         )
+    elif args.dataset_name == "mooncake":
+        # For mooncake, we don't generate the prompts here.
+        # We just load the raw trace data. The async generator will handle the rest.
+        if not args.dataset_path:
+            local_path = os.path.join("/tmp", args.mooncake_workload + "_trace.jsonl")
+        else:
+            local_path = args.dataset_path
+
+        if not os.path.exists(local_path):
+            download_and_cache_file(
+                MOONCAKE_DATASET_URL[args.mooncake_workload], local_path
+            )
+
+        with open(local_path, "r") as f:
+            all_requests_data = [json.loads(line) for line in f if line.strip()]
+
+        # Limit the number of requests based on --num-prompts
+        input_requests = all_requests_data[: args.num_prompts]
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
     return input_requests
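
The loader expects one JSON object per line. Judging only from the fields the replay code reads (`timestamp`, `hash_ids`, `output_length`), a trace record looks roughly like this; the values below are made up:

```python
import json

line = '{"timestamp": 172800000, "hash_ids": [3, 14, 59], "output_length": 256}'
record = json.loads(line)
# timestamp: arrival time in milliseconds; hash_ids: prefix-block ids used to
# synthesize a shared-prefix prompt; output_length: tokens to generate.
print(record["timestamp"], record["hash_ids"], record["output_length"])
```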
@@ -750,6 +776,12 @@ class BenchmarkMetrics:


 SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
+MOONCAKE_DATASET_URL = {
+    "mooncake": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/arxiv-trace/mooncake_trace.jsonl",
+    "conversation": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/conversation_trace.jsonl",
+    "synthetic": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/synthetic_trace.jsonl",
+    "toolagent": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/toolagent_trace.jsonl",
+}


 def download_and_cache_file(url: str, filename: Optional[str] = None):
@@ -808,6 +840,80 @@ class DatasetRow:
     prompt_len: int
     output_len: int
     image_data: Optional[List[str]] = None
+    timestamp: Optional[float] = None
+
+
+async def get_mooncake_request_over_time(
+    input_requests: List[Dict],
+    tokenizer: PreTrainedTokenizerBase,
+    slowdown_factor: float,
+    num_rounds: int,
+) -> AsyncGenerator[DatasetRow, None]:
+    """
+    An async generator that yields requests based on the timestamps in the Mooncake trace file,
+    with support for multi-round sessions.
+    """
+    if not input_requests:
+        return
+
+    input_requests.sort(key=lambda r: r["timestamp"])
+
+    start_time = time.perf_counter()
+    trace_start_time_ms = input_requests[0]["timestamp"]
+
+    for record in input_requests:
+        # Calculate when this entire session should start
+        relative_arrival_time_s = (record["timestamp"] - trace_start_time_ms) / 1000.0
+        target_arrival_time_s = relative_arrival_time_s * slowdown_factor
+
+        current_elapsed_time_s = time.perf_counter() - start_time
+        sleep_duration_s = target_arrival_time_s - current_elapsed_time_s
+        if sleep_duration_s > 0:
+            await asyncio.sleep(sleep_duration_s)
+
+        # Once the session starts, generate all rounds for it as a burst
+        # This simulates a user engaging in a multi-turn conversation
+
+        # Base user query constructed from hash_ids
+        user_query_base = ""
+        hash_ids = record.get("hash_ids", [])
+        for hash_id in hash_ids:
+            user_query_base += f"{hash_id}" + " ".join(
+                ["hi"] * 128
+            )  # Shorter for multi-round
+        user_query_base += "Tell me a story based on this context."
+
+        output_len_per_round = record.get("output_length", 256)
+        chat_history = []
+
+        for i in range(num_rounds):
+            # Add user query for the current round
+            chat_history.append(
+                {"role": "user", "content": f"Round {i+1}: {user_query_base}"}
+            )
+
+            # Form the full prompt from history
+            try:
+                full_prompt_text = tokenizer.apply_chat_template(
+                    chat_history, tokenize=False, add_generation_prompt=True
+                )
+            except Exception:
+                full_prompt_text = "\n".join(
+                    [f"{msg['role']}: {msg['content']}" for msg in chat_history]
+                )
+
+            prompt_len = len(tokenizer.encode(full_prompt_text))
+
+            yield DatasetRow(
+                prompt=full_prompt_text,
+                prompt_len=prompt_len,
+                output_len=output_len_per_round,
+            )
+
+            # Add a placeholder assistant response for the next round's context
+            # We use a placeholder because we don't know the real response
+            placeholder_response = " ".join(["story"] * output_len_per_round)
+            chat_history.append({"role": "assistant", "content": placeholder_response})


 def sample_mmmu_requests(
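
A minimal consumption sketch; `records` stands for the list of dicts returned by `get_dataset` for the mooncake dataset and `tokenizer` for a loaded HF tokenizer, both assumed prepared elsewhere:

```python
import asyncio
from sglang.bench_serving import get_mooncake_request_over_time

async def replay(records, tokenizer):
    # Each yielded DatasetRow is one round of one session; arrivals are paced
    # by the trace timestamps (slowdown_factor=1.0 replays in real time).
    async for row in get_mooncake_request_over_time(
        records, tokenizer, slowdown_factor=1.0, num_rounds=2
    ):
        print(row.prompt_len, row.output_len)

# asyncio.run(replay(records, tokenizer))
```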
@@ -896,17 +1002,25 @@ def sample_mmmu_requests(
         prompt = f"Question: {question}\n\nAnswer: "
         if apply_chat_template:
             try:
+                is_phi4_multimodal = (
+                    "phi-4-multimodal" in tokenizer.name_or_path.lower()
+                )
+                if is_phi4_multimodal:
+                    # <|endoftext10|> is the image token used in the phi-4-multimodal model.
+                    content = prompt.replace("image 1", "<|endoftext10|>")
+                else:
+                    content = [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": image_data},
+                        },
+                        {"type": "text", "text": prompt},
+                    ]
                 prompt = tokenizer.apply_chat_template(
                     [
                         {
                             "role": "user",
-                            "content": [
-                                {
-                                    "type": "image_url",
-                                    "image_url": {"url": image_data},
-                                },
-                                {"type": "text", "text": prompt},
-                            ],
+                            "content": content,
                         }
                     ],
                     add_generation_prompt=True,
@@ -1000,7 +1114,8 @@ def sample_sharegpt_requests(
                 add_generation_prompt=True,
                 tokenize=False,
             )
-
+            if tokenizer.bos_token:
+                prompt = prompt.replace(tokenizer.bos_token, "")

         prompt_token_ids = tokenizer.encode(prompt)
         completion = dataset[i][1]
@@ -1359,19 +1474,41 @@ def sample_generated_shared_prefix_requests(
 async def get_request(
     input_requests: List[DatasetRow],
     request_rate: float,
+    use_trace_timestamps: bool = False,
+    slowdown_factor: float = 1.0,
 ) -> AsyncGenerator[DatasetRow, None]:
-
-
-
+    if use_trace_timestamps:
+        print(
+            f"Using trace timestamps for request generation with slowdown factor {slowdown_factor}."
+        )
+        # Sort requests by timestamp for correct replay
+        input_requests.sort(key=lambda r: r.timestamp)

-
-
-
+        start_time = time.perf_counter()
+        trace_start_time_ms = input_requests[0].timestamp if input_requests else 0
+
+        for request in input_requests:
+            trace_time_s = (request.timestamp - trace_start_time_ms) / 1000.0
+            target_arrival_time = start_time + (trace_time_s * slowdown_factor)
+
+            sleep_duration = target_arrival_time - time.perf_counter()
+            if sleep_duration > 0:
+                await asyncio.sleep(sleep_duration)
+
+            yield request
+    else:
+        input_requests_iter = iter(input_requests)
+        for request in input_requests_iter:
+            yield request
+
+            if request_rate == float("inf"):
+                # If the request rate is infinity, then we don't need to wait.
+                continue

-
-
-
-
+            # Sample the request interval from the exponential distribution.
+            interval = np.random.exponential(1.0 / request_rate)
+            # The next request will be sent after the interval.
+            await asyncio.sleep(interval)


 def calculate_metrics(
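
In the non-trace branch the gap between consecutive requests is drawn from an exponential distribution, so arrivals form a Poisson process; with `request_rate=2.0` the mean gap is 0.5 s, as this quick check illustrates:

```python
import numpy as np

request_rate = 2.0  # requests per second
gaps = np.random.exponential(1.0 / request_rate, size=100_000)
print(round(gaps.mean(), 2))  # ~0.5 seconds between requests on average
```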
@@ -1397,7 +1534,7 @@ def calculate_metrics(
             tokenizer.encode(outputs[i].generated_text, add_special_tokens=False)
         )
         retokenized_output_lens.append(retokenized_output_len)
-        total_input += input_requests[i].prompt_len
+        total_input += outputs[i].prompt_len
         if output_len > 1:
             tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
         itls += outputs[i].itl
@@ -1469,6 +1606,9 @@ async def benchmark(
     pd_separated: bool = False,
     flush_cache: bool = False,
     warmup_requests: int = 1,
+    use_trace_timestamps: bool = False,
+    mooncake_slowdown_factor=1.0,
+    mooncake_num_rounds=1,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -1488,8 +1628,32 @@
     # Warmup
     print(f"Starting warmup with {warmup_requests} sequences...")

-    #
-
+    # Handle the data structure difference for the warmup request
+    if args.dataset_name == "mooncake":
+        # For mooncake, input_requests is a list of dicts.
+        # We need to build a temporary DatasetRow for the warmup phase.
+        warmup_record = input_requests[0]
+
+        # Build prompt from hash_ids, just like in the async generator
+        hash_ids = warmup_record.get("hash_ids", [])
+        prompt_text = ""
+        for hash_id in hash_ids:
+            prompt_text += f"{hash_id}" + " ".join(["hi"] * 512)
+        prompt_text += "Can you tell me a detailed story in 1000 words?"
+
+        output_len = warmup_record.get("output_length", 32)
+        prompt_len = len(tokenizer.encode(prompt_text))
+
+        # Create a temporary DatasetRow object for warmup
+        test_request = DatasetRow(
+            prompt=prompt_text,
+            prompt_len=prompt_len,
+            output_len=output_len,
+            image_data=None,  # Mooncake doesn't have image data
+        )
+    else:
+        # For all other datasets, input_requests is a list of DatasetRow objects
+        test_request = input_requests[0]

     if lora_names is not None and len(lora_names) != 0:
         lora_name = lora_names[0]
|
|
1543
1707
|
if profile_output.success:
|
1544
1708
|
print("Profiler started")
|
1545
1709
|
|
1546
|
-
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
1547
|
-
|
1548
1710
|
# Run all requests
|
1549
1711
|
benchmark_start_time = time.perf_counter()
|
1550
1712
|
tasks: List[asyncio.Task] = []
|
1551
|
-
|
1713
|
+
pbar_total = len(input_requests)
|
1714
|
+
if (
|
1715
|
+
backend == "sglang" and args.dataset_name == "mooncake"
|
1716
|
+
): # Assuming mooncake is mainly for sglang or similar backends
|
1717
|
+
print("Using time-based Mooncake request scheduler, ignoring --request-rate.")
|
1718
|
+
request_generator = get_mooncake_request_over_time(
|
1719
|
+
input_requests, tokenizer, mooncake_slowdown_factor, mooncake_num_rounds
|
1720
|
+
)
|
1721
|
+
print(
|
1722
|
+
f"Starting Mooncake trace replay. Sessions: {len(input_requests)}, Rounds per session: {mooncake_num_rounds}. Slowdown factor: {mooncake_slowdown_factor}"
|
1723
|
+
)
|
1724
|
+
pbar_total *= args.mooncake_num_rounds
|
1725
|
+
else:
|
1726
|
+
request_generator = get_request(input_requests, request_rate)
|
1727
|
+
|
1728
|
+
pbar = None if disable_tqdm else tqdm(total=pbar_total)
|
1729
|
+
async for request in request_generator:
|
1552
1730
|
if lora_names is not None and len(lora_names) != 0:
|
1553
1731
|
idx = random.randint(0, len(lora_names) - 1)
|
1554
1732
|
lora_name = lora_names[idx]
|
@@ -1564,6 +1742,7 @@ async def benchmark(
             lora_name=lora_name,
             image_data=request.image_data,
             extra_request_body=extra_request_body,
+            timestamp=request.timestamp,
         )

         tasks.append(
@@ -1584,7 +1763,9 @@ async def benchmark(
         pbar.close()

     if "sglang" in backend:
-        server_info = requests.get(base_url + "/get_server_info")
+        server_info = requests.get(
+            base_url + "/get_server_info", headers=get_auth_headers()
+        )
         if server_info.status_code == 200:
             server_info_json = server_info.json()
             if "decode" in server_info_json:
@@ -1609,7 +1790,11 @@ async def benchmark(

     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
     print("{:<40} {:<10}".format("Backend:", backend))
-    print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
+    print(
+        "{:<40} {:<10}".format(
+            "Traffic request rate:", "trace" if use_trace_timestamps else request_rate
+        )
+    )
     print(
         "{:<40} {:<10}".format(
             "Max request concurrency:",
@@ -1678,7 +1863,7 @@ async def benchmark(
         # Arguments
         "backend": args.backend,
         "dataset_name": args.dataset_name,
-        "request_rate": request_rate,
+        "request_rate": "trace" if use_trace_timestamps else request_rate,
         "max_concurrency": max_concurrency,
         "sharegpt_output_len": args.sharegpt_output_len,
         "random_input_len": args.random_input_len,
@@ -1731,7 +1916,9 @@ async def benchmark(
     elif args.dataset_name.startswith("random"):
         output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
     else:
-        output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.dataset_name}.jsonl"
+        output_file_name = (
+            f"{args.backend}_{now}_{args.num_prompts}_{args.dataset_name}.jsonl"
+        )

     result_details = {
         "input_lens": [output.prompt_len for output in outputs],
@@ -1786,6 +1973,17 @@ def run_benchmark(args_: argparse.Namespace):
     if not hasattr(args, "tokenize_prompt"):
         args.tokenize_prompt = False

+    if not hasattr(args, "use_trace_timestamps"):
+        args.use_trace_timestamps = False
+    if not hasattr(args, "mooncake_slowdown_factor"):
+        args.mooncake_slowdown_factor = 1.0
+
+    if not hasattr(args, "mooncake_slowdown_factor"):
+        args.mooncake_slowdown_factor = 1.0
+
+    if not hasattr(args, "mooncake_num_rounds"):
+        args.mooncake_num_rounds = 1
+
     print(f"benchmark_args={args}")

     # Set global environments
@@ -1919,6 +2117,9 @@ def run_benchmark(args_: argparse.Namespace):
             pd_separated=args.pd_separated,
             flush_cache=args.flush_cache,
             warmup_requests=args.warmup_requests,
+            use_trace_timestamps=args.use_trace_timestamps,
+            mooncake_slowdown_factor=args.mooncake_slowdown_factor,
+            mooncake_num_rounds=args.mooncake_num_rounds,
         )
     )

@@ -1975,6 +2176,7 @@ if __name__ == "__main__":
             "generated-shared-prefix",
             "mmmu",
             "random-image",
+            "mooncake",
         ],
         help="Name of the dataset to benchmark on.",
     )
@@ -2051,6 +2253,11 @@ if __name__ == "__main__":
         help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
         "Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
     )
+    parser.add_argument(
+        "--use-trace-timestamps",
+        action="store_true",
+        help="Use timestamps from the trace file for request scheduling. Only valid for 'mooncake' dataset.",
+    )
     parser.add_argument(
         "--max-concurrency",
         type=int,
@@ -2174,5 +2381,33 @@ if __name__ == "__main__":
         default=256,
         help="Target length in tokens for outputs in generated-shared-prefix dataset",
     )
+    mooncake_group = parser.add_argument_group("mooncake dataset arguments")
+    mooncake_group.add_argument(
+        "--mooncake-slowdown-factor",
+        type=float,
+        default=1.0,
+        help="Slowdown factor for replaying the mooncake trace. "
+        "A value of 2.0 means the replay is twice as slow. "
+        "NOTE: --request-rate is IGNORED in mooncake mode.",
+    )
+    mooncake_group.add_argument(
+        "--mooncake-num-rounds",
+        type=int,
+        default=1,
+        help="Number of conversation rounds for each session in the mooncake dataset. "
+        "A value > 1 will enable true multi-turn session benchmarking.",
+    )
+    mooncake_group.add_argument(
+        "--mooncake-workload",
+        type=str,
+        default="conversation",
+        choices=[
+            "mooncake",
+            "conversation",
+            "synthetic",
+            "toolagent",
+        ],
+        help="Underlying workload for the mooncake dataset.",
+    )
     args = parser.parse_args()
     run_benchmark(args)
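
Putting the new options together, a mooncake trace replay against a local sglang server might be launched like this (a sketch; the prompt count and flag values are illustrative):

```python
import subprocess

subprocess.run(
    [
        "python3", "-m", "sglang.bench_serving",
        "--backend", "sglang",
        "--dataset-name", "mooncake",
        "--mooncake-workload", "conversation",
        "--mooncake-num-rounds", "3",
        "--mooncake-slowdown-factor", "2.0",
        "--num-prompts", "100",
    ],
    check=True,
)
```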
sglang/global_config.py
CHANGED
@@ -37,8 +37,8 @@ class GlobalConfig:
         )
         # Runtime constants: others
         self.retract_decode_steps = 20
-        self.flashinfer_workspace_size = os.environ.get(
-            "FLASHINFER_WORKSPACE_SIZE", 384 * 1024 * 1024
+        self.flashinfer_workspace_size = int(
+            os.environ.get("FLASHINFER_WORKSPACE_SIZE", 384 * 1024 * 1024)
         )

         # Output tokenization configs
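
The lookup is now wrapped in `int()`, which matters because environment variables arrive as strings; a quick illustration:

```python
import os

os.environ["FLASHINFER_WORKSPACE_SIZE"] = str(512 * 1024 * 1024)
# Without int(), the override would be the string "536870912";
# with the new code both the override and the default come out as int.
workspace = int(os.environ.get("FLASHINFER_WORKSPACE_SIZE", 384 * 1024 * 1024))
assert isinstance(workspace, int) and workspace == 512 * 1024 * 1024
```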
sglang/lang/backend/runtime_endpoint.py
CHANGED
@@ -433,7 +433,7 @@ class Runtime:
         self.endpoint.cache_prefix(prefix)

     def get_tokenizer(self):
-        from sglang.srt.hf_transformers_utils import get_tokenizer
+        from sglang.srt.utils.hf_transformers_utils import get_tokenizer

         return get_tokenizer(
             self.server_args.tokenizer_path,
sglang/launch_server.py
CHANGED
@@ -7,9 +7,23 @@ from sglang.srt.entrypoints.http_server import launch_server
 from sglang.srt.server_args import prepare_server_args
 from sglang.srt.utils import kill_process_tree

+MOVE_ENVS_WARN = """
+########################################################################
+# For contributors and developers:                                     #
+# Please move environment variable definitions to sglang.srt.environ   #
+# using the following pattern:                                         #
+#     SGLANG_XXX = EnvBool(False)                                      #
+#                                                                      #
+########################################################################
+"""
+
 if __name__ == "__main__":
     server_args = prepare_server_args(sys.argv[1:])

+    from sglang.srt.server_args import print_deprecated_warning
+
+    print_deprecated_warning(MOVE_ENVS_WARN)
+
     try:
         launch_server(server_args)
     finally:
sglang/profiler.py
CHANGED
@@ -15,7 +15,7 @@ from typing import List, Optional

 import requests

-
+PROFILER_DIR = os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp")


 def _run_profile(
@@ -27,7 +27,7 @@ def _run_profile(
     profile_by_stage: bool = False,
 ) -> str:
     if output_dir is None:
-        output_dir =
+        output_dir = PROFILER_DIR

     output_dir = os.path.normpath(output_dir)
     output_dir = os.path.abspath(output_dir)
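
Since the default directory is now read once at import time into a module-level constant, the environment variable must be set before the first import; a sketch:

```python
import os

os.environ["SGLANG_TORCH_PROFILER_DIR"] = "/data/sglang_profiles"  # before import

from sglang.profiler import PROFILER_DIR

print(PROFILER_DIR)  # /data/sglang_profiles
```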
sglang/srt/batch_invariant_ops/__init__.py
ADDED
@@ -0,0 +1,27 @@
+# Adapted from https://github.com/thinking-machines-lab/batch_invariant_ops/blob/main/batch_invariant_ops/__init__.py
+
+from .batch_invariant_ops import (
+    AttentionBlockSize,
+    disable_batch_invariant_mode,
+    enable_batch_invariant_mode,
+    get_batch_invariant_attention_block_size,
+    is_batch_invariant_mode_enabled,
+    log_softmax,
+    matmul_persistent,
+    mean_dim,
+    set_batch_invariant_mode,
+)
+
+__version__ = "0.1.0"
+
+__all__ = [
+    "set_batch_invariant_mode",
+    "is_batch_invariant_mode_enabled",
+    "disable_batch_invariant_mode",
+    "enable_batch_invariant_mode",
+    "matmul_persistent",
+    "log_softmax",
+    "mean_dim",
+    "get_batch_invariant_attention_block_size",
+    "AttentionBlockSize",
+]
|