sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +47 -28
- sglang/bench_one_batch_server.py +41 -25
- sglang/bench_serving.py +330 -156
- sglang/check_env.py +1 -1
- sglang/compile_deep_gemm.py +6 -2
- sglang/global_config.py +1 -25
- sglang/lang/api.py +6 -0
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +13 -0
- sglang/launch_server.py +8 -15
- sglang/profiler.py +18 -1
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
- sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
- sglang/srt/compilation/backend.py +437 -0
- sglang/srt/compilation/compilation_config.py +20 -0
- sglang/srt/compilation/compilation_counter.py +47 -0
- sglang/srt/compilation/compile.py +210 -0
- sglang/srt/compilation/compiler_interface.py +503 -0
- sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
- sglang/srt/compilation/fix_functionalization.py +134 -0
- sglang/srt/compilation/fx_utils.py +83 -0
- sglang/srt/compilation/inductor_pass.py +140 -0
- sglang/srt/compilation/pass_manager.py +66 -0
- sglang/srt/compilation/piecewise_context_manager.py +40 -0
- sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/deepseek_ocr.py +262 -0
- sglang/srt/configs/deepseekvl2.py +194 -96
- sglang/srt/configs/dots_vlm.py +2 -7
- sglang/srt/configs/falcon_h1.py +13 -64
- sglang/srt/configs/load_config.py +25 -2
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +134 -23
- sglang/srt/configs/modelopt_config.py +30 -0
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/olmo3.py +105 -0
- sglang/srt/configs/points_v15_chat.py +29 -0
- sglang/srt/configs/qwen3_next.py +11 -47
- sglang/srt/configs/qwen3_omni.py +613 -0
- sglang/srt/configs/qwen3_vl.py +0 -10
- sglang/srt/connector/remote_instance.py +1 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/constrained/llguidance_backend.py +5 -0
- sglang/srt/constrained/outlines_backend.py +1 -1
- sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
- sglang/srt/constrained/utils.py +12 -0
- sglang/srt/constrained/xgrammar_backend.py +20 -11
- sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
- sglang/srt/disaggregation/base/conn.py +17 -4
- sglang/srt/disaggregation/common/conn.py +4 -2
- sglang/srt/disaggregation/decode.py +123 -31
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +11 -3
- sglang/srt/disaggregation/mooncake/conn.py +157 -19
- sglang/srt/disaggregation/nixl/conn.py +69 -24
- sglang/srt/disaggregation/prefill.py +96 -270
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
- sglang/srt/distributed/device_communicators/pynccl.py +24 -12
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
- sglang/srt/distributed/naive_distributed.py +5 -4
- sglang/srt/distributed/parallel_state.py +70 -19
- sglang/srt/elastic_ep/elastic_ep.py +74 -0
- sglang/srt/entrypoints/context.py +3 -2
- sglang/srt/entrypoints/engine.py +66 -66
- sglang/srt/entrypoints/grpc_server.py +431 -234
- sglang/srt/entrypoints/harmony_utils.py +2 -2
- sglang/srt/entrypoints/http_server.py +120 -8
- sglang/srt/entrypoints/http_server_engine.py +1 -7
- sglang/srt/entrypoints/openai/protocol.py +225 -37
- sglang/srt/entrypoints/openai/serving_base.py +49 -2
- sglang/srt/entrypoints/openai/serving_chat.py +29 -74
- sglang/srt/entrypoints/openai/serving_classify.py +204 -0
- sglang/srt/entrypoints/openai/serving_completions.py +15 -1
- sglang/srt/entrypoints/openai/serving_responses.py +5 -2
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +42 -4
- sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
- sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
- sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
- sglang/srt/eplb/expert_distribution.py +3 -4
- sglang/srt/eplb/expert_location_dispatch.py +2 -2
- sglang/srt/eplb/expert_location_updater.py +2 -2
- sglang/srt/function_call/base_format_detector.py +17 -18
- sglang/srt/function_call/function_call_parser.py +18 -14
- sglang/srt/function_call/glm4_moe_detector.py +1 -5
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/function_call/json_array_parser.py +0 -2
- sglang/srt/function_call/utils.py +2 -2
- sglang/srt/grpc/compile_proto.py +3 -3
- sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
- sglang/srt/grpc/health_servicer.py +189 -0
- sglang/srt/grpc/scheduler_launcher.py +181 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
- sglang/srt/layers/activation.py +4 -1
- sglang/srt/layers/attention/aiter_backend.py +3 -3
- sglang/srt/layers/attention/ascend_backend.py +17 -1
- sglang/srt/layers/attention/attention_registry.py +43 -23
- sglang/srt/layers/attention/base_attn_backend.py +20 -1
- sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
- sglang/srt/layers/attention/fla/chunk.py +0 -1
- sglang/srt/layers/attention/fla/chunk_o.py +1 -1
- sglang/srt/layers/attention/fla/index.py +0 -2
- sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
- sglang/srt/layers/attention/fla/utils.py +0 -3
- sglang/srt/layers/attention/fla/wy_fast.py +0 -2
- sglang/srt/layers/attention/flashattention_backend.py +12 -8
- sglang/srt/layers/attention/flashinfer_backend.py +248 -21
- sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
- sglang/srt/layers/attention/flashmla_backend.py +2 -2
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
- sglang/srt/layers/attention/intel_amx_backend.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
- sglang/srt/layers/attention/mamba/mamba.py +189 -241
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
- sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
- sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
- sglang/srt/layers/attention/nsa/utils.py +0 -1
- sglang/srt/layers/attention/nsa_backend.py +404 -90
- sglang/srt/layers/attention/triton_backend.py +208 -34
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
- sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
- sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
- sglang/srt/layers/attention/utils.py +11 -7
- sglang/srt/layers/attention/vision.py +3 -3
- sglang/srt/layers/attention/xpu_backend.py +1028 -0
- sglang/srt/layers/communicator.py +11 -7
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
- sglang/srt/layers/dp_attention.py +17 -0
- sglang/srt/layers/layernorm.py +45 -15
- sglang/srt/layers/linear.py +9 -1
- sglang/srt/layers/logits_processor.py +147 -17
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_moe.py +0 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
- sglang/srt/layers/moe/ep_moe/layer.py +119 -397
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
- sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/moe_runner/triton.py +3 -1
- sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
- sglang/srt/layers/moe/router.py +51 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
- sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
- sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
- sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
- sglang/srt/layers/moe/topk.py +3 -2
- sglang/srt/layers/moe/utils.py +17 -1
- sglang/srt/layers/quantization/__init__.py +2 -53
- sglang/srt/layers/quantization/awq.py +183 -6
- sglang/srt/layers/quantization/awq_triton.py +29 -0
- sglang/srt/layers/quantization/base_config.py +20 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
- sglang/srt/layers/quantization/fp8.py +84 -18
- sglang/srt/layers/quantization/fp8_kernel.py +55 -10
- sglang/srt/layers/quantization/fp8_utils.py +42 -14
- sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
- sglang/srt/layers/quantization/gptq.py +0 -1
- sglang/srt/layers/quantization/int8_kernel.py +18 -2
- sglang/srt/layers/quantization/marlin_utils.py +12 -0
- sglang/srt/layers/quantization/modelopt_quant.py +125 -100
- sglang/srt/layers/quantization/mxfp4.py +5 -30
- sglang/srt/layers/quantization/petit.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
- sglang/srt/layers/quantization/unquant.py +1 -4
- sglang/srt/layers/quantization/utils.py +0 -1
- sglang/srt/layers/quantization/w4afp8.py +51 -20
- sglang/srt/layers/quantization/w8a8_int8.py +30 -24
- sglang/srt/layers/radix_attention.py +59 -9
- sglang/srt/layers/rotary_embedding.py +673 -16
- sglang/srt/layers/sampler.py +36 -16
- sglang/srt/layers/sparse_pooler.py +98 -0
- sglang/srt/layers/utils.py +0 -1
- sglang/srt/layers/vocab_parallel_embedding.py +4 -1
- sglang/srt/lora/backend/triton_backend.py +0 -1
- sglang/srt/lora/eviction_policy.py +139 -0
- sglang/srt/lora/lora_manager.py +24 -9
- sglang/srt/lora/lora_registry.py +1 -1
- sglang/srt/lora/mem_pool.py +40 -16
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
- sglang/srt/managers/cache_controller.py +48 -17
- sglang/srt/managers/data_parallel_controller.py +146 -42
- sglang/srt/managers/detokenizer_manager.py +40 -13
- sglang/srt/managers/io_struct.py +66 -16
- sglang/srt/managers/mm_utils.py +20 -18
- sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
- sglang/srt/managers/overlap_utils.py +96 -19
- sglang/srt/managers/schedule_batch.py +241 -511
- sglang/srt/managers/schedule_policy.py +15 -2
- sglang/srt/managers/scheduler.py +399 -499
- sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
- sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
- sglang/srt/managers/scheduler_pp_mixin.py +341 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
- sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
- sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
- sglang/srt/managers/tokenizer_manager.py +378 -90
- sglang/srt/managers/tp_worker.py +212 -161
- sglang/srt/managers/utils.py +78 -2
- sglang/srt/mem_cache/allocator.py +7 -2
- sglang/srt/mem_cache/allocator_ascend.py +2 -2
- sglang/srt/mem_cache/base_prefix_cache.py +2 -2
- sglang/srt/mem_cache/chunk_cache.py +13 -2
- sglang/srt/mem_cache/common.py +480 -0
- sglang/srt/mem_cache/evict_policy.py +16 -1
- sglang/srt/mem_cache/hicache_storage.py +4 -1
- sglang/srt/mem_cache/hiradix_cache.py +16 -3
- sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
- sglang/srt/mem_cache/memory_pool.py +435 -219
- sglang/srt/mem_cache/memory_pool_host.py +0 -1
- sglang/srt/mem_cache/multimodal_cache.py +0 -1
- sglang/srt/mem_cache/radix_cache.py +53 -19
- sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
- sglang/srt/mem_cache/storage/backend_factory.py +2 -2
- sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
- sglang/srt/mem_cache/swa_radix_cache.py +92 -26
- sglang/srt/metrics/collector.py +31 -0
- sglang/srt/metrics/func_timer.py +1 -1
- sglang/srt/model_executor/cuda_graph_runner.py +43 -5
- sglang/srt/model_executor/forward_batch_info.py +28 -23
- sglang/srt/model_executor/model_runner.py +379 -139
- sglang/srt/model_executor/npu_graph_runner.py +2 -3
- sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +424 -27
- sglang/srt/model_loader/utils.py +0 -1
- sglang/srt/model_loader/weight_utils.py +47 -28
- sglang/srt/models/apertus.py +2 -3
- sglang/srt/models/arcee.py +2 -2
- sglang/srt/models/bailing_moe.py +13 -52
- sglang/srt/models/bailing_moe_nextn.py +3 -4
- sglang/srt/models/bert.py +1 -1
- sglang/srt/models/deepseek_nextn.py +19 -3
- sglang/srt/models/deepseek_ocr.py +1516 -0
- sglang/srt/models/deepseek_v2.py +273 -98
- sglang/srt/models/dots_ocr.py +0 -2
- sglang/srt/models/dots_vlm.py +0 -1
- sglang/srt/models/dots_vlm_vit.py +1 -1
- sglang/srt/models/falcon_h1.py +13 -19
- sglang/srt/models/gemma3_mm.py +16 -0
- sglang/srt/models/gemma3n_mm.py +1 -2
- sglang/srt/models/glm4_moe.py +14 -37
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +2 -1
- sglang/srt/models/glm4v_moe.py +5 -5
- sglang/srt/models/gpt_oss.py +5 -5
- sglang/srt/models/grok.py +10 -23
- sglang/srt/models/hunyuan.py +2 -7
- sglang/srt/models/interns1.py +0 -1
- sglang/srt/models/kimi_vl.py +1 -7
- sglang/srt/models/kimi_vl_moonvit.py +3 -1
- sglang/srt/models/llama.py +2 -2
- sglang/srt/models/llama_eagle3.py +1 -1
- sglang/srt/models/longcat_flash.py +5 -22
- sglang/srt/models/longcat_flash_nextn.py +3 -14
- sglang/srt/models/mimo.py +2 -13
- sglang/srt/models/mimo_mtp.py +1 -2
- sglang/srt/models/minicpmo.py +7 -5
- sglang/srt/models/mixtral.py +1 -4
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/mllama4.py +13 -3
- sglang/srt/models/nemotron_h.py +511 -0
- sglang/srt/models/olmo2.py +31 -4
- sglang/srt/models/opt.py +5 -5
- sglang/srt/models/phi.py +1 -1
- sglang/srt/models/phi4mm.py +1 -1
- sglang/srt/models/phimoe.py +0 -1
- sglang/srt/models/pixtral.py +0 -3
- sglang/srt/models/points_v15_chat.py +186 -0
- sglang/srt/models/qwen.py +0 -1
- sglang/srt/models/qwen2_5_vl.py +3 -3
- sglang/srt/models/qwen2_audio.py +2 -15
- sglang/srt/models/qwen2_moe.py +15 -12
- sglang/srt/models/qwen2_vl.py +5 -2
- sglang/srt/models/qwen3_moe.py +19 -35
- sglang/srt/models/qwen3_next.py +7 -12
- sglang/srt/models/qwen3_next_mtp.py +3 -4
- sglang/srt/models/qwen3_omni_moe.py +661 -0
- sglang/srt/models/qwen3_vl.py +37 -33
- sglang/srt/models/qwen3_vl_moe.py +57 -185
- sglang/srt/models/roberta.py +55 -3
- sglang/srt/models/sarashina2_vision.py +0 -1
- sglang/srt/models/step3_vl.py +3 -5
- sglang/srt/models/utils.py +11 -1
- sglang/srt/multimodal/processors/base_processor.py +6 -2
- sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
- sglang/srt/multimodal/processors/dots_vlm.py +0 -1
- sglang/srt/multimodal/processors/glm4v.py +1 -5
- sglang/srt/multimodal/processors/internvl.py +0 -2
- sglang/srt/multimodal/processors/janus_pro.py +0 -1
- sglang/srt/multimodal/processors/mllama4.py +0 -8
- sglang/srt/multimodal/processors/phi4mm.py +0 -1
- sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
- sglang/srt/multimodal/processors/qwen_vl.py +75 -16
- sglang/srt/multimodal/processors/step3_vl.py +1 -1
- sglang/srt/parser/conversation.py +41 -0
- sglang/srt/parser/reasoning_parser.py +0 -1
- sglang/srt/sampling/custom_logit_processor.py +77 -2
- sglang/srt/sampling/sampling_batch_info.py +17 -22
- sglang/srt/sampling/sampling_params.py +70 -2
- sglang/srt/server_args.py +577 -73
- sglang/srt/server_args_config_parser.py +1 -1
- sglang/srt/single_batch_overlap.py +38 -28
- sglang/srt/speculative/base_spec_worker.py +34 -0
- sglang/srt/speculative/draft_utils.py +226 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
- sglang/srt/speculative/eagle_info.py +57 -18
- sglang/srt/speculative/eagle_info_v2.py +458 -0
- sglang/srt/speculative/eagle_utils.py +138 -0
- sglang/srt/speculative/eagle_worker.py +83 -280
- sglang/srt/speculative/eagle_worker_v2.py +702 -0
- sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
- sglang/srt/speculative/ngram_worker.py +12 -11
- sglang/srt/speculative/spec_info.py +2 -0
- sglang/srt/speculative/spec_utils.py +38 -3
- sglang/srt/speculative/standalone_worker.py +4 -14
- sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
- sglang/srt/two_batch_overlap.py +28 -14
- sglang/srt/utils/__init__.py +1 -1
- sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
- sglang/srt/utils/common.py +192 -47
- sglang/srt/utils/hf_transformers_utils.py +40 -17
- sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
- sglang/srt/{offloader.py → utils/offloader.py} +4 -4
- sglang/srt/utils/profile_merger.py +199 -0
- sglang/test/attention/test_flashattn_backend.py +1 -1
- sglang/test/attention/test_flashattn_mla_backend.py +0 -1
- sglang/test/attention/test_prefix_chunk_info.py +0 -2
- sglang/test/attention/test_trtllm_mla_backend.py +221 -53
- sglang/test/few_shot_gsm8k_engine.py +2 -4
- sglang/test/kit_matched_stop.py +157 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +41 -0
- sglang/test/runners.py +2 -0
- sglang/test/send_one.py +42 -7
- sglang/test/simple_eval_common.py +3 -0
- sglang/test/simple_eval_gpqa.py +0 -1
- sglang/test/simple_eval_humaneval.py +0 -3
- sglang/test/simple_eval_longbench_v2.py +344 -0
- sglang/test/test_block_fp8.py +1 -2
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
- sglang/test/test_cutlass_moe.py +1 -2
- sglang/test/test_cutlass_w4a8_moe.py +10 -20
- sglang/test/test_deterministic.py +232 -99
- sglang/test/test_deterministic_utils.py +73 -0
- sglang/test/test_disaggregation_utils.py +81 -0
- sglang/test/test_marlin_moe.py +0 -1
- sglang/test/test_utils.py +85 -20
- sglang/version.py +1 -1
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
- sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
- sglang/srt/speculative/build_eagle_tree.py +0 -427
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
- /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
- /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py
CHANGED
|
@@ -51,6 +51,7 @@ import logging
|
|
|
51
51
|
import multiprocessing
|
|
52
52
|
import os
|
|
53
53
|
import time
|
|
54
|
+
from types import SimpleNamespace
|
|
54
55
|
from typing import Tuple
|
|
55
56
|
|
|
56
57
|
import numpy as np
|
|
@@ -71,7 +72,10 @@ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
|
|
|
71
72
|
from sglang.srt.utils import (
|
|
72
73
|
configure_logger,
|
|
73
74
|
get_bool_env_var,
|
|
75
|
+
is_cuda_alike,
|
|
76
|
+
is_xpu,
|
|
74
77
|
kill_process_tree,
|
|
78
|
+
maybe_reindex_device_id,
|
|
75
79
|
require_mlp_sync,
|
|
76
80
|
require_mlp_tp_gather,
|
|
77
81
|
set_gpu_proc_affinity,
|
|
@@ -79,6 +83,15 @@ from sglang.srt.utils import (
|
|
|
79
83
|
)
|
|
80
84
|
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
|
|
81
85
|
|
|
86
|
+
profile_activities = [torch.profiler.ProfilerActivity.CPU] + [
|
|
87
|
+
profiler_activity
|
|
88
|
+
for available, profiler_activity in [
|
|
89
|
+
(is_cuda_alike(), torch.profiler.ProfilerActivity.CUDA),
|
|
90
|
+
(is_xpu(), torch.profiler.ProfilerActivity.XPU),
|
|
91
|
+
]
|
|
92
|
+
if available
|
|
93
|
+
]
|
|
94
|
+
|
|
82
95
|
|
|
83
96
|
@dataclasses.dataclass
|
|
84
97
|
class BenchArgs:
|
|
@@ -147,7 +160,7 @@ class BenchArgs:
|
|
|
147
160
|
)
|
|
148
161
|
|
|
149
162
|
|
|
150
|
-
def load_model(server_args, port_args, tp_rank):
|
|
163
|
+
def load_model(server_args, port_args, gpu_id, tp_rank):
|
|
151
164
|
suppress_other_loggers()
|
|
152
165
|
rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
|
|
153
166
|
moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
|
|
@@ -156,7 +169,7 @@ def load_model(server_args, port_args, tp_rank):
|
|
|
156
169
|
model_runner = ModelRunner(
|
|
157
170
|
model_config=model_config,
|
|
158
171
|
mem_fraction_static=server_args.mem_fraction_static,
|
|
159
|
-
gpu_id=
|
|
172
|
+
gpu_id=gpu_id,
|
|
160
173
|
tp_rank=tp_rank,
|
|
161
174
|
tp_size=server_args.tp_size,
|
|
162
175
|
moe_ep_rank=moe_ep_rank,
|
|
@@ -204,7 +217,6 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer, custom_prompts):
|
|
|
204
217
|
origin_input_ids=tmp_input_ids,
|
|
205
218
|
sampling_params=sampling_params,
|
|
206
219
|
)
|
|
207
|
-
req.prefix_indices = []
|
|
208
220
|
req.fill_ids = req.origin_input_ids
|
|
209
221
|
req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
|
|
210
222
|
req.logprob_start_len = len(req.origin_input_ids) - 1
|
|
@@ -248,7 +260,6 @@ def prepare_synthetic_inputs_for_latency_test(
|
|
|
248
260
|
origin_input_ids=list(input_ids[i]),
|
|
249
261
|
sampling_params=sampling_params,
|
|
250
262
|
)
|
|
251
|
-
req.prefix_indices = []
|
|
252
263
|
req.fill_ids = req.origin_input_ids
|
|
253
264
|
req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
|
|
254
265
|
req.logprob_start_len = len(req.origin_input_ids) - 1
|
|
@@ -259,11 +270,18 @@ def prepare_synthetic_inputs_for_latency_test(
|
|
|
259
270
|
|
|
260
271
|
@torch.no_grad
|
|
261
272
|
def extend(reqs, model_runner):
|
|
273
|
+
# Create dummy tree_cache for benchmarks (no prefix caching, just allocation)
|
|
274
|
+
dummy_tree_cache = SimpleNamespace(
|
|
275
|
+
page_size=model_runner.server_args.page_size,
|
|
276
|
+
device=model_runner.device,
|
|
277
|
+
token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
|
|
278
|
+
)
|
|
279
|
+
|
|
262
280
|
batch = ScheduleBatch.init_new(
|
|
263
281
|
reqs=reqs,
|
|
264
282
|
req_to_token_pool=model_runner.req_to_token_pool,
|
|
265
283
|
token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
|
|
266
|
-
tree_cache=
|
|
284
|
+
tree_cache=dummy_tree_cache,
|
|
267
285
|
model_config=model_runner.model_config,
|
|
268
286
|
enable_overlap=False,
|
|
269
287
|
spec_algorithm=SpeculativeAlgorithm.NONE,
|
|
@@ -302,6 +320,7 @@ def _maybe_prepare_mlp_sync_batch(batch: ScheduleBatch, model_runner):
|
|
|
302
320
|
speculative_num_draft_tokens=None,
|
|
303
321
|
require_mlp_tp_gather=require_mlp_tp_gather(model_runner.server_args),
|
|
304
322
|
disable_overlap_schedule=model_runner.server_args.disable_overlap_schedule,
|
|
323
|
+
offload_tags=set(),
|
|
305
324
|
)
|
|
306
325
|
|
|
307
326
|
|
|
@@ -333,6 +352,7 @@ def correctness_test(
|
|
|
333
352
|
server_args,
|
|
334
353
|
port_args,
|
|
335
354
|
bench_args,
|
|
355
|
+
gpu_id,
|
|
336
356
|
tp_rank,
|
|
337
357
|
):
|
|
338
358
|
# Configure the logger
|
|
@@ -340,7 +360,7 @@ def correctness_test(
|
|
|
340
360
|
rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
|
|
341
361
|
|
|
342
362
|
# Load the model
|
|
343
|
-
model_runner, tokenizer = load_model(server_args, port_args, tp_rank)
|
|
363
|
+
model_runner, tokenizer = load_model(server_args, port_args, gpu_id, tp_rank)
|
|
344
364
|
|
|
345
365
|
# Prepare inputs
|
|
346
366
|
custom_prompts = _read_prompts_from_file(bench_args.prompt_filename, rank_print)
|
|
@@ -418,10 +438,7 @@ def latency_test_run_once(
|
|
|
418
438
|
profiler = None
|
|
419
439
|
if profile:
|
|
420
440
|
profiler = torch.profiler.profile(
|
|
421
|
-
activities=
|
|
422
|
-
torch.profiler.ProfilerActivity.CPU,
|
|
423
|
-
torch.profiler.ProfilerActivity.CUDA,
|
|
424
|
-
],
|
|
441
|
+
activities=profile_activities,
|
|
425
442
|
with_stack=True,
|
|
426
443
|
record_shapes=profile_record_shapes,
|
|
427
444
|
)
|
|
@@ -454,10 +471,7 @@ def latency_test_run_once(
|
|
|
454
471
|
if profile and i == output_len / 2:
|
|
455
472
|
profiler = None
|
|
456
473
|
profiler = torch.profiler.profile(
|
|
457
|
-
activities=
|
|
458
|
-
torch.profiler.ProfilerActivity.CPU,
|
|
459
|
-
torch.profiler.ProfilerActivity.CUDA,
|
|
460
|
-
],
|
|
474
|
+
activities=profile_activities,
|
|
461
475
|
with_stack=True,
|
|
462
476
|
record_shapes=profile_record_shapes,
|
|
463
477
|
)
|
|
@@ -506,20 +520,23 @@ def latency_test(
|
|
|
506
520
|
server_args,
|
|
507
521
|
port_args,
|
|
508
522
|
bench_args,
|
|
523
|
+
gpu_id,
|
|
509
524
|
tp_rank,
|
|
510
525
|
):
|
|
511
526
|
initialize_moe_config(server_args)
|
|
512
527
|
|
|
513
528
|
# Set CPU affinity
|
|
514
529
|
if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
|
|
515
|
-
set_gpu_proc_affinity(
|
|
530
|
+
set_gpu_proc_affinity(
|
|
531
|
+
server_args.pp_size, server_args.tp_size, server_args.nnodes, tp_rank
|
|
532
|
+
)
|
|
516
533
|
|
|
517
534
|
# Configure the logger
|
|
518
535
|
configure_logger(server_args, prefix=f" TP{tp_rank}")
|
|
519
536
|
rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
|
|
520
537
|
|
|
521
538
|
# Load the model
|
|
522
|
-
model_runner, tokenizer = load_model(server_args, port_args, tp_rank)
|
|
539
|
+
model_runner, tokenizer = load_model(server_args, port_args, gpu_id, tp_rank)
|
|
523
540
|
|
|
524
541
|
# Prepare inputs for warm up
|
|
525
542
|
reqs = prepare_synthetic_inputs_for_latency_test(
|
|
@@ -621,21 +638,23 @@ def main(server_args, bench_args):
|
|
|
621
638
|
port_args = PortArgs.init_new(server_args)
|
|
622
639
|
|
|
623
640
|
if server_args.tp_size == 1:
|
|
624
|
-
work_func(server_args, port_args, bench_args, 0)
|
|
641
|
+
work_func(server_args, port_args, bench_args, 0, 0)
|
|
625
642
|
else:
|
|
626
643
|
workers = []
|
|
627
644
|
for tp_rank in range(server_args.tp_size):
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
645
|
+
with maybe_reindex_device_id(tp_rank) as gpu_id:
|
|
646
|
+
proc = multiprocessing.Process(
|
|
647
|
+
target=work_func,
|
|
648
|
+
args=(
|
|
649
|
+
server_args,
|
|
650
|
+
port_args,
|
|
651
|
+
bench_args,
|
|
652
|
+
gpu_id,
|
|
653
|
+
tp_rank,
|
|
654
|
+
),
|
|
655
|
+
)
|
|
656
|
+
proc.start()
|
|
657
|
+
workers.append(proc)
|
|
639
658
|
|
|
640
659
|
for proc in workers:
|
|
641
660
|
proc.join()
|
sglang/bench_one_batch_server.py
CHANGED
|
@@ -16,6 +16,7 @@ import argparse
|
|
|
16
16
|
import dataclasses
|
|
17
17
|
import itertools
|
|
18
18
|
import json
|
|
19
|
+
import logging
|
|
19
20
|
import multiprocessing
|
|
20
21
|
import os
|
|
21
22
|
import random
|
|
@@ -25,8 +26,10 @@ from typing import List, Optional, Tuple
|
|
|
25
26
|
import numpy as np
|
|
26
27
|
import requests
|
|
27
28
|
from pydantic import BaseModel
|
|
29
|
+
from transformers import AutoProcessor, PreTrainedTokenizer
|
|
28
30
|
|
|
29
31
|
from sglang.bench_serving import (
|
|
32
|
+
get_processor,
|
|
30
33
|
get_tokenizer,
|
|
31
34
|
sample_mmmu_requests,
|
|
32
35
|
sample_random_requests,
|
|
@@ -37,6 +40,8 @@ from sglang.srt.server_args import ServerArgs
|
|
|
37
40
|
from sglang.srt.utils import is_blackwell, kill_process_tree
|
|
38
41
|
from sglang.test.test_utils import is_in_ci, write_github_step_summary
|
|
39
42
|
|
|
43
|
+
logger = logging.getLogger(__name__)
|
|
44
|
+
|
|
40
45
|
|
|
41
46
|
class ProfileLinks(BaseModel):
|
|
42
47
|
"""Pydantic model for profile trace links."""
|
|
@@ -104,8 +109,14 @@ Note: To view the traces through perfetto-ui, please:
|
|
|
104
109
|
if self.profile_links.extend or self.profile_links.decode:
|
|
105
110
|
# Create a combined link or use the first available one
|
|
106
111
|
trace_files = [self.profile_links.extend, self.profile_links.decode]
|
|
112
|
+
if any(trace_file is None for trace_file in trace_files):
|
|
113
|
+
logger.error("Some trace files are None", f"{trace_files=}")
|
|
107
114
|
trace_files_relay_links = [
|
|
108
|
-
|
|
115
|
+
(
|
|
116
|
+
f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
|
|
117
|
+
if trace_file
|
|
118
|
+
else "N/A"
|
|
119
|
+
)
|
|
109
120
|
for trace_file in trace_files
|
|
110
121
|
]
|
|
111
122
|
|
|
@@ -114,30 +125,29 @@ Note: To view the traces through perfetto-ui, please:
|
|
|
114
125
|
# Build the row
|
|
115
126
|
return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n"
|
|
116
127
|
|
|
117
|
-
@classmethod
|
|
118
|
-
def generate_markdown_report(
|
|
119
|
-
cls, trace_dir, results: List["BenchmarkResult"]
|
|
120
|
-
) -> str:
|
|
121
|
-
"""Generate a markdown report from a list of BenchmarkResult object from a single run."""
|
|
122
|
-
import os
|
|
123
128
|
|
|
124
|
-
|
|
129
|
+
def generate_markdown_report(trace_dir, results: List["BenchmarkResult"]) -> str:
|
|
130
|
+
"""Generate a markdown report from a list of BenchmarkResult object from a single run."""
|
|
131
|
+
import os
|
|
125
132
|
|
|
126
|
-
|
|
127
|
-
# f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
|
|
128
|
-
# )
|
|
129
|
-
summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
|
|
130
|
-
summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
|
|
133
|
+
summary = f"### {results[0].model_path}\n"
|
|
131
134
|
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
# base_url = "https://github.com/sgl-project/ci-data/traces"
|
|
138
|
-
summary += result.to_markdown_row(trace_dir, base_url, relay_base)
|
|
135
|
+
# summary += (
|
|
136
|
+
# f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
|
|
137
|
+
# )
|
|
138
|
+
summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
|
|
139
|
+
summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
|
|
139
140
|
|
|
140
|
-
|
|
141
|
+
# all results should share the same isl & osl
|
|
142
|
+
for result in results:
|
|
143
|
+
base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/")
|
|
144
|
+
relay_base = os.getenv(
|
|
145
|
+
"PERFETTO_RELAY_URL",
|
|
146
|
+
"",
|
|
147
|
+
).rstrip("/")
|
|
148
|
+
summary += result.to_markdown_row(trace_dir, base_url, relay_base)
|
|
149
|
+
|
|
150
|
+
return summary
|
|
141
151
|
|
|
142
152
|
|
|
143
153
|
@dataclasses.dataclass
|
|
@@ -288,7 +298,7 @@ def run_one_case(
|
|
|
288
298
|
input_len_step_percentage: float,
|
|
289
299
|
run_name: str,
|
|
290
300
|
result_filename: str,
|
|
291
|
-
tokenizer,
|
|
301
|
+
tokenizer: PreTrainedTokenizer | AutoProcessor,
|
|
292
302
|
dataset_name="",
|
|
293
303
|
profile: bool = False,
|
|
294
304
|
profile_steps: int = 3,
|
|
@@ -302,9 +312,8 @@ def run_one_case(
|
|
|
302
312
|
if dataset_name == "mmmu":
|
|
303
313
|
input_requests = sample_mmmu_requests(
|
|
304
314
|
num_requests=batch_size,
|
|
305
|
-
|
|
315
|
+
processor=tokenizer,
|
|
306
316
|
fixed_output_len=output_len,
|
|
307
|
-
apply_chat_template=True,
|
|
308
317
|
random_sample=False,
|
|
309
318
|
)
|
|
310
319
|
elif dataset_name == "random":
|
|
@@ -364,6 +373,8 @@ def run_one_case(
|
|
|
364
373
|
if dataset_name == "mmmu":
|
|
365
374
|
# vlm
|
|
366
375
|
input_ids = []
|
|
376
|
+
# for vlms, tokenizer is an instance of AutoProcessor
|
|
377
|
+
tokenizer = tokenizer.tokenizer
|
|
367
378
|
for input_req in input_requests:
|
|
368
379
|
input_ids += [tokenizer.encode(input_req.prompt)]
|
|
369
380
|
payload["image_data"] = [req.image_data for req in input_requests]
|
|
@@ -609,7 +620,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
|
|
|
609
620
|
tokenizer_path = server_info["tokenizer_path"]
|
|
610
621
|
elif "prefill" in server_info:
|
|
611
622
|
tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
|
|
612
|
-
|
|
623
|
+
|
|
624
|
+
if bench_args.dataset_name == "mmmu":
|
|
625
|
+
# mmmu implies this is a MLLM
|
|
626
|
+
tokenizer = get_processor(tokenizer_path)
|
|
627
|
+
else:
|
|
628
|
+
tokenizer = get_tokenizer(tokenizer_path)
|
|
613
629
|
|
|
614
630
|
# warmup
|
|
615
631
|
if not bench_args.skip_warmup:
|