sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -11
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +474 -142
- sglang/compile_deep_gemm.py +3 -0
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +10 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +314 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +228 -92
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/qwen3_next.py +294 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +78 -37
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +373 -68
- sglang/srt/disaggregation/prefill.py +53 -49
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +842 -0
- sglang/srt/entrypoints/grpc_server.py +950 -0
- sglang/srt/entrypoints/http_server.py +179 -60
- sglang/srt/entrypoints/openai/protocol.py +265 -29
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +213 -122
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +289 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +17 -8
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +215 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +40 -8
- sglang/srt/layers/attention/flashinfer_backend.py +341 -204
- sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
- sglang/srt/layers/attention/mamba/mamba.py +577 -0
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +180 -18
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
- sglang/srt/layers/moe/ep_moe/layer.py +248 -333
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +83 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +29 -7
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +155 -60
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +191 -56
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +28 -33
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +44 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +55 -0
- sglang/srt/managers/schedule_batch.py +343 -212
- sglang/srt/managers/schedule_policy.py +145 -18
- sglang/srt/managers/scheduler.py +653 -273
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +579 -674
- sglang/srt/managers/tp_worker.py +96 -26
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +21 -22
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +9 -2
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +651 -80
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +227 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +93 -48
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +74 -46
- sglang/srt/model_executor/model_runner.py +455 -176
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +10 -4
- sglang/srt/model_loader/loader.py +319 -10
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +161 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +578 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +17 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/grok.py +5 -13
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mixtral.py +1 -3
- sglang/srt/models/mllama4.py +50 -4
- sglang/srt/models/nemotron_h.py +514 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +55 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +49 -26
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +1051 -285
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +98 -29
- sglang/srt/speculative/ngram_info.py +428 -0
- sglang/srt/speculative/ngram_worker.py +246 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +605 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +9 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +451 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +119 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_longbench_v2.py +332 -0
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +9 -19
- sglang/test/test_deterministic.py +313 -0
- sglang/test/test_deterministic_utils.py +81 -0
- sglang/test/test_disaggregation_utils.py +140 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +407 -8
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/test/test_utils.py
CHANGED
@@ -9,15 +9,17 @@ import os
|
|
9
9
|
import random
|
10
10
|
import re
|
11
11
|
import subprocess
|
12
|
+
import sys
|
12
13
|
import threading
|
13
14
|
import time
|
14
15
|
import unittest
|
15
16
|
from concurrent.futures import ThreadPoolExecutor
|
16
17
|
from dataclasses import dataclass
|
18
|
+
from datetime import datetime
|
17
19
|
from functools import partial
|
18
20
|
from pathlib import Path
|
19
21
|
from types import SimpleNamespace
|
20
|
-
from typing import Awaitable, Callable, List, Optional, Tuple
|
22
|
+
from typing import Any, Awaitable, Callable, List, Optional, Tuple
|
21
23
|
|
22
24
|
import aiohttp
|
23
25
|
import numpy as np
|
@@ -41,8 +43,10 @@ from sglang.utils import get_exception_traceback
|
|
41
43
|
DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
|
42
44
|
DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
|
43
45
|
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
|
46
|
+
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE = "Qwen/Qwen3-Reranker-0.6B"
|
44
47
|
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
45
|
-
|
48
|
+
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B"
|
49
|
+
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
|
46
50
|
|
47
51
|
# MLA test models
|
48
52
|
DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
|
@@ -52,6 +56,9 @@ DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instru
|
|
52
56
|
DEFAULT_MODEL_NAME_FOR_TEST_MLA = "lmsys/sglang-ci-dsv3-test"
|
53
57
|
DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN = "lmsys/sglang-ci-dsv3-test-NextN"
|
54
58
|
|
59
|
+
# NVFP4 models
|
60
|
+
DEFAULT_DEEPSEEK_NVFP4_MODEL_FOR_TEST = "nvidia/DeepSeek-R1-0528-FP4"
|
61
|
+
|
55
62
|
# FP8 models
|
56
63
|
DEFAULT_MODEL_NAME_FOR_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
|
57
64
|
DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
|
@@ -71,7 +78,13 @@ DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
|
|
71
78
|
# EAGLE
|
72
79
|
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
|
73
80
|
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
|
74
|
-
|
81
|
+
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST_EAGLE3 = "meta-llama/Llama-3.1-8B-Instruct"
|
82
|
+
DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B"
|
83
|
+
DEFAULT_STANDALONE_SPECULATIVE_TARGET_MODEL_FOR_TEST = (
|
84
|
+
"meta-llama/Llama-3.1-8B-Instruct"
|
85
|
+
)
|
86
|
+
DEFAULT_STANDALONE_SPECULATIVE_DRAFT_MODEL_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
|
87
|
+
DEFAULT_NGRAM_SPECULATIVE_TARGET_MODEL_FOR_TEST = "Qwen/Qwen2.5-Coder-7B-Instruct"
|
75
88
|
|
76
89
|
# Other use cases
|
77
90
|
DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
|
@@ -466,6 +479,25 @@ def try_cached_model(model_repo: str):
|
|
466
479
|
return model_dir if model_dir else model_repo
|
467
480
|
|
468
481
|
|
482
|
+
def popen_with_error_check(command: list[str], allow_exit: bool = False):
    """Start ``command`` and supervise it from a background thread.

    The child's stdout and stderr are captured through pipes. A watcher
    thread drains both pipes until the child terminates and then decides
    whether that termination is an error.

    Args:
        command: Program and arguments, handed to ``subprocess.Popen`` as is.
        allow_exit: When False (the default) any termination of the child is
            treated as an error. When True only a non-zero exit code is.

    Returns:
        The ``subprocess.Popen`` handle. It is returned immediately; this
        function does not wait for the child.

    Note:
        The error is raised inside the watcher thread, not in the caller, so
        it surfaces through ``threading.excepthook`` rather than as an
        exception the caller can catch. The watcher is a non-daemon thread
        and therefore lives until the child has exited.
    """
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    def _run_and_check():
        # communicate() drains both pipes and only returns after the child
        # has exited, so returncode is already set here and no further
        # polling is required.
        stdout, stderr = process.communicate()

        if not allow_exit or process.returncode != 0:
            raise Exception(
                f"{command} exited with code {process.returncode}\n{stdout=}\n{stderr=}"
            )

    t = threading.Thread(target=_run_and_check)
    t.start()
    return process
|
499
|
+
|
500
|
+
|
469
501
|
def popen_launch_server(
|
470
502
|
model: str,
|
471
503
|
base_url: str,
|
@@ -476,6 +508,7 @@ def popen_launch_server(
|
|
476
508
|
return_stdout_stderr: Optional[tuple] = None,
|
477
509
|
device: str = "auto",
|
478
510
|
pd_separated: bool = False,
|
511
|
+
num_replicas: Optional[int] = None,
|
479
512
|
):
|
480
513
|
"""Launch a server process with automatic device detection.
|
481
514
|
|
@@ -493,7 +526,8 @@ def popen_launch_server(
|
|
493
526
|
_, host, port = base_url.split(":")
|
494
527
|
host = host[2:]
|
495
528
|
|
496
|
-
|
529
|
+
use_mixed_pd_engine = not pd_separated and num_replicas is not None
|
530
|
+
if pd_separated or use_mixed_pd_engine:
|
497
531
|
command = "sglang.launch_pd_server"
|
498
532
|
else:
|
499
533
|
command = "sglang.launch_server"
|
@@ -507,7 +541,7 @@ def popen_launch_server(
|
|
507
541
|
*[str(x) for x in other_args],
|
508
542
|
]
|
509
543
|
|
510
|
-
if pd_separated:
|
544
|
+
if pd_separated or use_mixed_pd_engine:
|
511
545
|
command.extend(
|
512
546
|
[
|
513
547
|
"--lb-host",
|
@@ -526,6 +560,15 @@ def popen_launch_server(
|
|
526
560
|
]
|
527
561
|
)
|
528
562
|
|
563
|
+
if use_mixed_pd_engine:
|
564
|
+
command.extend(
|
565
|
+
[
|
566
|
+
"--mixed",
|
567
|
+
"--num-replicas",
|
568
|
+
str(num_replicas),
|
569
|
+
]
|
570
|
+
)
|
571
|
+
|
529
572
|
if api_key:
|
530
573
|
command += ["--api-key", api_key]
|
531
574
|
|
@@ -534,11 +577,30 @@ def popen_launch_server(
|
|
534
577
|
if return_stdout_stderr:
|
535
578
|
process = subprocess.Popen(
|
536
579
|
command,
|
537
|
-
stdout=
|
538
|
-
stderr=
|
580
|
+
stdout=subprocess.PIPE,
|
581
|
+
stderr=subprocess.PIPE,
|
539
582
|
env=env,
|
540
583
|
text=True,
|
584
|
+
bufsize=1,
|
541
585
|
)
|
586
|
+
|
587
|
+
def _dump(src, sinks):
|
588
|
+
for line in iter(src.readline, ""):
|
589
|
+
for sink in sinks:
|
590
|
+
sink.write(line)
|
591
|
+
sink.flush()
|
592
|
+
src.close()
|
593
|
+
|
594
|
+
threading.Thread(
|
595
|
+
target=_dump,
|
596
|
+
args=(process.stdout, [return_stdout_stderr[0], sys.stdout]),
|
597
|
+
daemon=True,
|
598
|
+
).start()
|
599
|
+
threading.Thread(
|
600
|
+
target=_dump,
|
601
|
+
args=(process.stderr, [return_stdout_stderr[1], sys.stderr]),
|
602
|
+
daemon=True,
|
603
|
+
).start()
|
542
604
|
else:
|
543
605
|
process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
|
544
606
|
|
@@ -842,6 +904,154 @@ def run_bench_serving(
|
|
842
904
|
return res
|
843
905
|
|
844
906
|
|
907
|
+
def run_score_benchmark(
    model,
    num_requests=100,
    batch_size=5,
    other_server_args=None,
    need_warmup=False,
    device="auto",
):
    """Launch a server and benchmark its ``/v1/score`` endpoint.

    Requests are sent one after another (no concurrency). A request counts as
    successful when it returns HTTP 200 and the JSON body contains a
    ``"scores"`` or ``"logprobs"`` key; any other outcome, including an
    exception raised by the request, is skipped silently.

    Args:
        model: Model path or name. Used to launch the server, to load the
            tokenizer and as the ``"model"`` field of every request.
        num_requests: Number of score requests to send.
        batch_size: Number of items per score request.
        other_server_args: Extra command-line arguments for the server.
        need_warmup: When True, send one best-effort request with three items
            before the timed run.
        device: Device name, or ``"auto"`` to call ``auto_config_device()``.

    Returns:
        A dict with the keys ``completed``, ``total_requests``,
        ``throughput`` (successful requests per second of wall time),
        ``avg_latency_ms``, ``p95_latency_ms`` and ``successful_requests``.
        All metrics are 0 when no request succeeded.
    """
    if other_server_args is None:
        other_server_args = []

    if device == "auto":
        # NOTE(review): the resolved device is not used anywhere below; the
        # server is launched with popen_launch_server's own default. Confirm
        # whether it was meant to be forwarded.
        device = auto_config_device()

    # Launch the server (consistent with run_bench_serving)
    base_url = DEFAULT_URL_FOR_TEST
    process = popen_launch_server(
        model,
        base_url,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        other_args=other_server_args,
    )

    async def _run_benchmark():
        # Load tokenizer for generating test data
        from sglang.srt.utils.hf_transformers_utils import get_tokenizer

        tokenizer = get_tokenizer(model)

        # Score API configuration
        score_query_tokens = 120
        score_item_tokens = 180
        score_label_token_ids = [9454, 2753]  # Yes/No token IDs
        special_token = "<|im_start|>"

        def count_tokens(text):
            return len(tokenizer.encode(text, add_special_tokens=False))

        def generate_text_with_token_count(num_tokens):
            """Build a text of roughly ``num_tokens`` tokens by repetition."""
            text = special_token * num_tokens
            if count_tokens(text) != num_tokens:
                # The filler does not map to exactly one token per copy;
                # rescale the repetition count. max() guards against a
                # tokenizer that yields no tokens for the filler at all.
                text = special_token * (
                    num_tokens // max(1, count_tokens(special_token))
                )
            return text

        # The generated texts are deterministic, so tokenize once instead of
        # once per item of every request.
        query_text = generate_text_with_token_count(score_query_tokens)
        item_text = generate_text_with_token_count(score_item_tokens)

        def build_payload(num_items):
            return {
                "query": query_text,
                "items": [item_text] * num_items,
                "label_token_ids": score_label_token_ids,
                "model": model,
                "apply_softmax": True,
            }

        score_url = f"{base_url}/v1/score"

        if need_warmup:
            async with aiohttp.ClientSession() as session:
                try:
                    async with session.post(
                        score_url,
                        json=build_payload(3),
                        timeout=aiohttp.ClientTimeout(total=30),
                    ):
                        pass
                except Exception:
                    # Warmup is best effort. Catch Exception rather than
                    # everything so that task cancellation and Ctrl-C still
                    # propagate.
                    pass

        test_requests = [build_payload(batch_size) for _ in range(num_requests)]

        latencies = []
        start_time = time.monotonic()

        async with aiohttp.ClientSession() as session:
            for request_data in test_requests:
                try:
                    request_start = time.monotonic()
                    async with session.post(
                        score_url,
                        json=request_data,
                        timeout=aiohttp.ClientTimeout(total=30),
                    ) as response:
                        if response.status != 200:
                            continue
                        response_data = await response.json()
                        request_end = time.monotonic()

                        if "scores" in response_data or "logprobs" in response_data:
                            latencies.append((request_end - request_start) * 1000)
                except Exception:
                    # A failed request is simply not counted.
                    continue

        total_time = time.monotonic() - start_time
        successful_requests = len(latencies)

        if successful_requests == 0:
            return {
                "completed": 0,
                "total_requests": num_requests,
                "throughput": 0,
                "avg_latency_ms": 0,
                "p95_latency_ms": 0,
                "successful_requests": 0,
            }

        # Sum in arrival order before sorting for the percentile lookup.
        total_latency = sum(latencies)
        latencies.sort()

        return {
            "completed": successful_requests,
            "total_requests": num_requests,
            "throughput": successful_requests / total_time,
            "avg_latency_ms": total_latency / successful_requests,
            "p95_latency_ms": latencies[int(len(latencies) * 0.95)],
            "successful_requests": successful_requests,
        }

    try:
        res = asyncio.run(_run_benchmark())
    finally:
        # Always tear the server down, even when the benchmark raised.
        kill_process_tree(process.pid)

    assert res["completed"] == res["successful_requests"]
    return res
|
1053
|
+
|
1054
|
+
|
845
1055
|
def run_bench_serving_multi(
|
846
1056
|
model,
|
847
1057
|
base_url,
|
@@ -949,7 +1159,7 @@ def run_bench_offline_throughput(model, other_args):
|
|
949
1159
|
*[str(x) for x in other_args],
|
950
1160
|
]
|
951
1161
|
|
952
|
-
print(f"{command
|
1162
|
+
print(f"command={' '.join(command)}")
|
953
1163
|
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
954
1164
|
|
955
1165
|
try:
|
@@ -1363,6 +1573,41 @@ async def send_concurrent_generate_requests(
|
|
1363
1573
|
return await asyncio.gather(*tasks)
|
1364
1574
|
|
1365
1575
|
|
1576
|
+
async def send_concurrent_generate_requests_with_custom_params(
    base_url: str,
    custom_params: List[dict[str, Any]],
) -> List[Tuple[int, Any]]:
    """Send one ``/generate`` request per entry of ``custom_params``, all concurrently.

    Every request starts from the same base payload (a short fixed prompt with
    ``temperature=0`` and ``max_new_tokens=50``) and is then updated with the
    matching ``custom_params`` entry. The update is shallow: a top-level key
    such as ``"sampling_params"`` replaces the base value as a whole.

    Args:
        base_url: Server base URL; requests are posted to ``{base_url}/generate``.
        custom_params: Per-request overrides. One request is issued per entry,
            so the number of in-flight requests equals ``len(custom_params)``.

    Returns:
        A list of ``(status_code, response_json)`` tuples, in the same order as
        ``custom_params``.
    """

    base_payload = {
        "text": (
            "\n"
            "System: You are a helpful assistant.\n"
            "User: What is the capital of France?\n"
            "Assistant: The capital of France is\n"
        ),
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 50,
        },
    }

    async def async_generate_with_priority(req):
        # Each request opens (and closes) its own client session.
        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"{base_url}/generate",
                json=req,
            ) as response:
                resp_json = await response.json()
                return (response.status, resp_json)

    tasks = []
    for c in custom_params:
        # Shallow copy is enough: update() only rebinds top-level keys and
        # never mutates the nested base "sampling_params" dict.
        req = base_payload.copy()
        req.update(c)
        tasks.append(asyncio.create_task(async_generate_with_priority(req)))
    return await asyncio.gather(*tasks)
|
1609
|
+
|
1610
|
+
|
1366
1611
|
class CustomTestCase(unittest.TestCase):
|
1367
1612
|
def _callTestMethod(self, method):
|
1368
1613
|
max_retry = int(
|
@@ -1404,3 +1649,157 @@ def dump_bench_raw_result(
|
|
1404
1649
|
def _ensure_remove_suffix(text: str, suffix: str):
|
1405
1650
|
assert text.endswith(suffix)
|
1406
1651
|
return text.removesuffix(suffix)
|
1652
|
+
|
1653
|
+
|
1654
|
+
class ModelLaunchSettings:
    """Launch settings for one model: its path, ``tp_size``, extra CLI args and env.

    The stored ``extra_args`` is a copy of the caller's list with defaults
    appended: ``--tp <tp_size>`` when ``tp_size > 1`` and ``--tp`` is not
    already present, followed by ``--enable-multimodal`` and
    ``--trust-remote-code`` when they are missing.
    """

    def __init__(
        self,
        model_path: str,
        tp_size: int = 1,
        extra_args: Optional[List[str]] = None,
        env: Optional[dict] = None,
    ):
        self.model_path = model_path
        self.tp_size = tp_size
        self.env = env

        # Work on a copy so the caller's list is never modified by the appends below.
        launch_args = [*extra_args] if extra_args else []

        if self.tp_size > 1 and "--tp" not in launch_args:
            launch_args += ["--tp", str(self.tp_size)]

        # Flags that are always passed, added only when the caller did not supply them.
        for required_flag in ("--enable-multimodal", "--trust-remote-code"):
            if required_flag not in launch_args:
                launch_args.append(required_flag)

        self.extra_args = launch_args
|
1674
|
+
|
1675
|
+
|
1676
|
+
class ModelEvalMetrics:
    """Plain holder for the two numbers produced by a model evaluation run."""

    def __init__(self, accuracy: float, eval_time: float):
        # Stored verbatim; no validation or conversion is applied.
        self.accuracy, self.eval_time = accuracy, eval_time
|
1680
|
+
|
1681
|
+
|
1682
|
+
def extract_trace_link_from_bench_one_batch_server_output(output: str) -> Optional[str]:
    """Return the target of the first ``[Profile](...)`` markdown link in ``output``.

    Args:
        output: Text to search.

    Returns:
        The text between the parentheses of the first ``[Profile](...)``
        occurrence (up to the first closing parenthesis), or ``None`` when the
        output contains no such link.
    """
    match = re.search(r"\[Profile\]\((.*?)\)", output)
    return match.group(1) if match else None
|
1688
|
+
|
1689
|
+
|
1690
|
+
def parse_models(model_string: str):
    """Split a comma-separated model list, trimming whitespace and dropping empty entries."""
    trimmed = (piece.strip() for piece in model_string.split(","))
    return [name for name in trimmed if name]
|
1692
|
+
|
1693
|
+
|
1694
|
+
def check_evaluation_test_results(
    results,
    test_name,
    model_accuracy_thresholds,
    model_latency_thresholds=None,
    model_count=None,
):
    """
    Print a markdown summary of evaluation results and fail if any model regressed.

    Args:
        results: list of tuple of (model_path, accuracy, latency)
        test_name: heading used for the GitHub step summary written when
            ``is_in_ci()`` is true.
        model_accuracy_thresholds: dict mapping model -> minimum accepted
            accuracy. Every model listed here is expected in ``results``; a
            missing one is reported as failed.
        model_latency_thresholds: optional dict mapping model -> maximum
            accepted latency. When omitted, the latency columns are left out of
            the table. A model that has no entry (or a ``None`` entry) in this
            dict is treated as having no latency limit.
        model_count: not used by this function; kept so existing callers
            that pass it keep working.

    Raises:
        AssertionError: if any model is below its accuracy threshold, above its
            latency threshold, or missing from ``results``. The message lists
            every failure.
    """
    # Stand-in limit used whenever no latency threshold applies to a model.
    no_latency_limit = 1e9
    has_latency_thresholds = model_latency_thresholds is not None

    failed_models = []
    if has_latency_thresholds:
        summary = " | model | status | score | score_threshold | latency | latency_threshold | \n"
        summary += "| ----- | ------ | ----- | --------------- | ------- | ----------------- | \n"
    else:
        summary = " | model | status | score | score_threshold | \n"
        summary += "| ----- | ------ | ----- | --------------- | \n"

    results_dict = {res[0]: (res[1], res[2]) for res in results}

    for model, accuracy_threshold in sorted(model_accuracy_thresholds.items()):
        latency_threshold = no_latency_limit
        if has_latency_thresholds:
            configured_threshold = model_latency_thresholds.get(model)
            # Without this guard a model lacking a latency entry would yield
            # None here and crash the `latency <= latency_threshold` comparison.
            if configured_threshold is not None:
                latency_threshold = configured_threshold

        if model in results_dict:
            accuracy, latency = results_dict[model]
            is_success = accuracy >= accuracy_threshold and latency <= latency_threshold
            status_emoji = "✅" if is_success else "❌"

            if not is_success:
                # Both checks are reported independently so one run shows every problem.
                if accuracy < accuracy_threshold:
                    failed_models.append(
                        f"\nScore Check Failed: {model}\n"
                        f"Model {model} score ({accuracy:.4f}) is below threshold ({accuracy_threshold:.4f})"
                    )
                if latency > latency_threshold:
                    failed_models.append(
                        f"\nLatency Check Failed: {model}\n"
                        f"Model {model} latency ({latency:.4f}) is above threshold ({latency_threshold:.4f})"
                    )

            if has_latency_thresholds:
                line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold} | {latency} | {latency_threshold}\n"
            else:
                line = (
                    f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold}\n"
                )
        else:
            # A threshold exists but no result was reported for this model.
            status_emoji = "❌"
            failed_models.append(f"Model failed to launch or be evaluated: {model}")
            if has_latency_thresholds:
                line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold} | N/A | {latency_threshold}\n"
            else:
                line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold}\n"

        summary += line

    print(summary)

    if is_in_ci():
        write_github_step_summary(f"## {test_name}\n{summary}")

    if failed_models:
        print("Some models failed the evaluation.")
        raise AssertionError("\n".join(failed_models))
|
1762
|
+
|
1763
|
+
|
1764
|
+
# Bench knobs for bench_one_batch_server (override by env)
|
1765
|
+
def _parse_int_list_env(name: str, default_val: str):
|
1766
|
+
val = os.environ.get(name, default_val)
|
1767
|
+
return [int(x) for x in val.split(",") if x]
|
1768
|
+
|
1769
|
+
|
1770
|
+
# Return filenames
|
1771
|
+
def find_traces_under_path(path: str) -> List[str]:
    """Collect the names of all ``*.trace.json.gz`` files found anywhere below ``path``.

    Only bare file names are returned, not paths, so traces in different
    subdirectories that share a name appear as duplicate entries.
    """
    return [
        filename
        for _root, _subdirs, filenames in os.walk(path)
        for filename in filenames
        if filename.endswith(".trace.json.gz")
    ]
|
1778
|
+
|
1779
|
+
|
1780
|
+
def write_results_to_json(model, metrics, mode="a"):
    """Record one evaluation result in ``results.json`` in the current directory.

    Args:
        model: Model identifier stored under ``"model"``.
        metrics: Dict of metrics; must contain ``"score"``. It is stored whole
            under ``"metrics"``, with ``"score"`` (and ``"latency"`` when
            present) also copied to the top level of the entry.
        mode: ``"a"`` appends to the list already stored in ``results.json``;
            any other value discards previous content.

    Raises:
        KeyError: if ``metrics`` has no ``"score"`` entry.
    """
    result = {
        "timestamp": datetime.now().isoformat(),
        "model": model,
        "metrics": metrics,
        "score": metrics["score"],
    }

    if "latency" in metrics:
        # Store the plain value. A stray trailing comma previously wrapped it
        # in a 1-tuple, which JSON serialized as a one-element array.
        result["latency"] = metrics["latency"]

    existing_results = []
    if mode == "a" and os.path.exists("results.json"):
        try:
            with open("results.json", "r") as f:
                existing_results = json.load(f)
        except json.JSONDecodeError:
            # A corrupt file is treated as empty rather than aborting the run.
            existing_results = []

    if isinstance(existing_results, list):
        existing_results.append(result)
    else:
        # The file held valid JSON that is not a list; start a fresh list.
        existing_results = [result]

    with open("results.json", "w") as f:
        json.dump(existing_results, f, indent=2)
|
sglang/utils.py
CHANGED
@@ -6,6 +6,7 @@ import logging
|
|
6
6
|
import os
|
7
7
|
import random
|
8
8
|
import socket
|
9
|
+
import ssl
|
9
10
|
import subprocess
|
10
11
|
import sys
|
11
12
|
import time
|
@@ -155,7 +156,15 @@ def http_request(
|
|
155
156
|
data = bytes(dumps(json), encoding="utf-8")
|
156
157
|
|
157
158
|
try:
|
158
|
-
|
159
|
+
if sys.version_info >= (3, 13):
|
160
|
+
# Python 3.13+: Use SSL context (cafile removed)
|
161
|
+
if verify and isinstance(verify, str):
|
162
|
+
context = ssl.create_default_context(cafile=verify)
|
163
|
+
else:
|
164
|
+
context = ssl.create_default_context()
|
165
|
+
resp = urllib.request.urlopen(req, data=data, context=context)
|
166
|
+
else:
|
167
|
+
resp = urllib.request.urlopen(req, data=data, cafile=verify)
|
159
168
|
return HttpResponse(resp)
|
160
169
|
except urllib.error.HTTPError as e:
|
161
170
|
return HttpResponse(e)
|
@@ -472,11 +481,22 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
|
|
472
481
|
class TypeBasedDispatcher:
    """Call the handler registered for the first type an object is an instance of.

    Handlers are tried in list order, so earlier entries take precedence when
    an object matches several types.
    """

    def __init__(self, mapping: List[Tuple[Type, Callable]]):
        # The list is stored as given (not copied); __iadd__ extends it in place.
        self._mapping = mapping
        self._fallback_fn = None

    def add_fallback_fn(self, fallback_fn: Callable):
        """Set the handler used when no registered type matches."""
        self._fallback_fn = fallback_fn

    def __iadd__(self, other: "TypeBasedDispatcher"):
        """Append ``other``'s handlers after this dispatcher's own; its fallback is not copied."""
        self._mapping.extend(other._mapping)
        return self

    def __call__(self, obj: Any):
        """Dispatch ``obj``; raise ``ValueError`` if nothing matches and no fallback is set."""
        for handled_type, handler in self._mapping:
            if not isinstance(obj, handled_type):
                continue
            return handler(obj)

        fallback = self._fallback_fn
        if fallback is None:
            raise ValueError(f"Invalid object: {obj}")
        return fallback(obj)
|
481
501
|
|
482
502
|
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.5.
|
1
|
+
__version__ = "0.5.3.post1"
|