sglang 0.5.2rc1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/interpreter.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +192 -113
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +132 -57
- sglang/srt/entrypoints/openai/protocol.py +115 -7
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +207 -58
- sglang/srt/entrypoints/openai/serving_completions.py +17 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +10 -4
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +49 -4
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +24 -1
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +106 -82
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +53 -7
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -7
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +225 -57
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +77 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +78 -49
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +215 -314
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +358 -404
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +147 -19
- sglang/srt/managers/scheduler.py +501 -304
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +119 -40
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +321 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +15 -21
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +58 -34
- sglang/srt/mem_cache/hiradix_cache.py +227 -80
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -223
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +268 -63
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +198 -30
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +519 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +55 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +98 -57
- sglang/srt/model_executor/model_runner.py +433 -158
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +833 -152
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +14 -5
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +124 -14
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +26 -5
- sglang/srt/models/qwen3_moe.py +71 -12
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +10 -3
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +6 -0
- sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +1030 -254
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +253 -136
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +445 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +22 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/RECORD +392 -258
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
- /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
|
|
1
1
|
# SPDX-License-Identifier: Apache-2.0
|
2
2
|
# Adapted from vLLM's OpenAIServingResponses
|
3
3
|
"""Handler for /v1/responses requests"""
|
4
|
+
from __future__ import annotations
|
4
5
|
|
5
6
|
import asyncio
|
6
7
|
import copy
|
@@ -9,7 +10,7 @@ import logging
|
|
9
10
|
import time
|
10
11
|
from contextlib import AsyncExitStack
|
11
12
|
from http import HTTPStatus
|
12
|
-
from typing import Any, AsyncGenerator, AsyncIterator, Optional, Union
|
13
|
+
from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Optional, Union
|
13
14
|
|
14
15
|
import jinja2
|
15
16
|
import openai.types.responses as openai_responses_types
|
@@ -54,11 +55,13 @@ from sglang.srt.entrypoints.openai.protocol import (
|
|
54
55
|
from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
|
55
56
|
from sglang.srt.entrypoints.openai.tool_server import MCPToolServer, ToolServer
|
56
57
|
from sglang.srt.managers.io_struct import GenerateReqInput
|
57
|
-
from sglang.srt.
|
58
|
-
from sglang.srt.managers.tokenizer_manager import TokenizerManager
|
59
|
-
from sglang.srt.reasoning_parser import ReasoningParser
|
58
|
+
from sglang.srt.parser.reasoning_parser import ReasoningParser
|
60
59
|
from sglang.srt.utils import random_uuid
|
61
60
|
|
61
|
+
if TYPE_CHECKING:
|
62
|
+
from sglang.srt.managers.template_manager import TemplateManager
|
63
|
+
from sglang.srt.managers.tokenizer_manager import TokenizerManager
|
64
|
+
|
62
65
|
logger = logging.getLogger(__name__)
|
63
66
|
|
64
67
|
|
@@ -120,6 +123,39 @@ class OpenAIServingResponses(OpenAIServingChat):
|
|
120
123
|
|
121
124
|
self.background_tasks: dict[str, asyncio.Task] = {}
|
122
125
|
|
126
|
+
# error helpers dedicated for v1/responses
|
127
|
+
def create_error_response(
    self,
    message: str,
    err_type: str = "invalid_request_error",
    status_code: int = 400,
    param: Optional[str] = None,
) -> ORJSONResponse:
    """Build the JSON error response used by the v1/responses helpers.

    The error details are nested under a top-level ``"error"`` key, and
    ``status_code`` is used both as the HTTP status of the response and as
    the ``"code"`` field inside the payload.

    Args:
        message: Human-readable description of the failure.
        err_type: Value stored in the ``"type"`` field.
        status_code: HTTP status code; also echoed as ``"code"``.
        param: Optional value stored in the ``"param"`` field.

    Returns:
        An ``ORJSONResponse`` whose content is ``{"error": {...}}``.
    """
    payload = {
        "error": {
            "message": message,
            "type": err_type,
            "param": param,
            "code": status_code,
        }
    }
    return ORJSONResponse(content=payload, status_code=status_code)
|
141
|
+
|
142
|
+
def create_streaming_error_response(
    self,
    message: str,
    err_type: str = "BadRequestError",
    status_code: int = 400,
) -> str:
    """Serialize an error as a JSON string with a nested ``"error"`` object.

    Args:
        message: Human-readable description of the failure.
        err_type: Value stored in the ``"type"`` field.
        status_code: Value stored in the ``"code"`` field.

    Returns:
        The JSON text of ``{"error": {...}}``; ``"param"`` is always null.
    """
    error_body = {
        "message": message,
        "type": err_type,
        "param": None,
        "code": status_code,
    }
    return json.dumps({"error": error_body})
|
158
|
+
|
123
159
|
def _request_id_prefix(self) -> str:
|
124
160
|
return "resp_"
|
125
161
|
|
@@ -242,6 +278,7 @@ class OpenAIServingResponses(OpenAIServingChat):
|
|
242
278
|
sampling_params=sampling_params,
|
243
279
|
stream=request.stream,
|
244
280
|
rid=request.request_id,
|
281
|
+
extra_key=self._compute_extra_key(request),
|
245
282
|
background=request.background,
|
246
283
|
)
|
247
284
|
|
@@ -830,6 +867,13 @@ class OpenAIServingResponses(OpenAIServingChat):
|
|
830
867
|
|
831
868
|
async for ctx in result_generator:
|
832
869
|
|
870
|
+
# Only process context objects that implement the `is_expecting_start()` method,
|
871
|
+
# which indicates they support per-turn streaming (e.g., StreamingHarmonyContext).
|
872
|
+
# Contexts without this method are skipped, as they do not represent a new turn
|
873
|
+
# or are not compatible with per-turn handling in the /v1/responses endpoint.
|
874
|
+
if not hasattr(ctx, "is_expecting_start"):
|
875
|
+
continue
|
876
|
+
|
833
877
|
if ctx.is_expecting_start():
|
834
878
|
current_output_index += 1
|
835
879
|
sent_output_item_added = False
|
@@ -1247,6 +1291,7 @@ class OpenAIServingResponses(OpenAIServingChat):
|
|
1247
1291
|
sampling_params=sampling_params,
|
1248
1292
|
stream=adapted_request.stream,
|
1249
1293
|
rid=request_id,
|
1294
|
+
extra_key=adapted_request.extra_key,
|
1250
1295
|
return_logprob=adapted_request.return_logprob,
|
1251
1296
|
logprob_start_len=adapted_request.logprob_start_len,
|
1252
1297
|
top_logprobs_num=adapted_request.top_logprobs_num,
|
@@ -25,6 +25,7 @@ class OpenAIServingScore(OpenAIServingBase):
|
|
25
25
|
def _convert_to_internal_request(
|
26
26
|
self,
|
27
27
|
request: ScoringRequest,
|
28
|
+
raw_request: Request = None,
|
28
29
|
) -> tuple[ScoringRequest, ScoringRequest]:
|
29
30
|
"""Convert OpenAI scoring request to internal format"""
|
30
31
|
# For scoring, we pass the request directly as the tokenizer_manager
|
sglang/srt/environ.py
ADDED
@@ -0,0 +1,285 @@
|
|
1
|
+
import os
|
2
|
+
import subprocess
|
3
|
+
import warnings
|
4
|
+
from contextlib import ExitStack, contextmanager
|
5
|
+
from typing import Any
|
6
|
+
|
7
|
+
|
8
|
+
class EnvField:
    """Typed accessor for a single environment variable.

    When an instance is assigned as a class attribute, ``__set_name__``
    records the attribute name, which is then used as the environment
    variable name. Subclasses implement ``parse`` to convert the raw string
    from ``os.environ`` into the desired Python type.
    """

    def __init__(self, default: Any):
        """Store the fallback returned when the variable is not set."""
        self.default = default
        # NOTE: None normally means "not set". If the value is manually set
        # to None via set(None), that is recorded in _set_to_none so get()
        # returns None instead of the default.
        # Always use clear() to reset the value, which restores the default
        # fallback.
        self._set_to_none = False

    def __set_name__(self, owner, name):
        """Record the class attribute name as the environment variable name."""
        self.name = name

    def parse(self, value: str) -> Any:
        """Convert the raw environment string; must be overridden.

        Implementations should raise ``ValueError`` for invalid input so
        that ``get`` can warn and fall back to the default.
        """
        raise NotImplementedError()

    def get(self) -> Any:
        """Return the parsed value, ``None`` if set to None, or the default.

        If ``parse`` raises ``ValueError`` a warning is emitted and the
        default is returned instead.
        """
        value = os.getenv(self.name)
        if self._set_to_none:
            # set(None) removes the variable from os.environ, so it is
            # expected to be absent while the flag is active.
            assert value is None
            return None

        if value is None:
            return self.default

        try:
            return self.parse(value)
        except ValueError as e:
            warnings.warn(
                f'Invalid value for {self.name}: {e}, using default "{self.default}"'
            )
            return self.default

    def is_set(self):
        """Return True if the variable is in ``os.environ`` or set to None."""
        # NOTE: If None is manually set, it is considered as set.
        return self.name in os.environ or self._set_to_none

    def get_set_value_or(self, or_value: Any):
        """Return ``get()`` when the field is set, otherwise ``or_value``."""
        # NOTE: Ugly usage, but only way to get custom default value.
        return self.get() if self.is_set() else or_value

    def set(self, value: Any):
        """Write ``str(value)`` to ``os.environ``; None removes and flags it."""
        if value is None:
            self._set_to_none = True
            os.environ.pop(self.name, None)
        else:
            self._set_to_none = False
            os.environ[self.name] = str(value)

    @contextmanager
    def override(self, value: Any):
        """Temporarily set the field to ``value`` for the ``with`` body.

        The previous environment entry (or its absence) and the previous
        ``_set_to_none`` flag are restored on exit, including when the body
        raises an exception.
        """
        backup_present = self.name in os.environ
        backup_value = os.environ.get(self.name)
        backup_set_to_none = self._set_to_none
        self.set(value)
        try:
            yield
        finally:
            # Restore in `finally` so an exception inside the `with` body
            # cannot leak the overridden value into the process environment.
            if backup_present:
                os.environ[self.name] = backup_value
            else:
                os.environ.pop(self.name, None)
            self._set_to_none = backup_set_to_none

    def clear(self):
        """Remove the variable and reset the set-to-None flag."""
        os.environ.pop(self.name, None)
        self._set_to_none = False

    @property
    def value(self):
        """Property alias for ``get()``."""
        return self.get()
|
75
|
+
|
76
|
+
|
77
|
+
class EnvStr(EnvField):
    """Environment field whose value is used as the raw string."""

    def parse(self, value: str) -> str:
        """Return ``value`` unchanged."""
        return value
|
80
|
+
|
81
|
+
|
82
|
+
class EnvBool(EnvField):
    """Environment field parsed as a boolean flag (case-insensitive)."""

    def parse(self, value: str) -> bool:
        """Map true/1/yes/y to True and false/0/no/n to False.

        Raises:
            ValueError: If the lowercased text is not one of the accepted
                spellings.
        """
        value = value.lower()
        spellings = {
            "true": True,
            "1": True,
            "yes": True,
            "y": True,
            "false": False,
            "0": False,
            "no": False,
            "n": False,
        }
        parsed = spellings.get(value)
        if parsed is None:
            raise ValueError(f'"{value}" is not a valid boolean value')
        return parsed
|
90
|
+
|
91
|
+
|
92
|
+
class EnvInt(EnvField):
    """Environment field parsed with ``int()``."""

    def parse(self, value: str) -> int:
        """Return ``int(value)``.

        Raises:
            ValueError: If ``value`` is not accepted by ``int()``; the
                message names the offending text.
        """
        try:
            return int(value)
        except ValueError:
            # The replacement message already carries the relevant detail;
            # `from None` avoids the misleading "During handling of the
            # above exception, another exception occurred" traceback.
            raise ValueError(f'"{value}" is not a valid integer value') from None
|
98
|
+
|
99
|
+
|
100
|
+
class EnvFloat(EnvField):
    """Environment field parsed with ``float()``."""

    def parse(self, value: str) -> float:
        """Return ``float(value)``.

        Raises:
            ValueError: If ``value`` is not accepted by ``float()``; the
                message names the offending text.
        """
        try:
            return float(value)
        except ValueError:
            # The replacement message already carries the relevant detail;
            # `from None` avoids the misleading "During handling of the
            # above exception, another exception occurred" traceback.
            raise ValueError(f'"{value}" is not a valid float value') from None
|
106
|
+
|
107
|
+
|
108
|
+
class Envs:
    """Declarative table of the environment variables used by the package.

    Each attribute is an ``EnvBool``/``EnvInt``/``EnvFloat``/``EnvStr``
    instance and is grouped by subsystem with the section comments below.
    NOTE(review): the constructor argument appears to be the fallback value
    reported when the variable is unset (``examples()`` asserts that a cleared
    ``EnvBool(False)`` field reads back ``False``) -- confirm against
    ``EnvField``, which is defined outside this section.
    """

    # Formatting is disabled so that every declaration stays on one line.
    # fmt: off

    # Model & File Download
    SGLANG_USE_MODELSCOPE = EnvBool(False)

    # Test & Debug
    SGLANG_IS_IN_CI = EnvBool(False)
    SGLANG_AMD_CI = EnvBool(False)
    SGLANG_TEST_RETRACT = EnvBool(False)
    SGLANG_SET_CPU_AFFINITY = EnvBool(False)
    SGLANG_PROFILE_WITH_STACK = EnvBool(True)
    SGLANG_RECORD_STEP_TIME = EnvBool(False)
    SGLANG_GC_LOG = EnvBool(False)
    SGLANG_FORCE_SHUTDOWN = EnvBool(False)
    SGLANG_DEBUG_MEMORY_POOL = EnvBool(False)
    SGLANG_TEST_REQUEST_TIME_STATS = EnvBool(False)
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK = EnvBool(False)
    SGLANG_DISABLE_REQUEST_LOGGING = EnvBool(False)
    SGLANG_SIMULATE_ACC_LEN = EnvFloat(-1)
    SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial")
    SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp")

    # Model Parallel
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER = EnvBool(True)

    # Constrained Decoding
    SGLANG_DISABLE_OUTLINES_DISK_CACHE = EnvBool(True)
    SGLANG_GRAMMAR_TIMEOUT = EnvFloat(300)

    # Hi-Cache
    SGLANG_HICACHE_HF3FS_CONFIG_PATH = EnvStr(None)

    # Mooncake KV Transfer
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL = EnvBool(False)
    ENABLE_ASCEND_TRANSFER_WITH_MOONCAKE = EnvBool(False)

    # AMD & ROCm
    SGLANG_USE_AITER = EnvBool(False)
    SGLANG_ROCM_FUSED_DECODE_MLA = EnvBool(False)

    # Quantization
    SGLANG_INT4_WEIGHT = EnvBool(False)
    SGLANG_CPU_QUANTIZATION = EnvBool(False)
    SGLANG_USE_DYNAMIC_MXFP4_LINEAR = EnvBool(False)
    SGLANG_FORCE_FP8_MARLIN = EnvBool(False)

    # Flashinfer
    SGLANG_IS_FLASHINFER_AVAILABLE = EnvBool(True)
    SGLANG_ENABLE_FLASHINFER_GEMM = EnvBool(False)

    # Triton
    SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS = EnvBool(False)

    # Torch Compile
    SGLANG_ENABLE_TORCH_COMPILE = EnvBool(False)

    # EPLB
    SGLANG_EXPERT_LOCATION_UPDATER_LOG_INPUT = EnvBool(False)
    SGLANG_EXPERT_LOCATION_UPDATER_CANARY = EnvBool(False)
    SGLANG_EXPERT_LOCATION_UPDATER_LOG_METRICS = EnvBool(False)
    SGLANG_LOG_EXPERT_LOCATION_METADATA = EnvBool(False)

    # TBO
    SGLANG_TBO_DEBUG = EnvBool(False)

    # DeepGemm
    SGLANG_ENABLE_JIT_DEEPGEMM = EnvBool(True)
    SGLANG_JIT_DEEPGEMM_PRECOMPILE = EnvBool(True)
    SGLANG_JIT_DEEPGEMM_COMPILE_WORKERS = EnvInt(4)
    SGLANG_IN_DEEPGEMM_PRECOMPILE_STAGE = EnvBool(False)
    # The "~" is expanded once, when this class body is executed.
    SGLANG_DG_CACHE_DIR = EnvStr(os.path.expanduser("~/.cache/deep_gemm"))
    SGLANG_DG_USE_NVRTC = EnvBool(False)
    SGLANG_USE_DEEPGEMM_BMM = EnvBool(False)

    # sgl-kernel
    SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK = EnvBool(False)

    # vLLM dependencies
    USE_VLLM_CUSTOM_ALLREDUCE = EnvBool(False)
    USE_VLLM_CUTLASS_W8A8_FP8_KERNEL = EnvBool(False)

    # Uncategorized switches (no section heading in the original).
    USE_TRITON_W8A8_FP8_KERNEL = EnvBool(False)
    RETURN_ORIGINAL_LOGPROB = EnvBool(False)
    SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN = EnvBool(False)
    SGLANG_MOE_PADDING = EnvBool(False)
    SGLANG_CUTLASS_MOE = EnvBool(False)
    HF_HUB_DISABLE_XET = EnvBool(False)
    DISABLE_OPENAPI_DOC = EnvBool(False)
    SGLANG_ENABLE_TORCH_INFERENCE_MODE = EnvBool(False)
    SGLANG_IS_FIRST_RANK_ON_NODE = EnvBool(True)
    SGLANG_SUPPORT_CUTLASS_BLOCK_FP8 = EnvBool(False)
    SGLANG_SYNC_TOKEN_IDS_ACROSS_TP = EnvBool(False)
    SGLANG_ENABLE_COLOCATED_BATCH_GEN = EnvBool(False)

    # Deterministic inference
    SGLANG_ENABLE_DETERMINISTIC_INFERENCE = EnvBool(False)
    SGLANG_FLASHINFER_PREFILL_SPLIT_TILE_SIZE = EnvInt(4096)
    SGLANG_FLASHINFER_DECODE_SPLIT_TILE_SIZE = EnvInt(2048)
    SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZE = EnvInt(4096)
    SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)

    # fmt: on


# Module-level instance through which the fields above are accessed
# (for example ``envs.SGLANG_TEST_RETRACT`` in the examples below).
envs = Envs()
|
214
|
+
|
215
|
+
|
216
|
+
def _convert_SGL_to_SGLANG():
    """Mirror every deprecated ``SGL_*`` environment variable as ``SGLANG_*``.

    For each variable whose name starts with ``SGL_``, a warning is emitted
    and the value is copied to the same name with the leading ``SGL_``
    replaced by ``SGLANG_``. The original variable is left in place, and an
    existing ``SGLANG_*`` variable of the same name is overwritten.
    """
    # Iterate over a snapshot: the loop body writes to os.environ, and a
    # mapping should not be mutated while its live view is being iterated.
    for key, value in list(os.environ.items()):
        if not key.startswith("SGL_"):
            continue
        new_key = key.replace("SGL_", "SGLANG_", 1)
        warnings.warn(
            f"Environment variable {key} is deprecated, please use {new_key}"
        )
        os.environ[new_key] = value


# Run once at import time so later lookups only need the SGLANG_* names.
_convert_SGL_to_SGLANG()
|
227
|
+
|
228
|
+
|
229
|
+
def example_with_exit_stack():
    """Show how to drive ``override`` through an ``ExitStack``.

    NOTE(review): the final assertion expects ``value`` to be ``None`` after
    the override is undone, which holds when the caller has previously done
    ``set(None)`` (as ``examples()`` does right before calling this function);
    presumably it would not hold from a freshly cleared state -- confirm.
    """
    # Use this style of context manager in unit test
    exit_stack = ExitStack()
    # Entering the context applies the override immediately.
    exit_stack.enter_context(envs.SGLANG_TEST_RETRACT.override(False))
    assert envs.SGLANG_TEST_RETRACT.value is False
    # Closing the stack exits the override and restores the previous state.
    exit_stack.close()
    assert envs.SGLANG_TEST_RETRACT.value is None
|
236
|
+
|
237
|
+
|
238
|
+
def example_with_subprocess():
    """Show that ``override`` is visible to child processes, and only while active.

    A child interpreter prints ``os.getenv('SGLANG_TEST_RETRACT')``. Inside
    the override the child must see ``"True"``; afterwards it must see
    ``"None"``, which requires the variable to be absent from the parent
    environment when this function is entered.
    """
    # Local import: only this example needs it. Using the running interpreter
    # avoids depending on a "python" executable being present on PATH.
    import sys

    command = [
        sys.executable,
        "-c",
        "import os; print(os.getenv('SGLANG_TEST_RETRACT'))",
    ]

    def _child_output() -> str:
        # run() drains both pipes and reaps the child, so no process or pipe
        # handle is left dangling and a full pipe cannot block the child.
        completed = subprocess.run(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        return completed.stdout.decode("utf-8").strip()

    with envs.SGLANG_TEST_RETRACT.override(True):
        assert _child_output() == "True"

    assert _child_output() == "None"
|
251
|
+
|
252
|
+
|
253
|
+
def examples():
    """Walk through ``clear``/``set``/``override`` on ``envs.SGLANG_TEST_RETRACT``.

    The steps are order-dependent: each assertion checks the state left by
    the statements before it. The function mutates the process environment.
    """
    # Example usage for envs
    # After clear() the field reads back False for this EnvBool(False) field.
    envs.SGLANG_TEST_RETRACT.clear()
    assert envs.SGLANG_TEST_RETRACT.value is False

    # Explicitly setting None counts as "set" and reads back as None.
    envs.SGLANG_TEST_RETRACT.set(None)
    assert envs.SGLANG_TEST_RETRACT.is_set() and envs.SGLANG_TEST_RETRACT.value is None

    # clear() also drops the "explicitly set to None" state.
    envs.SGLANG_TEST_RETRACT.clear()
    assert not envs.SGLANG_TEST_RETRACT.is_set()

    envs.SGLANG_TEST_RETRACT.set(True)
    assert envs.SGLANG_TEST_RETRACT.value is True

    # Overriding with None is temporary ...
    with envs.SGLANG_TEST_RETRACT.override(None):
        assert (
            envs.SGLANG_TEST_RETRACT.is_set() and envs.SGLANG_TEST_RETRACT.value is None
        )

    # ... and the previous True is restored afterwards.
    assert envs.SGLANG_TEST_RETRACT.value is True

    # The reverse direction: start from an explicit None, override with True.
    envs.SGLANG_TEST_RETRACT.set(None)
    with envs.SGLANG_TEST_RETRACT.override(True):
        assert envs.SGLANG_TEST_RETRACT.value is True

    assert envs.SGLANG_TEST_RETRACT.is_set() and envs.SGLANG_TEST_RETRACT.value is None

    # Both helpers run with the field still explicitly set to None.
    example_with_exit_stack()
    example_with_subprocess()
|
282
|
+
|
283
|
+
|
284
|
+
# Running this module as a script executes the usage examples above.
if __name__ == "__main__":
    examples()
|
sglang/srt/eplb/eplb_manager.py
CHANGED
@@ -55,7 +55,7 @@ class EPLBManager:
|
|
55
55
|
enable_timing = self._rebalance_layers_per_chunk is None
|
56
56
|
|
57
57
|
if enable_timing:
|
58
|
-
torch.
|
58
|
+
torch.get_device_module().synchronize()
|
59
59
|
time_start = time.time()
|
60
60
|
|
61
61
|
dump_record_output = get_global_expert_distribution_recorder().dump_record(
|
@@ -85,7 +85,7 @@ class EPLBManager:
|
|
85
85
|
|
86
86
|
msg = f"[EPLBManager] rebalance end"
|
87
87
|
if enable_timing:
|
88
|
-
torch.
|
88
|
+
torch.get_device_module().synchronize()
|
89
89
|
time_end = time.time()
|
90
90
|
msg += f" time={time_end - time_start:.3f}s"
|
91
91
|
logger.info(msg)
|
@@ -11,6 +11,9 @@
|
|
11
11
|
# See the License for the specific language governing permissions and
|
12
12
|
# limitations under the License.
|
13
13
|
# ==============================================================================
|
14
|
+
|
15
|
+
from __future__ import annotations
|
16
|
+
|
14
17
|
import logging
|
15
18
|
import math
|
16
19
|
import os
|
@@ -19,16 +22,20 @@ from abc import ABC
|
|
19
22
|
from collections import deque
|
20
23
|
from contextlib import contextmanager
|
21
24
|
from pathlib import Path
|
22
|
-
from typing import Any, Dict, List, Literal, Optional, Tuple, Type
|
25
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Type
|
23
26
|
|
24
27
|
import einops
|
25
28
|
import torch
|
26
29
|
import torch.distributed
|
27
30
|
|
28
|
-
from sglang.srt.eplb.expert_location import ExpertLocationMetadata
|
29
31
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
30
32
|
from sglang.srt.server_args import ServerArgs
|
31
|
-
from sglang.srt.utils import Withable, get_bool_env_var
|
33
|
+
from sglang.srt.utils import Withable, get_bool_env_var, is_npu
|
34
|
+
|
35
|
+
_is_npu = is_npu()
|
36
|
+
|
37
|
+
if TYPE_CHECKING:
|
38
|
+
from sglang.srt.eplb.expert_location import ExpertLocationMetadata
|
32
39
|
|
33
40
|
logger = logging.getLogger(__name__)
|
34
41
|
|
@@ -43,7 +50,7 @@ class ExpertDistributionRecorder(ABC):
|
|
43
50
|
@staticmethod
|
44
51
|
def init_new(
|
45
52
|
server_args: ServerArgs,
|
46
|
-
expert_location_metadata:
|
53
|
+
expert_location_metadata: ExpertLocationMetadata,
|
47
54
|
rank: int,
|
48
55
|
):
|
49
56
|
if server_args.expert_distribution_recorder_mode is not None:
|
@@ -118,7 +125,7 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
|
|
118
125
|
def __init__(
|
119
126
|
self,
|
120
127
|
server_args: ServerArgs,
|
121
|
-
expert_location_metadata:
|
128
|
+
expert_location_metadata: ExpertLocationMetadata,
|
122
129
|
rank: int,
|
123
130
|
):
|
124
131
|
self._server_args = server_args
|
@@ -211,7 +218,9 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
|
|
211
218
|
def _on_hook(self, hook_name: str, **kwargs):
|
212
219
|
if self._disable_all:
|
213
220
|
return
|
214
|
-
if not (
|
221
|
+
if not (
|
222
|
+
self._recording or torch.get_device_module().is_current_stream_capturing()
|
223
|
+
):
|
215
224
|
return
|
216
225
|
gatherer = self._single_pass_gatherers[
|
217
226
|
self._accumulator.get_single_pass_gatherer_key(
|
@@ -279,7 +288,7 @@ class _SinglePassGatherer(ABC):
|
|
279
288
|
@staticmethod
|
280
289
|
def init_new(
|
281
290
|
server_args: ServerArgs,
|
282
|
-
expert_location_metadata:
|
291
|
+
expert_location_metadata: ExpertLocationMetadata,
|
283
292
|
rank: int,
|
284
293
|
) -> "_SinglePassGatherer":
|
285
294
|
if server_args.expert_distribution_recorder_mode == "per_token":
|
@@ -307,7 +316,7 @@ class _SinglePassGatherer(ABC):
|
|
307
316
|
|
308
317
|
return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
|
309
318
|
|
310
|
-
def __init__(self, expert_location_metadata:
|
319
|
+
def __init__(self, expert_location_metadata: ExpertLocationMetadata, rank: int):
|
311
320
|
self._expert_location_metadata = expert_location_metadata
|
312
321
|
self._rank = rank
|
313
322
|
|
@@ -346,7 +355,7 @@ class _DetailSinglePassGatherer(_SinglePassGatherer):
|
|
346
355
|
def __init__(
|
347
356
|
self,
|
348
357
|
server_args: ServerArgs,
|
349
|
-
expert_location_metadata:
|
358
|
+
expert_location_metadata: ExpertLocationMetadata,
|
350
359
|
rank: int,
|
351
360
|
):
|
352
361
|
super().__init__(expert_location_metadata, rank)
|
@@ -446,6 +455,10 @@ def _list_sum(a: List, b: List) -> List:
|
|
446
455
|
class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
|
447
456
|
def __init__(self, *args, enable_global_physical_experts: bool, **kwargs):
|
448
457
|
super().__init__(*args, **kwargs)
|
458
|
+
if not _is_npu:
|
459
|
+
device = "cuda"
|
460
|
+
else:
|
461
|
+
device = "npu"
|
449
462
|
self._enable_global_physical_experts = enable_global_physical_experts
|
450
463
|
self._data = torch.zeros(
|
451
464
|
(
|
@@ -457,7 +470,7 @@ class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
|
|
457
470
|
),
|
458
471
|
),
|
459
472
|
dtype=torch.int,
|
460
|
-
device=
|
473
|
+
device=device,
|
461
474
|
)
|
462
475
|
|
463
476
|
def reset(self):
|
@@ -561,7 +574,7 @@ class _Accumulator(ABC):
|
|
561
574
|
@staticmethod
|
562
575
|
def init_new(
|
563
576
|
server_args: ServerArgs,
|
564
|
-
expert_location_metadata:
|
577
|
+
expert_location_metadata: ExpertLocationMetadata,
|
565
578
|
rank: int,
|
566
579
|
) -> "_Accumulator":
|
567
580
|
return _Accumulator.get_class(server_args)(
|
@@ -580,7 +593,7 @@ class _Accumulator(ABC):
|
|
580
593
|
def __init__(
|
581
594
|
self,
|
582
595
|
server_args: ServerArgs,
|
583
|
-
expert_location_metadata:
|
596
|
+
expert_location_metadata: ExpertLocationMetadata,
|
584
597
|
rank: int,
|
585
598
|
):
|
586
599
|
self._server_args = server_args
|
@@ -779,7 +792,7 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):
|
|
779
792
|
|
780
793
|
if self._first_dump:
|
781
794
|
self._first_dump = False
|
782
|
-
torch.
|
795
|
+
torch.get_device_module().empty_cache()
|
783
796
|
|
784
797
|
torch.distributed.all_reduce(
|
785
798
|
logical_count_of_buffered_step, op=torch.distributed.ReduceOp.SUM
|
@@ -11,21 +11,26 @@
|
|
11
11
|
# See the License for the specific language governing permissions and
|
12
12
|
# limitations under the License.
|
13
13
|
# ==============================================================================
|
14
|
+
|
15
|
+
from __future__ import annotations
|
16
|
+
|
14
17
|
import json
|
15
18
|
import logging
|
16
19
|
import random
|
17
20
|
from dataclasses import dataclass
|
18
21
|
from pathlib import Path
|
19
|
-
from typing import List, Optional
|
22
|
+
from typing import TYPE_CHECKING, List, Optional
|
20
23
|
|
21
24
|
import torch
|
22
25
|
import torch.distributed
|
23
26
|
import torch.nn.functional as F
|
24
27
|
|
25
|
-
from sglang.srt.configs.model_config import ModelConfig
|
26
28
|
from sglang.srt.eplb import eplb_algorithms
|
27
29
|
from sglang.srt.model_loader import get_model_architecture
|
28
|
-
|
30
|
+
|
31
|
+
if TYPE_CHECKING:
|
32
|
+
from sglang.srt.configs.model_config import ModelConfig
|
33
|
+
from sglang.srt.server_args import ServerArgs
|
29
34
|
|
30
35
|
logger = logging.getLogger(__name__)
|
31
36
|
|
@@ -226,6 +231,7 @@ class ExpertLocationMetadata:
|
|
226
231
|
logical_to_all_physical_map_num_valid=logical_to_all_physical_map_num_valid,
|
227
232
|
logical_to_rank_dispatch_physical_map=(
|
228
233
|
compute_logical_to_rank_dispatch_physical_map(
|
234
|
+
server_args=server_args,
|
229
235
|
logical_to_all_physical_map=logical_to_all_physical_map,
|
230
236
|
num_gpus=ep_size,
|
231
237
|
num_physical_experts=num_physical_experts,
|
@@ -335,6 +341,7 @@ def _pad_nested_array(arr, pad_value):
|
|
335
341
|
|
336
342
|
# TODO optimize performance (rewrite and/or run in separate process with overlap)
|
337
343
|
def compute_logical_to_rank_dispatch_physical_map(
|
344
|
+
server_args: ServerArgs,
|
338
345
|
logical_to_all_physical_map: torch.Tensor,
|
339
346
|
num_gpus: int,
|
340
347
|
num_physical_experts: int,
|
@@ -343,7 +350,9 @@ def compute_logical_to_rank_dispatch_physical_map(
|
|
343
350
|
):
|
344
351
|
r = random.Random(seed)
|
345
352
|
|
346
|
-
|
353
|
+
num_local_gpu_physical_experts = num_physical_experts // num_gpus
|
354
|
+
num_gpus_per_node = server_args.ep_size // server_args.nnodes
|
355
|
+
num_local_node_physical_experts = num_local_gpu_physical_experts * num_gpus_per_node
|
347
356
|
num_layers, num_logical_experts, _ = logical_to_all_physical_map.shape
|
348
357
|
dtype = logical_to_all_physical_map.dtype
|
349
358
|
|
@@ -367,13 +376,28 @@ def compute_logical_to_rank_dispatch_physical_map(
|
|
367
376
|
physical_expert_id
|
368
377
|
for physical_expert_id in candidate_physical_expert_ids
|
369
378
|
if _compute_gpu_id_of_physical_expert(
|
370
|
-
physical_expert_id,
|
379
|
+
physical_expert_id, num_local_gpu_physical_experts
|
371
380
|
)
|
372
381
|
== gpu_id
|
373
382
|
]
|
374
383
|
if len(same_gpu_physical_expert_ids) > 0:
|
384
|
+
# 1. Prefer same-GPU experts
|
375
385
|
output_partial[gpu_id] = same_gpu_physical_expert_ids[0]
|
376
|
-
|
386
|
+
else:
|
387
|
+
# 2. Otherwise, prefer same-node experts
|
388
|
+
node_id = gpu_id // num_gpus_per_node
|
389
|
+
same_node_physical_expert_ids = [
|
390
|
+
physical_expert_id
|
391
|
+
for physical_expert_id in candidate_physical_expert_ids
|
392
|
+
if _compute_node_id_of_physical_expert(
|
393
|
+
physical_expert_id, num_local_node_physical_experts
|
394
|
+
)
|
395
|
+
== node_id
|
396
|
+
]
|
397
|
+
if len(same_node_physical_expert_ids) > 0:
|
398
|
+
output_partial[gpu_id] = same_node_physical_expert_ids[0]
|
399
|
+
|
400
|
+
# 3. Fill remaining slots with fair random choices
|
377
401
|
num_remain = torch.sum(output_partial == -1).item()
|
378
402
|
output_partial[output_partial == -1] = torch.tensor(
|
379
403
|
_fair_choices(candidate_physical_expert_ids, k=num_remain, r=r),
|
@@ -399,9 +423,15 @@ def _logical_to_all_physical_raw(
|
|
399
423
|
|
400
424
|
|
401
425
|
def _compute_gpu_id_of_physical_expert(
    physical_expert_id: int, num_local_gpu_physical_experts: int
) -> int:
    """Return ``physical_expert_id`` floor-divided by the per-GPU expert count.

    Args:
        physical_expert_id: Id of the physical expert.
        num_local_gpu_physical_experts: Number of physical experts per GPU.

    Returns:
        The quotient, used by the caller as a GPU id.
    """
    gpu_id = physical_expert_id // num_local_gpu_physical_experts
    return gpu_id
|
429
|
+
|
430
|
+
|
431
|
+
def _compute_node_id_of_physical_expert(
    physical_expert_id: int, num_local_host_physical_experts: int
) -> int:
    """Return ``physical_expert_id`` floor-divided by the per-node expert count.

    Args:
        physical_expert_id: Id of the physical expert.
        num_local_host_physical_experts: Number of physical experts per node.

    Returns:
        The quotient, used by the caller as a node id.
    """
    node_id = physical_expert_id // num_local_host_physical_experts
    return node_id
|
405
435
|
|
406
436
|
|
407
437
|
def _fair_choices(arr: List, k: int, r: random.Random) -> List:
|
@@ -47,7 +47,7 @@ class ExpertLocationUpdater:
|
|
47
47
|
):
|
48
48
|
if self._first_execution:
|
49
49
|
self._first_execution = False
|
50
|
-
torch.
|
50
|
+
torch.get_device_module().empty_cache()
|
51
51
|
|
52
52
|
old_expert_location_metadata = get_global_expert_location_metadata()
|
53
53
|
assert old_expert_location_metadata is not None
|
@@ -162,12 +162,9 @@ class BaseFormatDetector(ABC):
|
|
162
162
|
|
163
163
|
try:
|
164
164
|
try:
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
self.tool_call_separator + self.bot_token
|
169
|
-
):
|
170
|
-
start_idx = len(self.tool_call_separator + self.bot_token)
|
165
|
+
tool_call_pos = current_text.find(self.bot_token)
|
166
|
+
if tool_call_pos != -1:
|
167
|
+
start_idx = tool_call_pos + len(self.bot_token)
|
171
168
|
elif self.current_tool_id > 0 and current_text.startswith(
|
172
169
|
self.tool_call_separator
|
173
170
|
):
|