PyPI - sglang - Versions diffs - 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl - Mend

sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (396)

sglang/bench_one_batch.py +7 -11
sglang/bench_one_batch_server.py +330 -31
sglang/bench_serving.py +474 -142
sglang/compile_deep_gemm.py +3 -0
sglang/global_config.py +2 -2
sglang/lang/backend/runtime_endpoint.py +1 -1
sglang/profiler.py +2 -2
sglang/srt/batch_invariant_ops/__init__.py +27 -0
sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
sglang/srt/configs/__init__.py +10 -0
sglang/srt/configs/device_config.py +3 -1
sglang/srt/configs/dots_ocr.py +64 -0
sglang/srt/configs/dots_vlm.py +139 -0
sglang/srt/configs/falcon_h1.py +314 -0
sglang/srt/configs/load_config.py +9 -0
sglang/srt/configs/mamba_utils.py +117 -0
sglang/srt/configs/model_config.py +228 -92
sglang/srt/configs/nemotron_h.py +286 -0
sglang/srt/configs/qwen3_next.py +294 -0
sglang/srt/configs/qwen3_vl.py +586 -0
sglang/srt/connector/__init__.py +8 -1
sglang/srt/connector/remote_instance.py +82 -0
sglang/srt/constrained/base_grammar_backend.py +49 -12
sglang/srt/constrained/llguidance_backend.py +0 -1
sglang/srt/constrained/outlines_backend.py +0 -1
sglang/srt/constrained/outlines_jump_forward.py +1 -1
sglang/srt/constrained/xgrammar_backend.py +30 -9
sglang/srt/custom_op.py +11 -1
sglang/srt/debug_utils/dump_comparator.py +81 -44
sglang/srt/debug_utils/dump_loader.py +97 -0
sglang/srt/debug_utils/dumper.py +21 -6
sglang/srt/debug_utils/text_comparator.py +73 -11
sglang/srt/disaggregation/ascend/conn.py +2 -2
sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
sglang/srt/disaggregation/base/conn.py +1 -1
sglang/srt/disaggregation/common/conn.py +279 -108
sglang/srt/disaggregation/decode.py +78 -37
sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
sglang/srt/disaggregation/fake/conn.py +1 -1
sglang/srt/disaggregation/mini_lb.py +6 -445
sglang/srt/disaggregation/mooncake/conn.py +55 -537
sglang/srt/disaggregation/nixl/conn.py +373 -68
sglang/srt/disaggregation/prefill.py +53 -49
sglang/srt/disaggregation/utils.py +40 -54
sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
sglang/srt/distributed/parallel_state.py +156 -80
sglang/srt/entrypoints/engine.py +59 -18
sglang/srt/entrypoints/grpc_request_manager.py +842 -0
sglang/srt/entrypoints/grpc_server.py +950 -0
sglang/srt/entrypoints/http_server.py +179 -60
sglang/srt/entrypoints/openai/protocol.py +265 -29
sglang/srt/entrypoints/openai/serving_base.py +65 -3
sglang/srt/entrypoints/openai/serving_chat.py +213 -122
sglang/srt/entrypoints/openai/serving_completions.py +14 -3
sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
sglang/srt/entrypoints/openai/serving_responses.py +48 -3
sglang/srt/entrypoints/openai/serving_score.py +1 -0
sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
sglang/srt/environ.py +289 -0
sglang/srt/eplb/eplb_manager.py +2 -2
sglang/srt/eplb/expert_distribution.py +26 -13
sglang/srt/eplb/expert_location.py +38 -8
sglang/srt/eplb/expert_location_updater.py +1 -1
sglang/srt/function_call/base_format_detector.py +3 -6
sglang/srt/function_call/ebnf_composer.py +11 -9
sglang/srt/function_call/function_call_parser.py +17 -8
sglang/srt/function_call/glm4_moe_detector.py +4 -4
sglang/srt/function_call/gpt_oss_detector.py +23 -0
sglang/srt/function_call/json_array_parser.py +63 -0
sglang/srt/function_call/kimik2_detector.py +17 -4
sglang/srt/function_call/qwen3_coder_detector.py +1 -1
sglang/srt/function_call/utils.py +96 -5
sglang/srt/grpc/__init__.py +1 -0
sglang/srt/grpc/compile_proto.py +245 -0
sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
sglang/srt/layers/activation.py +143 -9
sglang/srt/layers/attention/aiter_backend.py +14 -15
sglang/srt/layers/attention/ascend_backend.py +115 -9
sglang/srt/layers/attention/attention_registry.py +215 -0
sglang/srt/layers/attention/base_attn_backend.py +12 -3
sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
sglang/srt/layers/attention/fla/chunk.py +242 -0
sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
sglang/srt/layers/attention/fla/chunk_o.py +178 -0
sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
sglang/srt/layers/attention/fla/cumsum.py +300 -0
sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
sglang/srt/layers/attention/fla/index.py +37 -0
sglang/srt/layers/attention/fla/l2norm.py +150 -0
sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
sglang/srt/layers/attention/fla/op.py +66 -0
sglang/srt/layers/attention/fla/solve_tril.py +465 -0
sglang/srt/layers/attention/fla/utils.py +331 -0
sglang/srt/layers/attention/fla/wy_fast.py +158 -0
sglang/srt/layers/attention/flashattention_backend.py +40 -8
sglang/srt/layers/attention/flashinfer_backend.py +341 -204
sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
sglang/srt/layers/attention/flashmla_backend.py +7 -5
sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
sglang/srt/layers/attention/intel_amx_backend.py +3 -0
sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
sglang/srt/layers/attention/mamba/mamba.py +577 -0
sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
sglang/srt/layers/attention/nsa/transform_index.py +144 -0
sglang/srt/layers/attention/nsa/utils.py +24 -0
sglang/srt/layers/attention/nsa_backend.py +887 -0
sglang/srt/layers/attention/tbo_backend.py +6 -6
sglang/srt/layers/attention/torch_flex_backend.py +325 -0
sglang/srt/layers/attention/torch_native_backend.py +12 -6
sglang/srt/layers/attention/triton_backend.py +57 -7
sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
sglang/srt/layers/attention/vision.py +58 -0
sglang/srt/layers/attention/wave_backend.py +4 -4
sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
sglang/srt/layers/communicator.py +8 -0
sglang/srt/layers/dp_attention.py +41 -2
sglang/srt/layers/elementwise.py +3 -1
sglang/srt/layers/layernorm.py +34 -15
sglang/srt/layers/linear.py +55 -7
sglang/srt/layers/logits_processor.py +180 -18
sglang/srt/layers/modelopt_utils.py +11 -0
sglang/srt/layers/moe/__init__.py +2 -1
sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
sglang/srt/layers/moe/ep_moe/layer.py +248 -333
sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
sglang/srt/layers/moe/fused_moe_native.py +5 -3
sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
sglang/srt/layers/moe/moe_runner/base.py +274 -1
sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
sglang/srt/layers/moe/moe_runner/runner.py +83 -0
sglang/srt/layers/moe/moe_runner/triton.py +448 -0
sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
sglang/srt/layers/moe/topk.py +30 -9
sglang/srt/layers/moe/utils.py +29 -7
sglang/srt/layers/parameter.py +23 -6
sglang/srt/layers/quantization/__init__.py +1 -1
sglang/srt/layers/quantization/awq.py +19 -7
sglang/srt/layers/quantization/base_config.py +11 -6
sglang/srt/layers/quantization/blockwise_int8.py +38 -27
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
sglang/srt/layers/quantization/fp8.py +155 -60
sglang/srt/layers/quantization/fp8_utils.py +51 -32
sglang/srt/layers/quantization/gptq.py +25 -17
sglang/srt/layers/quantization/modelopt_quant.py +191 -56
sglang/srt/layers/quantization/moe_wna16.py +21 -18
sglang/srt/layers/quantization/mxfp4.py +74 -42
sglang/srt/layers/quantization/quark/quark.py +3 -1
sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
sglang/srt/layers/quantization/unquant.py +135 -47
sglang/srt/layers/quantization/w4afp8.py +28 -33
sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
sglang/srt/layers/quantization/w8a8_int8.py +91 -41
sglang/srt/layers/rotary_embedding.py +78 -31
sglang/srt/layers/sampler.py +213 -21
sglang/srt/layers/utils.py +23 -0
sglang/srt/lora/backend/base_backend.py +50 -8
sglang/srt/lora/backend/chunked_backend.py +348 -0
sglang/srt/lora/backend/triton_backend.py +99 -5
sglang/srt/lora/layers.py +32 -0
sglang/srt/lora/lora.py +8 -3
sglang/srt/lora/lora_manager.py +44 -118
sglang/srt/lora/mem_pool.py +25 -11
sglang/srt/lora/triton_ops/__init__.py +4 -0
sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
sglang/srt/lora/utils.py +22 -11
sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
sglang/srt/managers/cache_controller.py +199 -301
sglang/srt/managers/data_parallel_controller.py +115 -80
sglang/srt/managers/detokenizer_manager.py +19 -15
sglang/srt/managers/disagg_service.py +46 -0
sglang/srt/managers/io_struct.py +340 -109
sglang/srt/managers/mm_utils.py +44 -6
sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
sglang/srt/managers/multimodal_processor.py +1 -2
sglang/srt/managers/overlap_utils.py +55 -0
sglang/srt/managers/schedule_batch.py +343 -212
sglang/srt/managers/schedule_policy.py +145 -18
sglang/srt/managers/scheduler.py +653 -273
sglang/srt/managers/scheduler_input_blocker.py +1 -1
sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
sglang/srt/managers/tokenizer_manager.py +579 -674
sglang/srt/managers/tp_worker.py +96 -26
sglang/srt/managers/utils.py +1 -45
sglang/srt/mem_cache/allocator.py +21 -22
sglang/srt/mem_cache/allocator_ascend.py +41 -27
sglang/srt/mem_cache/base_prefix_cache.py +1 -1
sglang/srt/mem_cache/chunk_cache.py +9 -2
sglang/srt/mem_cache/evict_policy.py +23 -0
sglang/srt/mem_cache/hicache_storage.py +43 -24
sglang/srt/mem_cache/hiradix_cache.py +222 -75
sglang/srt/mem_cache/memory_pool.py +651 -80
sglang/srt/mem_cache/memory_pool_host.py +239 -228
sglang/srt/mem_cache/radix_cache.py +227 -73
sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
sglang/srt/mem_cache/storage/__init__.py +10 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
sglang/srt/mem_cache/storage/backend_factory.py +223 -0
sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
sglang/srt/mem_cache/swa_radix_cache.py +93 -48
sglang/srt/metrics/collector.py +511 -132
sglang/srt/metrics/func_timer.py +2 -7
sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
sglang/srt/metrics/utils.py +8 -1
sglang/srt/model_executor/cpu_graph_runner.py +640 -0
sglang/srt/model_executor/cuda_graph_runner.py +52 -37
sglang/srt/model_executor/forward_batch_info.py +74 -46
sglang/srt/model_executor/model_runner.py +455 -176
sglang/srt/model_executor/npu_graph_runner.py +12 -5
sglang/srt/model_loader/__init__.py +10 -4
sglang/srt/model_loader/loader.py +319 -10
sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
sglang/srt/model_loader/weight_utils.py +161 -3
sglang/srt/models/apertus.py +686 -0
sglang/srt/models/bailing_moe.py +820 -217
sglang/srt/models/bailing_moe_nextn.py +168 -0
sglang/srt/models/deepseek_nextn.py +6 -1
sglang/srt/models/deepseek_v2.py +607 -130
sglang/srt/models/dots_ocr.py +173 -0
sglang/srt/models/dots_vlm.py +174 -0
sglang/srt/models/dots_vlm_vit.py +337 -0
sglang/srt/models/ernie4.py +1 -1
sglang/srt/models/falcon_h1.py +578 -0
sglang/srt/models/gemma3_causal.py +0 -2
sglang/srt/models/gemma3_mm.py +17 -1
sglang/srt/models/gemma3n_mm.py +2 -2
sglang/srt/models/glm4_moe.py +4 -4
sglang/srt/models/glm4_moe_nextn.py +2 -2
sglang/srt/models/glm4v.py +5 -3
sglang/srt/models/glm4v_moe.py +4 -1
sglang/srt/models/gpt_oss.py +8 -31
sglang/srt/models/grok.py +5 -13
sglang/srt/models/kimi_vl_moonvit.py +2 -2
sglang/srt/models/llama.py +4 -0
sglang/srt/models/llama4.py +9 -0
sglang/srt/models/llama_eagle3.py +13 -0
sglang/srt/models/longcat_flash.py +3 -3
sglang/srt/models/longcat_flash_nextn.py +1 -1
sglang/srt/models/mixtral.py +1 -3
sglang/srt/models/mllama4.py +50 -4
sglang/srt/models/nemotron_h.py +514 -0
sglang/srt/models/opt.py +637 -0
sglang/srt/models/qwen2_5_vl.py +29 -5
sglang/srt/models/qwen2_audio.py +1 -1
sglang/srt/models/qwen2_moe.py +120 -13
sglang/srt/models/qwen2_vl.py +1 -1
sglang/srt/models/qwen3.py +18 -3
sglang/srt/models/qwen3_moe.py +32 -4
sglang/srt/models/qwen3_next.py +1069 -0
sglang/srt/models/qwen3_next_mtp.py +112 -0
sglang/srt/models/qwen3_vl.py +787 -0
sglang/srt/models/qwen3_vl_moe.py +471 -0
sglang/srt/models/registry.py +15 -3
sglang/srt/models/sarashina2_vision.py +269 -0
sglang/srt/models/solar.py +505 -0
sglang/srt/models/starcoder2.py +357 -0
sglang/srt/models/step3_vl.py +1 -1
sglang/srt/models/torch_native_llama.py +9 -2
sglang/srt/models/utils.py +55 -0
sglang/srt/multimodal/processors/base_processor.py +15 -7
sglang/srt/multimodal/processors/dots_vlm.py +98 -0
sglang/srt/multimodal/processors/glm4v.py +9 -9
sglang/srt/multimodal/processors/internvl.py +153 -129
sglang/srt/multimodal/processors/qwen_vl.py +23 -6
sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
sglang/srt/offloader.py +27 -3
sglang/srt/parser/jinja_template_utils.py +6 -0
sglang/srt/sampling/sampling_batch_info.py +49 -26
sglang/srt/sampling/sampling_params.py +7 -0
sglang/srt/server_args.py +1051 -285
sglang/srt/server_args_config_parser.py +146 -0
sglang/srt/single_batch_overlap.py +151 -0
sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
sglang/srt/speculative/cpp_ngram/param.h +125 -0
sglang/srt/speculative/cpp_ngram/queue.h +71 -0
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
sglang/srt/speculative/eagle_worker.py +98 -29
sglang/srt/speculative/ngram_info.py +428 -0
sglang/srt/speculative/ngram_worker.py +246 -0
sglang/srt/speculative/spec_info.py +52 -0
sglang/srt/speculative/spec_utils.py +605 -0
sglang/srt/speculative/standalone_worker.py +109 -0
sglang/srt/torch_memory_saver_adapter.py +5 -7
sglang/srt/tracing/trace.py +578 -0
sglang/srt/two_batch_overlap.py +9 -5
sglang/srt/utils/__init__.py +2 -0
sglang/srt/{utils.py → utils/common.py} +451 -77
sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
sglang/srt/utils/rpd_utils.py +452 -0
sglang/srt/utils/slow_rank_detector.py +71 -0
sglang/srt/warmup.py +8 -4
sglang/srt/weight_sync/utils.py +2 -2
sglang/test/attention/test_trtllm_mla_backend.py +169 -5
sglang/test/get_logits_ut.py +57 -0
sglang/test/longbench_v2/__init__.py +1 -0
sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
sglang/test/run_eval.py +119 -11
sglang/test/runners.py +5 -1
sglang/test/simple_eval_common.py +5 -2
sglang/test/simple_eval_longbench_v2.py +332 -0
sglang/test/simple_eval_mmmu_vlm.py +441 -0
sglang/test/test_block_fp8.py +2 -2
sglang/test/test_cutlass_moe.py +24 -6
sglang/test/test_cutlass_w4a8_moe.py +9 -19
sglang/test/test_deterministic.py +313 -0
sglang/test/test_deterministic_utils.py +81 -0
sglang/test/test_disaggregation_utils.py +140 -0
sglang/test/test_fp4_moe.py +370 -1
sglang/test/test_programs.py +1 -1
sglang/test/test_utils.py +407 -8
sglang/utils.py +21 -1
sglang/version.py +1 -1
{sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
{sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
sglang/srt/disaggregation/launch_lb.py +0 -118
sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
sglang/srt/mem_cache/lora_radix_cache.py +0 -421
sglang/test/test_block_fp8_ep.py +0 -358
sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
{sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
{sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0

sglang/srt/entrypoints/openai/serving_responses.py CHANGED Viewed

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # Adapted from vLLM's OpenAIServingResponses
 """Handler for /v1/responses requests"""
+from __future__ import annotations
 import asyncio
 import copy
@@ -9,7 +10,7 @@ import logging
 import time
 from contextlib import AsyncExitStack
 from http import HTTPStatus
-from typing import Any, AsyncGenerator, AsyncIterator, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Optional, Union
 import jinja2
 import openai.types.responses as openai_responses_types
@@ -54,11 +55,13 @@ from sglang.srt.entrypoints.openai.protocol import (
 from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
 from sglang.srt.entrypoints.openai.tool_server import MCPToolServer, ToolServer
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.utils import random_uuid
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
 logger = logging.getLogger(__name__)
@@ -120,6 +123,39 @@ class OpenAIServingResponses(OpenAIServingChat):
         self.background_tasks: dict[str, asyncio.Task] = {}
+    # error helpers dedicated for v1/responses
+    def create_error_response(
+        self,
+        message: str,
+        err_type: str = "invalid_request_error",
+        status_code: int = 400,
+        param: Optional[str] = None,
+    ) -> ORJSONResponse:
+        nested_error = {
+            "message": message,
+            "type": err_type,
+            "param": param,
+            "code": status_code,
+        }
+        return ORJSONResponse(content={"error": nested_error}, status_code=status_code)
+    def create_streaming_error_response(
+        self,
+        message: str,
+        err_type: str = "BadRequestError",
+        status_code: int = 400,
+    ) -> str:
+        return json.dumps(
+            {
+                "error": {
+                    "message": message,
+                    "type": err_type,
+                    "param": None,
+                    "code": status_code,
+                }
+            }
+        )
     def _request_id_prefix(self) -> str:
         return "resp_"
@@ -242,6 +278,7 @@ class OpenAIServingResponses(OpenAIServingChat):
                         sampling_params=sampling_params,
                         stream=request.stream,
                         rid=request.request_id,
+                        extra_key=self._compute_extra_key(request),
                         background=request.background,
                     )
@@ -830,6 +867,13 @@ class OpenAIServingResponses(OpenAIServingChat):
         async for ctx in result_generator:
+            # Only process context objects that implement the `is_expecting_start()` method,
+            # which indicates they support per-turn streaming (e.g., StreamingHarmonyContext).
+            # Contexts without this method are skipped, as they do not represent a new turn
+            # or are not compatible with per-turn handling in the /v1/responses endpoint.
+            if not hasattr(ctx, "is_expecting_start"):
+                continue
             if ctx.is_expecting_start():
                 current_output_index += 1
                 sent_output_item_added = False
@@ -1247,6 +1291,7 @@ class OpenAIServingResponses(OpenAIServingChat):
                 sampling_params=sampling_params,
                 stream=adapted_request.stream,
                 rid=request_id,
+                extra_key=adapted_request.extra_key,
                 return_logprob=adapted_request.return_logprob,
                 logprob_start_len=adapted_request.logprob_start_len,
                 top_logprobs_num=adapted_request.top_logprobs_num,

sglang/srt/entrypoints/openai/serving_score.py CHANGED Viewed

@@ -25,6 +25,7 @@ class OpenAIServingScore(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: ScoringRequest,
+        raw_request: Request = None,
     ) -> tuple[ScoringRequest, ScoringRequest]:
         """Convert OpenAI scoring request to internal format"""
         # For scoring, we pass the request directly as the tokenizer_manager

sglang/srt/entrypoints/openai/serving_tokenize.py ADDED Viewed

@@ -0,0 +1,144 @@
+import logging
+from http import HTTPStatus
+from typing import List, Union
+from fastapi import Request
+from sglang.srt.entrypoints.openai.protocol import (
+    DetokenizeRequest,
+    DetokenizeResponse,
+    ErrorResponse,
+    TokenizeRequest,
+    TokenizeResponse,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+logger = logging.getLogger(__name__)
+class OpenAIServingTokenize(OpenAIServingBase):
+    """Handler for /v1/tokenize requests"""
+    def _request_id_prefix(self) -> str:
+        return "tok-"
+    def _convert_to_internal_request(
+        self, request: TokenizeRequest, raw_request: Request
+    ) -> tuple[TokenizeRequest, TokenizeRequest]:
+        return request, request
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: TokenizeRequest,
+        request: TokenizeRequest,
+        raw_request: Request,
+    ) -> Union[TokenizeResponse, ErrorResponse]:
+        try:
+            tokenizer = self.tokenizer_manager.tokenizer
+            max_model_len = getattr(tokenizer, "model_max_length", -1)
+            if isinstance(request.prompt, str):
+                token_ids = tokenizer.encode(
+                    request.prompt,
+                    add_special_tokens=request.add_special_tokens,
+                )
+                tokens = token_ids
+                count = len(token_ids)
+            elif isinstance(request.prompt, list):
+                token_ids_list = [
+                    tokenizer.encode(
+                        text, add_special_tokens=request.add_special_tokens
+                    )
+                    for text in request.prompt
+                ]
+                tokens = token_ids_list
+                count = [len(ids) for ids in token_ids_list]
+            else:
+                return self.create_error_response(
+                    f"Invalid prompt type: {type(request.prompt)}. Expected str or List[str]."
+                )
+            return TokenizeResponse(
+                tokens=tokens, count=count, max_model_len=max_model_len
+            )
+        except Exception as e:
+            logger.error("Error during tokenization", exc_info=True)
+            return self.create_error_response(
+                f"Internal server error during tokenization: {e}",
+                err_type="InternalServerError",
+                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+            )
+class OpenAIServingDetokenize(OpenAIServingBase):
+    """Handler for /v1/detokenize requests"""
+    def _request_id_prefix(self) -> str:
+        return "detok-"
+    def _convert_to_internal_request(
+        self, request: DetokenizeRequest, raw_request: Request
+    ) -> tuple[DetokenizeRequest, DetokenizeRequest]:
+        return request, request
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: DetokenizeRequest,
+        request: DetokenizeRequest,
+        raw_request: Request,
+    ) -> Union[DetokenizeResponse, ErrorResponse]:
+        try:
+            tokenizer = self.tokenizer_manager.tokenizer
+            if (
+                isinstance(request.tokens, list)
+                and request.tokens
+                and isinstance(request.tokens[0], int)
+            ):
+                if not all(isinstance(t, int) for t in request.tokens):
+                    return self.create_error_response(
+                        "Invalid input: 'tokens' must be a list of integers."
+                    )
+                tokens_to_decode = [int(t) for t in request.tokens]
+                text = tokenizer.decode(
+                    tokens_to_decode, skip_special_tokens=request.skip_special_tokens
+                )
+                text_out: Union[str, List[str]] = text
+            elif (
+                isinstance(request.tokens, list)
+                and request.tokens
+                and isinstance(request.tokens[0], list)
+            ):
+                texts: List[str] = []
+                for token_list in request.tokens:
+                    if not all(isinstance(t, int) for t in token_list):
+                        return self.create_error_response(
+                            f"Invalid input: Sublist in 'tokens' must contain only integers. Found: {token_list}"
+                        )
+                    decoded_text = tokenizer.decode(
+                        [int(t) for t in token_list],
+                        skip_special_tokens=request.skip_special_tokens,
+                    )
+                    texts.append(decoded_text)
+                text_out = texts
+            elif isinstance(request.tokens, list) and not request.tokens:
+                text_out = ""
+            else:
+                return self.create_error_response(
+                    f"Invalid tokens type: {type(request.tokens)}. Expected List[int] or List[List[int]]."
+                )
+            return DetokenizeResponse(text=text_out)
+        except Exception as e:
+            logger.error("Error during detokenization", exc_info=True)
+            if "decode" in str(e).lower():
+                return self.create_error_response(
+                    f"Error decoding tokens: {e}. Input tokens might be invalid for the model.",
+                    err_type="DecodeError",
+                    status_code=HTTPStatus.BAD_REQUEST,
+                )
+            return self.create_error_response(
+                f"Internal server error during detokenization: {e}",
+                err_type="InternalServerError",
+                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+            )

sglang/srt/environ.py ADDED Viewed

@@ -0,0 +1,289 @@
+import os
+import subprocess
+import warnings
+from contextlib import ExitStack, contextmanager
+from typing import Any
class EnvField:
    """Typed accessor for a single environment variable.

    Instances are declared as class attributes; ``__set_name__`` records the
    attribute name, and that name is used as the environment variable name.
    Subclasses implement :meth:`parse` to convert the raw string.

    Resolution order used by :meth:`get`:

    1. the field was explicitly ``set(None)``  -> ``None``
    2. the variable is absent from the process -> ``default``
    3. the variable is present                 -> ``parse(raw)``; if parsing
       raises ``ValueError`` a warning is emitted and ``default`` is returned
    """

    def __init__(self, default: Any):
        self.default = default
        # NOTE: we use None to indicate whether the value is set or not
        # If the value is manually set to None, we need mark it as _set_to_none.
        # Always use clear() to reset the value, which leads to the default fallback.
        self._set_to_none = False

    def __set_name__(self, owner, name):
        # Called by Python when the instance is bound as a class attribute.
        self.name = name

    def parse(self, value: str) -> Any:
        """Convert the raw environment string; subclasses must override."""
        raise NotImplementedError()

    def get(self) -> Any:
        """Return the effective value (see the class docstring for the order)."""
        value = os.getenv(self.name)
        if self._set_to_none:
            # set(None) removes the variable, so it must still be absent here.
            assert value is None
            return None
        if value is None:
            return self.default
        try:
            return self.parse(value)
        except ValueError as e:
            warnings.warn(
                f'Invalid value for {self.name}: {e}, using default "{self.default}"'
            )
            return self.default

    def is_set(self):
        """Return True if the variable exists or was explicitly set to None."""
        # NOTE: If None is manually set, it is considered as set.
        return self.name in os.environ or self._set_to_none

    def get_set_value_or(self, or_value: Any):
        """Return :meth:`get` when the field is set, otherwise ``or_value``."""
        # NOTE: Ugly usage, but only way to get custom default value.
        return self.get() if self.is_set() else or_value

    def set(self, value: Any):
        """Store ``str(value)`` in the environment, or mark the field as None."""
        if value is None:
            self._set_to_none = True
            os.environ.pop(self.name, None)
        else:
            self._set_to_none = False
            os.environ[self.name] = str(value)

    @contextmanager
    def override(self, value: Any):
        """Temporarily :meth:`set` ``value`` and restore the prior state on exit.

        The previous environment entry and the explicit-None flag are restored
        even when the ``with`` body raises, so a failing test cannot leak an
        overridden value into later code.
        """
        backup_present = self.name in os.environ
        backup_value = os.environ.get(self.name)
        backup_set_to_none = self._set_to_none
        self.set(value)
        try:
            yield
        finally:
            if backup_present:
                os.environ[self.name] = backup_value
            else:
                os.environ.pop(self.name, None)
            self._set_to_none = backup_set_to_none

    def clear(self):
        """Remove the variable and the explicit-None mark (back to default)."""
        os.environ.pop(self.name, None)
        self._set_to_none = False

    @property
    def value(self):
        """Shorthand for :meth:`get`."""
        return self.get()
class EnvStr(EnvField):
    """String field: the environment text is the value, with no conversion."""

    def parse(self, value: str) -> str:
        """Return the raw environment string unchanged."""
        raw_text = value
        return raw_text
class EnvBool(EnvField):
    """Boolean field parsed case-insensitively from a fixed set of spellings."""

    def parse(self, value: str) -> bool:
        """Map true/1/yes/y to True and false/0/no/n to False.

        Raises:
            ValueError: for any other text; the message quotes the
                lower-cased input.
        """
        spellings = {
            "true": True,
            "1": True,
            "yes": True,
            "y": True,
            "false": False,
            "0": False,
            "no": False,
            "n": False,
        }
        lowered = value.lower()
        if lowered not in spellings:
            raise ValueError(f'"{lowered}" is not a valid boolean value')
        return spellings[lowered]
class EnvInt(EnvField):
    """Integer field parsed with the built-in ``int``."""

    def parse(self, value: str) -> int:
        """Return ``int(value)``.

        Raises:
            ValueError: with a message quoting the input when it is not an
                integer literal.
        """
        try:
            number = int(value)
        except ValueError:
            raise ValueError(f'"{value}" is not a valid integer value')
        else:
            return number
class EnvFloat(EnvField):
    """Floating-point field parsed with the built-in ``float``."""

    def parse(self, value: str) -> float:
        """Return ``float(value)``.

        Raises:
            ValueError: with a message quoting the input when it cannot be
                converted.
        """
        try:
            number = float(value)
        except ValueError:
            raise ValueError(f'"{value}" is not a valid float value')
        else:
            return number
+class Envs:  # Table of typed env-var fields: each attribute name is the variable name EnvField.get() looks up via os.getenv.
+    # fmt: off
+    # Model & File Download
+    SGLANG_USE_MODELSCOPE = EnvBool(False)
+    # Test & Debug
+    SGLANG_IS_IN_CI = EnvBool(False)
+    SGLANG_AMD_CI = EnvBool(False)
+    SGLANG_TEST_RETRACT = EnvBool(False)
+    SGLANG_SET_CPU_AFFINITY = EnvBool(False)
+    SGLANG_PROFILE_WITH_STACK = EnvBool(True)
+    SGLANG_RECORD_STEP_TIME = EnvBool(False)
+    SGLANG_GC_LOG = EnvBool(False)
+    SGLANG_FORCE_SHUTDOWN = EnvBool(False)
+    SGLANG_DEBUG_MEMORY_POOL = EnvBool(False)
+    SGLANG_TEST_REQUEST_TIME_STATS = EnvBool(False)
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK = EnvBool(False)  # NOTE(review): "INBALANCE" spelling is part of the variable name users export; do not "fix" it here.
+    SGLANG_DISABLE_REQUEST_LOGGING = EnvBool(False)
+    SGLANG_SIMULATE_ACC_LEN = EnvFloat(-1)  # presumably -1 means "simulation disabled" — TODO confirm against the consumer
+    SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial")
+    SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp")
+    # Test: pd-disaggregation
+    SGLANG_TEST_PD_DISAGG_BACKEND = EnvStr("mooncake")
+    SGLANG_TEST_PD_DISAGG_DEVICES = EnvStr(None)  # default None: .value is None while the variable is unset
+    # Model Parallel
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER = EnvBool(True)
+    # Constrained Decoding
+    SGLANG_DISABLE_OUTLINES_DISK_CACHE = EnvBool(True)
+    SGLANG_GRAMMAR_TIMEOUT = EnvFloat(300)  # unit not visible here; presumably seconds — TODO confirm
+    # Hi-Cache
+    SGLANG_HICACHE_HF3FS_CONFIG_PATH = EnvStr(None)  # default None: .value is None while the variable is unset
+    # Mooncake KV Transfer
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL = EnvBool(False)
+    ENABLE_ASCEND_TRANSFER_WITH_MOONCAKE = EnvBool(False)  # NOTE: this variable has no SGLANG_ prefix
+    # AMD & ROCm
+    SGLANG_USE_AITER = EnvBool(False)
+    SGLANG_ROCM_FUSED_DECODE_MLA = EnvBool(False)
+    # Quantization
+    SGLANG_INT4_WEIGHT = EnvBool(False)
+    SGLANG_CPU_QUANTIZATION = EnvBool(False)
+    SGLANG_USE_DYNAMIC_MXFP4_LINEAR = EnvBool(False)
+    SGLANG_FORCE_FP8_MARLIN = EnvBool(False)
+    # Flashinfer
+    SGLANG_IS_FLASHINFER_AVAILABLE = EnvBool(True)
+    SGLANG_ENABLE_FLASHINFER_GEMM = EnvBool(False)
+    # Triton
+    SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS = EnvBool(False)
+    # Torch Compile
+    SGLANG_ENABLE_TORCH_COMPILE = EnvBool(False)
+    # EPLB
+    SGLANG_EXPERT_LOCATION_UPDATER_LOG_INPUT = EnvBool(False)
+    SGLANG_EXPERT_LOCATION_UPDATER_CANARY = EnvBool(False)
+    SGLANG_EXPERT_LOCATION_UPDATER_LOG_METRICS = EnvBool(False)
+    SGLANG_LOG_EXPERT_LOCATION_METADATA = EnvBool(False)
+    # TBO
+    SGLANG_TBO_DEBUG = EnvBool(False)
+    # DeepGemm
+    SGLANG_ENABLE_JIT_DEEPGEMM = EnvBool(True)
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE = EnvBool(True)
+    SGLANG_JIT_DEEPGEMM_COMPILE_WORKERS = EnvInt(4)
+    SGLANG_IN_DEEPGEMM_PRECOMPILE_STAGE = EnvBool(False)
+    SGLANG_DG_CACHE_DIR = EnvStr(os.path.expanduser("~/.cache/deep_gemm"))  # "~" is expanded once, when this class body is evaluated
+    SGLANG_DG_USE_NVRTC = EnvBool(False)
+    SGLANG_USE_DEEPGEMM_BMM = EnvBool(False)
+    # sgl-kernel
+    SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK = EnvBool(False)
+    # vLLM dependencies (NOTE(review): flags after USE_TRITON_W8A8_FP8_KERNEL are not vLLM-named; this grouping may be stale — confirm)
+    USE_VLLM_CUSTOM_ALLREDUCE = EnvBool(False)
+    USE_VLLM_CUTLASS_W8A8_FP8_KERNEL = EnvBool(False)
+    USE_TRITON_W8A8_FP8_KERNEL = EnvBool(False)
+    RETURN_ORIGINAL_LOGPROB = EnvBool(False)
+    SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN = EnvBool(False)
+    SGLANG_MOE_PADDING = EnvBool(False)
+    SGLANG_CUTLASS_MOE = EnvBool(False)
+    HF_HUB_DISABLE_XET = EnvBool(False)
+    DISABLE_OPENAPI_DOC = EnvBool(False)
+    SGLANG_ENABLE_TORCH_INFERENCE_MODE = EnvBool(False)
+    SGLANG_IS_FIRST_RANK_ON_NODE = EnvBool(True)
+    SGLANG_SUPPORT_CUTLASS_BLOCK_FP8 = EnvBool(False)
+    SGLANG_SYNC_TOKEN_IDS_ACROSS_TP = EnvBool(False)
+    SGLANG_ENABLE_COLOCATED_BATCH_GEN = EnvBool(False)
+    # Deterministic inference
+    SGLANG_ENABLE_DETERMINISTIC_INFERENCE = EnvBool(False)
+    SGLANG_FLASHINFER_PREFILL_SPLIT_TILE_SIZE = EnvInt(4096)
+    SGLANG_FLASHINFER_DECODE_SPLIT_TILE_SIZE = EnvInt(2048)
+    SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZE = EnvInt(4096)
+    SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)
+    # fmt: on
+envs = Envs()  # Shared module-level instance; fields are read as envs.<NAME>.value or envs.<NAME>.get()
def _convert_SGL_to_SGLANG():
    """Mirror deprecated ``SGL_*`` variables onto their ``SGLANG_*`` names.

    For every environment variable whose name starts with ``SGL_`` a warning
    is emitted and the value is copied to the same name with the prefix
    replaced by ``SGLANG_``. An existing ``SGLANG_*`` entry is overwritten,
    and the deprecated variable itself is left in place.
    """
    old_prefix = "SGL_"
    new_prefix = "SGLANG_"
    # Iterate over a snapshot: the loop inserts keys into os.environ, and the
    # result must not depend on how the live mapping behaves under mutation.
    for key, value in list(os.environ.items()):
        if not key.startswith(old_prefix):
            continue
        new_key = new_prefix + key[len(old_prefix):]
        warnings.warn(
            f"Environment variable {key} is deprecated, please use {new_key}"
        )
        os.environ[new_key] = value


# Runs once at import time so later reads already see the SGLANG_* names.
_convert_SGL_to_SGLANG()
def example_with_exit_stack():
    """Show an override entered and left through an explicit ``ExitStack``."""
    # Use this style of context manager in unit test
    field = envs.SGLANG_TEST_RETRACT
    stack = ExitStack()
    stack.enter_context(field.override(False))
    assert field.value is False
    stack.close()
    # NOTE: override() restores whatever state existed on entry, so this only
    # holds when the field had been explicitly set to None before the call.
    assert field.value is None
def example_with_subprocess():
    """Show that an override is inherited by child processes while active.

    Inside the ``with`` block the child sees the overridden value; after the
    block the variable is restored to its prior state. The final assertion
    expects the variable to be absent from the environment at that point.
    """
    command = ["python", "-c", "import os; print(os.getenv('SGLANG_TEST_RETRACT'))"]

    def _child_output() -> str:
        # subprocess.run drains both pipes, waits for the child and closes the
        # pipes, so no process or file descriptor is left behind.
        completed = subprocess.run(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        return completed.stdout.decode("utf-8").strip()

    with envs.SGLANG_TEST_RETRACT.override(True):
        output = _child_output()
        assert output == "True"
    output = _child_output()
    assert output == "None"
def examples():
    """Walk through the field API on ``SGLANG_TEST_RETRACT``.

    The steps are order-dependent: each one relies on the state left behind
    by the previous one.
    """
    # Example usage for envs
    field = envs.SGLANG_TEST_RETRACT

    # clear() falls back to the declared default.
    field.clear()
    assert field.value is False

    # An explicit None counts as "set" and is returned as None.
    field.set(None)
    assert field.is_set()
    assert field.value is None

    field.clear()
    assert not field.is_set()

    field.set(True)
    assert field.value is True

    # Overriding with None is visible inside the block only.
    with field.override(None):
        assert field.is_set()
        assert field.value is None
    assert field.value is True

    # Overriding an explicit None restores the explicit None afterwards.
    field.set(None)
    with field.override(True):
        assert field.value is True
    assert field.is_set()
    assert field.value is None

    example_with_exit_stack()
    example_with_subprocess()


if __name__ == "__main__":
    examples()

sglang/srt/eplb/eplb_manager.py CHANGED Viewed

@@ -55,7 +55,7 @@ class EPLBManager:
         enable_timing = self._rebalance_layers_per_chunk is None
         if enable_timing:
-            torch.cuda.synchronize()
+            torch.get_device_module().synchronize()
             time_start = time.time()
         dump_record_output = get_global_expert_distribution_recorder().dump_record(
@@ -85,7 +85,7 @@ class EPLBManager:
         msg = f"[EPLBManager] rebalance end"
         if enable_timing:
-            torch.cuda.synchronize()
+            torch.get_device_module().synchronize()
             time_end = time.time()
             msg += f" time={time_end - time_start:.3f}s"
         logger.info(msg)

sglang/srt/eplb/expert_distribution.py CHANGED Viewed

@@ -11,6 +11,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+from __future__ import annotations
 import logging
 import math
 import os
@@ -19,16 +22,20 @@ from abc import ABC
 from collections import deque
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Any, Dict, List, Literal, Optional, Tuple, Type
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Type
 import einops
 import torch
 import torch.distributed
-from sglang.srt.eplb.expert_location import ExpertLocationMetadata
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import Withable, get_bool_env_var
+from sglang.srt.utils import Withable, get_bool_env_var, is_npu
+_is_npu = is_npu()
+if TYPE_CHECKING:
+    from sglang.srt.eplb.expert_location import ExpertLocationMetadata
 logger = logging.getLogger(__name__)
@@ -43,7 +50,7 @@ class ExpertDistributionRecorder(ABC):
     @staticmethod
     def init_new(
         server_args: ServerArgs,
-        expert_location_metadata: "ExpertLocationMetadata",
+        expert_location_metadata: ExpertLocationMetadata,
         rank: int,
     ):
         if server_args.expert_distribution_recorder_mode is not None:
@@ -118,7 +125,7 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
     def __init__(
         self,
         server_args: ServerArgs,
-        expert_location_metadata: "ExpertLocationMetadata",
+        expert_location_metadata: ExpertLocationMetadata,
         rank: int,
     ):
         self._server_args = server_args
@@ -211,7 +218,9 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
     def _on_hook(self, hook_name: str, **kwargs):
         if self._disable_all:
             return
-        if not (self._recording or torch.cuda.is_current_stream_capturing()):
+        if not (
+            self._recording or torch.get_device_module().is_current_stream_capturing()
+        ):
             return
         gatherer = self._single_pass_gatherers[
             self._accumulator.get_single_pass_gatherer_key(
@@ -279,7 +288,7 @@ class _SinglePassGatherer(ABC):
     @staticmethod
     def init_new(
         server_args: ServerArgs,
-        expert_location_metadata: "ExpertLocationMetadata",
+        expert_location_metadata: ExpertLocationMetadata,
         rank: int,
     ) -> "_SinglePassGatherer":
         if server_args.expert_distribution_recorder_mode == "per_token":
@@ -307,7 +316,7 @@ class _SinglePassGatherer(ABC):
         return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
-    def __init__(self, expert_location_metadata: "ExpertLocationMetadata", rank: int):
+    def __init__(self, expert_location_metadata: ExpertLocationMetadata, rank: int):
         self._expert_location_metadata = expert_location_metadata
         self._rank = rank
@@ -346,7 +355,7 @@ class _DetailSinglePassGatherer(_SinglePassGatherer):
     def __init__(
         self,
         server_args: ServerArgs,
-        expert_location_metadata: "ExpertLocationMetadata",
+        expert_location_metadata: ExpertLocationMetadata,
         rank: int,
     ):
         super().__init__(expert_location_metadata, rank)
@@ -446,6 +455,10 @@ def _list_sum(a: List, b: List) -> List:
 class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
     def __init__(self, *args, enable_global_physical_experts: bool, **kwargs):
         super().__init__(*args, **kwargs)
+        if not _is_npu:
+            device = "cuda"
+        else:
+            device = "npu"
         self._enable_global_physical_experts = enable_global_physical_experts
         self._data = torch.zeros(
             (
@@ -457,7 +470,7 @@ class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
                 ),
             ),
             dtype=torch.int,
-            device="cuda",
+            device=device,
         )
     def reset(self):
@@ -561,7 +574,7 @@ class _Accumulator(ABC):
     @staticmethod
     def init_new(
         server_args: ServerArgs,
-        expert_location_metadata: "ExpertLocationMetadata",
+        expert_location_metadata: ExpertLocationMetadata,
         rank: int,
     ) -> "_Accumulator":
         return _Accumulator.get_class(server_args)(
@@ -580,7 +593,7 @@ class _Accumulator(ABC):
     def __init__(
         self,
         server_args: ServerArgs,
-        expert_location_metadata: "ExpertLocationMetadata",
+        expert_location_metadata: ExpertLocationMetadata,
         rank: int,
     ):
         self._server_args = server_args
@@ -779,7 +792,7 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):
         if self._first_dump:
             self._first_dump = False
-            torch.cuda.empty_cache()
+            torch.get_device_module().empty_cache()
         torch.distributed.all_reduce(
             logical_count_of_buffered_step, op=torch.distributed.ReduceOp.SUM

sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

sglang 0.5.2rc2py3-none-any.whl → 0.5.3.post1py3-none-any.whl