PyPI - sglang - Versions diffs - 0.5.2rc2__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl - Mend

sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (377) hide show

sglang/bench_one_batch.py +7 -9
sglang/bench_one_batch_server.py +330 -31
sglang/bench_serving.py +267 -32
sglang/global_config.py +2 -2
sglang/lang/backend/runtime_endpoint.py +1 -1
sglang/launch_server.py +14 -0
sglang/profiler.py +2 -2
sglang/srt/batch_invariant_ops/__init__.py +27 -0
sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
sglang/srt/configs/__init__.py +8 -0
sglang/srt/configs/device_config.py +3 -1
sglang/srt/configs/dots_ocr.py +64 -0
sglang/srt/configs/dots_vlm.py +139 -0
sglang/srt/configs/falcon_h1.py +360 -0
sglang/srt/configs/load_config.py +9 -0
sglang/srt/configs/model_config.py +181 -82
sglang/srt/configs/qwen3_next.py +326 -0
sglang/srt/configs/qwen3_vl.py +586 -0
sglang/srt/connector/__init__.py +8 -1
sglang/srt/connector/remote_instance.py +82 -0
sglang/srt/constrained/base_grammar_backend.py +49 -12
sglang/srt/constrained/llguidance_backend.py +0 -1
sglang/srt/constrained/outlines_backend.py +0 -1
sglang/srt/constrained/outlines_jump_forward.py +1 -1
sglang/srt/constrained/xgrammar_backend.py +30 -9
sglang/srt/custom_op.py +11 -1
sglang/srt/debug_utils/dump_comparator.py +81 -44
sglang/srt/debug_utils/dump_loader.py +97 -0
sglang/srt/debug_utils/dumper.py +21 -6
sglang/srt/debug_utils/text_comparator.py +73 -11
sglang/srt/disaggregation/ascend/conn.py +2 -2
sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
sglang/srt/disaggregation/base/conn.py +1 -1
sglang/srt/disaggregation/common/conn.py +279 -108
sglang/srt/disaggregation/decode.py +71 -19
sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
sglang/srt/disaggregation/fake/conn.py +1 -1
sglang/srt/disaggregation/mini_lb.py +6 -445
sglang/srt/disaggregation/mooncake/conn.py +55 -537
sglang/srt/disaggregation/nixl/conn.py +326 -53
sglang/srt/disaggregation/prefill.py +36 -17
sglang/srt/disaggregation/utils.py +40 -54
sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
sglang/srt/distributed/parallel_state.py +156 -80
sglang/srt/entrypoints/engine.py +59 -18
sglang/srt/entrypoints/grpc_request_manager.py +855 -0
sglang/srt/entrypoints/grpc_server.py +810 -0
sglang/srt/entrypoints/http_server.py +130 -59
sglang/srt/entrypoints/openai/protocol.py +112 -4
sglang/srt/entrypoints/openai/serving_base.py +65 -3
sglang/srt/entrypoints/openai/serving_chat.py +204 -55
sglang/srt/entrypoints/openai/serving_completions.py +14 -3
sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
sglang/srt/entrypoints/openai/serving_responses.py +48 -3
sglang/srt/entrypoints/openai/serving_score.py +1 -0
sglang/srt/environ.py +285 -0
sglang/srt/eplb/eplb_manager.py +2 -2
sglang/srt/eplb/expert_distribution.py +26 -13
sglang/srt/eplb/expert_location.py +38 -8
sglang/srt/eplb/expert_location_updater.py +1 -1
sglang/srt/function_call/base_format_detector.py +3 -6
sglang/srt/function_call/ebnf_composer.py +11 -9
sglang/srt/function_call/function_call_parser.py +9 -2
sglang/srt/function_call/glm4_moe_detector.py +4 -4
sglang/srt/function_call/gpt_oss_detector.py +23 -0
sglang/srt/function_call/json_array_parser.py +63 -0
sglang/srt/function_call/kimik2_detector.py +17 -4
sglang/srt/function_call/qwen3_coder_detector.py +1 -1
sglang/srt/function_call/utils.py +96 -5
sglang/srt/grpc/__init__.py +1 -0
sglang/srt/grpc/compile_proto.py +245 -0
sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
sglang/srt/layers/activation.py +143 -9
sglang/srt/layers/attention/aiter_backend.py +14 -15
sglang/srt/layers/attention/ascend_backend.py +115 -9
sglang/srt/layers/attention/attention_registry.py +206 -0
sglang/srt/layers/attention/base_attn_backend.py +12 -3
sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
sglang/srt/layers/attention/fla/chunk.py +242 -0
sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
sglang/srt/layers/attention/fla/chunk_o.py +178 -0
sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
sglang/srt/layers/attention/fla/cumsum.py +300 -0
sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
sglang/srt/layers/attention/fla/index.py +37 -0
sglang/srt/layers/attention/fla/l2norm.py +150 -0
sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
sglang/srt/layers/attention/fla/op.py +66 -0
sglang/srt/layers/attention/fla/solve_tril.py +465 -0
sglang/srt/layers/attention/fla/utils.py +331 -0
sglang/srt/layers/attention/fla/wy_fast.py +158 -0
sglang/srt/layers/attention/flashattention_backend.py +41 -8
sglang/srt/layers/attention/flashinfer_backend.py +118 -198
sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
sglang/srt/layers/attention/flashmla_backend.py +7 -5
sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
sglang/srt/layers/attention/intel_amx_backend.py +3 -0
sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
sglang/srt/layers/attention/mamba/mamba.py +629 -0
sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
sglang/srt/layers/attention/nsa/transform_index.py +144 -0
sglang/srt/layers/attention/nsa/utils.py +24 -0
sglang/srt/layers/attention/nsa_backend.py +887 -0
sglang/srt/layers/attention/tbo_backend.py +6 -6
sglang/srt/layers/attention/torch_flex_backend.py +325 -0
sglang/srt/layers/attention/torch_native_backend.py +12 -6
sglang/srt/layers/attention/triton_backend.py +57 -7
sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
sglang/srt/layers/attention/vision.py +58 -0
sglang/srt/layers/attention/wave_backend.py +4 -4
sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
sglang/srt/layers/communicator.py +8 -0
sglang/srt/layers/dp_attention.py +41 -2
sglang/srt/layers/elementwise.py +3 -1
sglang/srt/layers/layernorm.py +34 -15
sglang/srt/layers/linear.py +55 -7
sglang/srt/layers/logits_processor.py +44 -12
sglang/srt/layers/moe/__init__.py +2 -1
sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
sglang/srt/layers/moe/ep_moe/layer.py +256 -63
sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
sglang/srt/layers/moe/fused_moe_native.py +5 -3
sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
sglang/srt/layers/moe/moe_runner/base.py +274 -1
sglang/srt/layers/moe/moe_runner/runner.py +80 -0
sglang/srt/layers/moe/moe_runner/triton.py +448 -0
sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
sglang/srt/layers/moe/topk.py +30 -9
sglang/srt/layers/moe/utils.py +22 -6
sglang/srt/layers/parameter.py +23 -6
sglang/srt/layers/quantization/awq.py +19 -7
sglang/srt/layers/quantization/base_config.py +11 -6
sglang/srt/layers/quantization/blockwise_int8.py +38 -27
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
sglang/srt/layers/quantization/fp8.py +78 -49
sglang/srt/layers/quantization/fp8_utils.py +51 -32
sglang/srt/layers/quantization/gptq.py +25 -17
sglang/srt/layers/quantization/modelopt_quant.py +190 -55
sglang/srt/layers/quantization/moe_wna16.py +21 -18
sglang/srt/layers/quantization/mxfp4.py +74 -42
sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
sglang/srt/layers/quantization/unquant.py +135 -47
sglang/srt/layers/quantization/w4afp8.py +26 -17
sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
sglang/srt/layers/quantization/w8a8_int8.py +91 -41
sglang/srt/layers/rotary_embedding.py +78 -31
sglang/srt/layers/sampler.py +213 -21
sglang/srt/layers/utils.py +23 -0
sglang/srt/lora/backend/base_backend.py +50 -8
sglang/srt/lora/backend/chunked_backend.py +348 -0
sglang/srt/lora/backend/triton_backend.py +99 -5
sglang/srt/lora/layers.py +32 -0
sglang/srt/lora/lora.py +8 -3
sglang/srt/lora/lora_manager.py +52 -118
sglang/srt/lora/mem_pool.py +25 -11
sglang/srt/lora/triton_ops/__init__.py +4 -0
sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
sglang/srt/lora/utils.py +22 -11
sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
sglang/srt/managers/cache_controller.py +199 -301
sglang/srt/managers/data_parallel_controller.py +115 -80
sglang/srt/managers/detokenizer_manager.py +19 -15
sglang/srt/managers/disagg_service.py +46 -0
sglang/srt/managers/io_struct.py +340 -109
sglang/srt/managers/mm_utils.py +44 -6
sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
sglang/srt/managers/multimodal_processor.py +1 -2
sglang/srt/managers/overlap_utils.py +53 -0
sglang/srt/managers/schedule_batch.py +240 -138
sglang/srt/managers/schedule_policy.py +144 -17
sglang/srt/managers/scheduler.py +502 -209
sglang/srt/managers/scheduler_input_blocker.py +1 -1
sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
sglang/srt/managers/tokenizer_manager.py +320 -632
sglang/srt/managers/tp_worker.py +81 -22
sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
sglang/srt/managers/utils.py +1 -45
sglang/srt/mem_cache/allocator.py +14 -20
sglang/srt/mem_cache/allocator_ascend.py +41 -27
sglang/srt/mem_cache/base_prefix_cache.py +1 -1
sglang/srt/mem_cache/chunk_cache.py +8 -1
sglang/srt/mem_cache/evict_policy.py +23 -0
sglang/srt/mem_cache/hicache_storage.py +43 -24
sglang/srt/mem_cache/hiradix_cache.py +222 -75
sglang/srt/mem_cache/memory_pool.py +535 -58
sglang/srt/mem_cache/memory_pool_host.py +239 -228
sglang/srt/mem_cache/radix_cache.py +222 -73
sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
sglang/srt/mem_cache/storage/__init__.py +10 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
sglang/srt/mem_cache/storage/backend_factory.py +223 -0
sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
sglang/srt/mem_cache/swa_radix_cache.py +25 -36
sglang/srt/metrics/collector.py +511 -132
sglang/srt/metrics/func_timer.py +2 -7
sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
sglang/srt/metrics/utils.py +8 -1
sglang/srt/model_executor/cpu_graph_runner.py +640 -0
sglang/srt/model_executor/cuda_graph_runner.py +52 -37
sglang/srt/model_executor/forward_batch_info.py +82 -40
sglang/srt/model_executor/model_runner.py +432 -157
sglang/srt/model_executor/npu_graph_runner.py +12 -5
sglang/srt/model_loader/__init__.py +9 -3
sglang/srt/model_loader/loader.py +133 -5
sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
sglang/srt/model_loader/weight_utils.py +158 -3
sglang/srt/models/apertus.py +686 -0
sglang/srt/models/bailing_moe.py +820 -217
sglang/srt/models/bailing_moe_nextn.py +168 -0
sglang/srt/models/deepseek_nextn.py +6 -1
sglang/srt/models/deepseek_v2.py +607 -130
sglang/srt/models/dots_ocr.py +173 -0
sglang/srt/models/dots_vlm.py +174 -0
sglang/srt/models/dots_vlm_vit.py +337 -0
sglang/srt/models/ernie4.py +1 -1
sglang/srt/models/falcon_h1.py +576 -0
sglang/srt/models/gemma3_causal.py +0 -2
sglang/srt/models/gemma3_mm.py +1 -1
sglang/srt/models/gemma3n_mm.py +2 -2
sglang/srt/models/glm4_moe.py +4 -4
sglang/srt/models/glm4_moe_nextn.py +2 -2
sglang/srt/models/glm4v.py +5 -3
sglang/srt/models/glm4v_moe.py +4 -1
sglang/srt/models/gpt_oss.py +8 -31
sglang/srt/models/kimi_vl_moonvit.py +2 -2
sglang/srt/models/llama.py +4 -0
sglang/srt/models/llama4.py +9 -0
sglang/srt/models/llama_eagle3.py +13 -0
sglang/srt/models/longcat_flash.py +3 -3
sglang/srt/models/longcat_flash_nextn.py +1 -1
sglang/srt/models/mllama4.py +40 -4
sglang/srt/models/opt.py +637 -0
sglang/srt/models/qwen2_5_vl.py +29 -5
sglang/srt/models/qwen2_audio.py +1 -1
sglang/srt/models/qwen2_moe.py +120 -13
sglang/srt/models/qwen2_vl.py +1 -1
sglang/srt/models/qwen3.py +18 -3
sglang/srt/models/qwen3_moe.py +32 -4
sglang/srt/models/qwen3_next.py +1069 -0
sglang/srt/models/qwen3_next_mtp.py +112 -0
sglang/srt/models/qwen3_vl.py +787 -0
sglang/srt/models/qwen3_vl_moe.py +471 -0
sglang/srt/models/registry.py +15 -3
sglang/srt/models/sarashina2_vision.py +269 -0
sglang/srt/models/solar.py +505 -0
sglang/srt/models/starcoder2.py +357 -0
sglang/srt/models/step3_vl.py +1 -1
sglang/srt/models/torch_native_llama.py +9 -2
sglang/srt/models/utils.py +51 -0
sglang/srt/multimodal/processors/base_processor.py +15 -7
sglang/srt/multimodal/processors/dots_vlm.py +98 -0
sglang/srt/multimodal/processors/glm4v.py +9 -9
sglang/srt/multimodal/processors/internvl.py +153 -129
sglang/srt/multimodal/processors/qwen_vl.py +23 -6
sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
sglang/srt/offloader.py +27 -3
sglang/srt/parser/jinja_template_utils.py +6 -0
sglang/srt/sampling/sampling_batch_info.py +38 -17
sglang/srt/sampling/sampling_params.py +7 -0
sglang/srt/server_args.py +966 -267
sglang/srt/server_args_config_parser.py +146 -0
sglang/srt/single_batch_overlap.py +151 -0
sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
sglang/srt/speculative/cpp_ngram/param.h +125 -0
sglang/srt/speculative/cpp_ngram/queue.h +71 -0
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
sglang/srt/speculative/eagle_worker.py +99 -28
sglang/srt/speculative/ngram_utils.py +428 -0
sglang/srt/speculative/ngram_worker.py +245 -0
sglang/srt/speculative/spec_info.py +52 -0
sglang/srt/speculative/spec_utils.py +606 -0
sglang/srt/speculative/standalone_worker.py +109 -0
sglang/srt/torch_memory_saver_adapter.py +5 -7
sglang/srt/tracing/trace.py +578 -0
sglang/srt/two_batch_overlap.py +8 -5
sglang/srt/utils/__init__.py +2 -0
sglang/srt/{utils.py → utils/common.py} +433 -77
sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
sglang/srt/utils/rpd_utils.py +452 -0
sglang/srt/utils/slow_rank_detector.py +71 -0
sglang/srt/warmup.py +8 -4
sglang/srt/weight_sync/utils.py +2 -2
sglang/test/attention/test_trtllm_mla_backend.py +169 -5
sglang/test/get_logits_ut.py +57 -0
sglang/test/run_eval.py +79 -11
sglang/test/runners.py +5 -1
sglang/test/simple_eval_common.py +5 -2
sglang/test/simple_eval_mmmu_vlm.py +441 -0
sglang/test/test_block_fp8.py +2 -2
sglang/test/test_cutlass_moe.py +24 -6
sglang/test/test_deterministic.py +297 -0
sglang/test/test_disaggregation_utils.py +77 -0
sglang/test/test_fp4_moe.py +370 -1
sglang/test/test_programs.py +1 -1
sglang/test/test_utils.py +383 -5
sglang/utils.py +21 -1
sglang/version.py +1 -1
{sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +69 -124
{sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +375 -245
sglang/srt/disaggregation/launch_lb.py +0 -118
sglang/srt/mem_cache/lora_radix_cache.py +0 -421
sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
{sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
{sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0

sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} RENAMED Viewed

@@ -38,10 +38,14 @@ from sglang.srt.configs import (
     ChatGLMConfig,
     DbrxConfig,
     DeepseekVL2Config,
+    DotsOCRConfig,
+    DotsVLMConfig,
     ExaoneConfig,
+    FalconH1Config,
     KimiVLConfig,
     LongcatFlashConfig,
     MultiModalityConfig,
+    Qwen3NextConfig,
     Step3VLConfig,
 )
 from sglang.srt.configs.internvl import InternVLChatConfig
@@ -58,6 +62,10 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     InternVLChatConfig.model_type: InternVLChatConfig,
     Step3VLConfig.model_type: Step3VLConfig,
     LongcatFlashConfig.model_type: LongcatFlashConfig,
+    Qwen3NextConfig.model_type: Qwen3NextConfig,
+    FalconH1Config.model_type: FalconH1Config,
+    DotsVLMConfig.model_type: DotsVLMConfig,
+    DotsOCRConfig.model_type: DotsOCRConfig,
 }
 for name, cls in _CONFIG_REGISTRY.items():
@@ -115,6 +123,38 @@ def get_hf_text_config(config: PretrainedConfig):
         return config
+# Temporary hack for DeepSeek-V3.2 model
+def _load_deepseek_v32_model(
+    model_path: str,
+    trust_remote_code: bool = False,
+    revision: Optional[str] = None,
+    **kwargs,
+):
+    # first get the local path
+    local_path = download_from_hf(model_path)
+    # then load the config file in json
+    config_file = os.path.join(local_path, "config.json")
+    if not os.path.exists(config_file):
+        raise RuntimeError(f"Can't find config file in {local_path}.")
+    with open(config_file, "r") as f:
+        config_json = json.load(f)
+    config_json["architectures"] = ["DeepseekV3ForCausalLM"]
+    config_json["model_type"] = "deepseek_v3"
+    tmp_path = os.path.join(local_path, "_tmp_config_folder")
+    os.makedirs(tmp_path, exist_ok=True)
+    unique_path = os.path.join(tmp_path, f"deepseek_v32_{os.getpid()}")
+    with open(unique_path, "w") as f:
+        json.dump(config_json, f)
+    return AutoConfig.from_pretrained(
+        unique_path, trust_remote_code=trust_remote_code, revision=revision, **kwargs
+    )
 @lru_cache_frozenset(maxsize=32)
 def get_config(
     model: str,
@@ -136,9 +176,17 @@ def get_config(
         client.pull_files(ignore_pattern=["*.pt", "*.safetensors", "*.bin"])
         model = client.get_local_dir()
-    config = AutoConfig.from_pretrained(
-        model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
-    )
+    try:
+        config = AutoConfig.from_pretrained(
+            model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
+        )
+    except ValueError as e:
+        if not "deepseek_v32" in str(e):
+            raise e
+        config = _load_deepseek_v32_model(
+            model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
+        )
     if (
         config.architectures is not None
         and config.architectures[0] == "Phi4MMForCausalLM"
@@ -370,8 +418,8 @@ def get_processor(
         **kwargs,
     )
-    # fix: for Qwen2-VL model, inject default 'size' if not provided.
-    if config.model_type in {"qwen2_vl"}:
+    # fix: for Qwen2-VL and Sarashina2Vision models, inject default 'size' if not provided.
+    if config.model_type in {"qwen2_vl", "sarashina2_vision"}:
         if "size" not in kwargs:
             kwargs["size"] = {"shortest_edge": 3136, "longest_edge": 1003520}

sglang/srt/{patch_torch.py → utils/patch_torch.py} RENAMED Viewed

@@ -17,10 +17,18 @@ import torch
 from packaging import version
 from torch.multiprocessing import reductions
+from sglang.srt.utils import is_npu
+_is_npu = is_npu()
 def monkey_patch_torch_reductions():
     """Monkey patching before Torch https://github.com/pytorch/pytorch/pull/149248 is fixed"""
+    # Currently, NPU does not support UUID. This has been temporarily commented out, with support expected in the fourth quarter.
+    if _is_npu:
+        return
     if hasattr(reductions, "_reduce_tensor_original"):
         return

sglang/srt/utils/rpd_utils.py ADDED Viewed

@@ -0,0 +1,452 @@
+# https://raw.githubusercontent.com/ROCm/rocmProfileData/refs/heads/master/tools/rpd2tracing.py
+# commit 92d13a08328625463e9ba944cece82fc5eea36e6
+def rpd_to_chrome_trace(
+    input_rpd, output_json=None, start="0%", end="100%", format="object"
+):
+    import gzip
+    import sqlite3
+    if output_json is None:
+        import pathlib
+        output_json = pathlib.PurePath(input_rpd).with_suffix(".trace.json.gz")
+    connection = sqlite3.connect(input_rpd)
+    outfile = gzip.open(output_json, "wt", encoding="utf-8")
+    if format == "object":
+        outfile.write('{"traceEvents": ')
+    outfile.write("[ {}\n")
+    for row in connection.execute("select distinct gpuId from rocpd_op"):
+        try:
+            outfile.write(
+                ',{"name": "process_name", "ph": "M", "pid":"%s","args":{"name":"%s"}}\n'
+                % (row[0], "GPU" + str(row[0]))
+            )
+            outfile.write(
+                ',{"name": "process_sort_index", "ph": "M", "pid":"%s","args":{"sort_index":"%s"}}\n'
+                % (row[0], row[0] + 1000000)
+            )
+        except ValueError:
+            outfile.write("")
+    for row in connection.execute("select distinct pid, tid from rocpd_api"):
+        try:
+            outfile.write(
+                ',{"name":"thread_name","ph":"M","pid":"%s","tid":"%s","args":{"name":"%s"}}\n'
+                % (row[0], row[1], "Hip " + str(row[1]))
+            )
+            outfile.write(
+                ',{"name":"thread_sort_index","ph":"M","pid":"%s","tid":"%s","args":{"sort_index":"%s"}}\n'
+                % (row[0], row[1], row[1] * 2)
+            )
+        except ValueError:
+            outfile.write("")
+    try:
+        # FIXME - these aren't rendering correctly in chrome://tracing
+        for row in connection.execute("select distinct pid, tid from rocpd_hsaApi"):
+            try:
+                outfile.write(
+                    ',{"name":"thread_name","ph":"M","pid":"%s","tid":"%s","args":{"name":"%s"}}\n'
+                    % (row[0], row[1], "HSA " + str(row[1]))
+                )
+                outfile.write(
+                    ',{"name":"thread_sort_index","ph":"M","pid":"%s","tid":"%s","args":{"sort_index":"%s"}}\n'
+                    % (row[0], row[1], row[1] * 2 - 1)
+                )
+            except ValueError:
+                outfile.write("")
+    except:
+        pass
+    rangeStringApi = ""
+    rangeStringOp = ""
+    rangeStringMonitor = ""
+    min_time = connection.execute("select MIN(start) from rocpd_api;").fetchall()[0][0]
+    max_time = connection.execute("select MAX(end) from rocpd_api;").fetchall()[0][0]
+    if min_time == None:
+        raise Exception("Trace file is empty.")
+    print("Timestamps:")
+    print(f"\t    first: \t{min_time/1000} us")
+    print(f"\t     last: \t{max_time/1000} us")
+    print(f"\t duration: \t{(max_time-min_time) / 1000000000} seconds")
+    start_time = min_time / 1000
+    end_time = max_time / 1000
+    if start:
+        if "%" in start:
+            start_time = (
+                (max_time - min_time) * (int(start.replace("%", "")) / 100) + min_time
+            ) / 1000
+        else:
+            start_time = int(start)
+        rangeStringApi = "where rocpd_api.start/1000 >= %s" % (start_time)
+        rangeStringOp = "where rocpd_op.start/1000 >= %s" % (start_time)
+        rangeStringMonitor = "where start/1000 >= %s" % (start_time)
+    if end:
+        if "%" in end:
+            end_time = (
+                (max_time - min_time) * (int(end.replace("%", "")) / 100) + min_time
+            ) / 1000
+        else:
+            end_time = int(end)
+        rangeStringApi = (
+            rangeStringApi + " and rocpd_api.start/1000 <= %s" % (end_time)
+            if start != None
+            else "where rocpd_api.start/1000 <= %s" % (end_time)
+        )
+        rangeStringOp = (
+            rangeStringOp + " and rocpd_op.start/1000 <= %s" % (end_time)
+            if start != None
+            else "where rocpd_op.start/1000 <= %s" % (end_time)
+        )
+        rangeStringMonitor = (
+            rangeStringMonitor + " and start/1000 <= %s" % (end_time)
+            if start != None
+            else "where start/1000 <= %s" % (end_time)
+        )
+    print("\nFilter: %s" % (rangeStringApi))
+    print(f"Output duration: {(end_time-start_time)/1000000} seconds")
+    # Output Ops
+    for row in connection.execute(
+        "select A.string as optype, B.string as description, gpuId, queueId, rocpd_op.start/1000.0, (rocpd_op.end-rocpd_op.start) / 1000.0 from rocpd_op INNER JOIN rocpd_string A on A.id = rocpd_op.opType_id INNER Join rocpd_string B on B.id = rocpd_op.description_id %s"
+        % (rangeStringOp)
+    ):
+        try:
+            name = row[0] if len(row[1]) == 0 else row[1]
+            outfile.write(
+                ',{"pid":"%s","tid":"%s","name":"%s","ts":"%s","dur":"%s","ph":"X","args":{"desc":"%s"}}\n'
+                % (row[2], row[3], name, row[4], row[5], row[0])
+            )
+        except ValueError:
+            outfile.write("")
+    # Output Graph executions on GPU
+    try:
+        for row in connection.execute(
+            "select graphExec, gpuId, queueId, min(start)/1000.0, (max(end)-min(start))/1000.0, count(*) from rocpd_graphLaunchapi A join rocpd_api_ops B on B.api_id = A.api_ptr_id join rocpd_op C on C.id = B.op_id %s group by api_ptr_id"
+            % (rangeStringMonitor)
+        ):
+            try:
+                outfile.write(
+                    ',{"pid":"%s","tid":"%s","name":"%s","ts":"%s","dur":"%s","ph":"X","args":{"kernels":"%s"}}\n'
+                    % (row[1], row[2], f"Graph {row[0]}", row[3], row[4], row[5])
+                )
+            except ValueError:
+                outfile.write("")
+    except:
+        pass
+    # Output apis
+    for row in connection.execute(
+        "select A.string as apiName, B.string as args, pid, tid, rocpd_api.start/1000.0, (rocpd_api.end-rocpd_api.start) / 1000.0, (rocpd_api.end != rocpd_api.start) as has_duration from rocpd_api INNER JOIN rocpd_string A on A.id = rocpd_api.apiName_id INNER Join rocpd_string B on B.id = rocpd_api.args_id %s order by rocpd_api.id"
+        % (rangeStringApi)
+    ):
+        try:
+            if row[0] == "UserMarker":
+                if row[6] == 0:  # instantanuous "mark" messages
+                    outfile.write(
+                        ',{"pid":"%s","tid":"%s","name":"%s","ts":"%s","ph":"i","s":"p","args":{"desc":"%s"}}\n'
+                        % (
+                            row[2],
+                            row[3],
+                            row[1].replace('"', ""),
+                            row[4],
+                            row[1].replace('"', ""),
+                        )
+                    )
+                else:
+                    outfile.write(
+                        ',{"pid":"%s","tid":"%s","name":"%s","ts":"%s","dur":"%s","ph":"X","args":{"desc":"%s"}}\n'
+                        % (
+                            row[2],
+                            row[3],
+                            row[1].replace('"', ""),
+                            row[4],
+                            row[5],
+                            row[1].replace('"', ""),
+                        )
+                    )
+            else:
+                outfile.write(
+                    ',{"pid":"%s","tid":"%s","name":"%s","ts":"%s","dur":"%s","ph":"X","args":{"desc":"%s"}}\n'
+                    % (
+                        row[2],
+                        row[3],
+                        row[0],
+                        row[4],
+                        row[5],
+                        row[1].replace('"', "").replace("\t", ""),
+                    )
+                )
+        except ValueError:
+            outfile.write("")
+    # Output api->op linkage
+    for row in connection.execute(
+        "select rocpd_api_ops.id, pid, tid, gpuId, queueId, rocpd_api.end/1000.0 - 2, rocpd_op.start/1000.0 from rocpd_api_ops INNER JOIN rocpd_api on rocpd_api_ops.api_id = rocpd_api.id INNER JOIN rocpd_op on rocpd_api_ops.op_id = rocpd_op.id %s"
+        % (rangeStringApi)
+    ):
+        try:
+            fromtime = row[5] if row[5] < row[6] else row[6]
+            outfile.write(
+                ',{"pid":"%s","tid":"%s","cat":"api_op","name":"api_op","ts":"%s","id":"%s","ph":"s"}\n'
+                % (row[1], row[2], fromtime, row[0])
+            )
+            outfile.write(
+                ',{"pid":"%s","tid":"%s","cat":"api_op","name":"api_op","ts":"%s","id":"%s","ph":"f", "bp":"e"}\n'
+                % (row[3], row[4], row[6], row[0])
+            )
+        except ValueError:
+            outfile.write("")
+    try:
+        for row in connection.execute(
+            "select A.string as apiName, B.string as args, pid, tid, rocpd_hsaApi.start/1000.0, (rocpd_hsaApi.end-rocpd_hsaApi.start) / 1000.0 from rocpd_hsaApi INNER JOIN rocpd_string A on A.id = rocpd_hsaApi.apiName_id INNER Join rocpd_string B on B.id = rocpd_hsaApi.args_id %s order by rocpd_hsaApi.id"
+            % (rangeStringApi)
+        ):
+            try:
+                outfile.write(
+                    ',{"pid":"%s","tid":"%s","name":"%s","ts":"%s","dur":"%s","ph":"X","args":{"desc":"%s"}}\n'
+                    % (
+                        row[2],
+                        row[3] + 1,
+                        row[0],
+                        row[4],
+                        row[5],
+                        row[1].replace('"', ""),
+                    )
+                )
+            except ValueError:
+                outfile.write("")
+    except:
+        pass
+    #
+    # Counters
+    #
+    # Counters should extend to the last event in the trace.  This means they need to have a value at Tend.
+    # Figure out when that is
+    T_end = 0
+    for row in connection.execute(
+        "SELECT max(end)/1000 from (SELECT end from rocpd_api UNION ALL SELECT end from rocpd_op)"
+    ):
+        T_end = int(row[0])
+    if end:
+        T_end = end_time
+    # Loop over GPU for per-gpu counters
+    gpuIdsPresent = []
+    for row in connection.execute("SELECT DISTINCT gpuId FROM rocpd_op"):
+        gpuIdsPresent.append(row[0])
+    for gpuId in gpuIdsPresent:
+        # print(f"Creating counters for: {gpuId}")
+        # Create the queue depth counter
+        depth = 0
+        idle = 1
+        for row in connection.execute(
+            'select * from (select rocpd_api.start/1000.0 as ts, "1" from rocpd_api_ops INNER JOIN rocpd_api on rocpd_api_ops.api_id = rocpd_api.id INNER JOIN rocpd_op on rocpd_api_ops.op_id = rocpd_op.id AND rocpd_op.gpuId = %s %s UNION ALL select rocpd_op.end/1000.0, "-1" from rocpd_api_ops INNER JOIN rocpd_api on rocpd_api_ops.api_id = rocpd_api.id INNER JOIN rocpd_op on rocpd_api_ops.op_id = rocpd_op.id AND rocpd_op.gpuId = %s %s) order by ts'
+            % (gpuId, rangeStringOp, gpuId, rangeStringOp)
+        ):
+            try:
+                if idle and int(row[1]) > 0:
+                    idle = 0
+                    outfile.write(
+                        ',{"pid":"%s","name":"Idle","ph":"C","ts":%s,"args":{"idle":%s}}\n'
+                        % (gpuId, row[0], idle)
+                    )
+                if depth == 1 and int(row[1]) < 0:
+                    idle = 1
+                    outfile.write(
+                        ',{"pid":"%s","name":"Idle","ph":"C","ts":%s,"args":{"idle":%s}}\n'
+                        % (gpuId, row[0], idle)
+                    )
+                depth = depth + int(row[1])
+                outfile.write(
+                    ',{"pid":"%s","name":"QueueDepth","ph":"C","ts":%s,"args":{"depth":%s}}\n'
+                    % (gpuId, row[0], depth)
+                )
+            except ValueError:
+                outfile.write("")
+        if T_end > 0:
+            outfile.write(
+                ',{"pid":"%s","name":"Idle","ph":"C","ts":%s,"args":{"idle":%s}}\n'
+                % (gpuId, T_end, idle)
+            )
+            outfile.write(
+                ',{"pid":"%s","name":"QueueDepth","ph":"C","ts":%s,"args":{"depth":%s}}\n'
+                % (gpuId, T_end, depth)
+            )
+    # Create SMI counters
+    try:
+        for row in connection.execute(
+            "select deviceId, monitorType, start/1000.0, value from rocpd_monitor %s"
+            % (rangeStringMonitor)
+        ):
+            outfile.write(
+                ',{"pid":"%s","name":"%s","ph":"C","ts":%s,"args":{"%s":%s}}\n'
+                % (row[0], row[1], row[2], row[1], row[3])
+            )
+        # Output the endpoints of the last range
+        for row in connection.execute(
+            "select distinct deviceId, monitorType, max(end)/1000.0, value from rocpd_monitor %s group by deviceId, monitorType"
+            % (rangeStringMonitor)
+        ):
+            outfile.write(
+                ',{"pid":"%s","name":"%s","ph":"C","ts":%s,"args":{"%s":%s}}\n'
+                % (row[0], row[1], row[2], row[1], row[3])
+            )
+    except:
+        print("Did not find SMI data")
+    # Create the (global) memory counter
+    """
+    sizes = {}    # address -> size
+    totalSize = 0
+    exp = re.compile("^ptr\((.*)\)\s+size\((.*)\)$")
+    exp2 = re.compile("^ptr\((.*)\)$")
+    for row in connection.execute("SELECT rocpd_api.end/1000.0 as ts, B.string, '1'  FROM rocpd_api INNER JOIN rocpd_string A ON A.id=rocpd_api.apiName_id INNER JOIN rocpd_string B ON B.id=rocpd_api.args_id WHERE A.string='hipFree' UNION ALL SELECT rocpd_api.start/1000.0, B.string, '0' FROM rocpd_api INNER JOIN rocpd_string A ON A.id=rocpd_api.apiName_id INNER JOIN rocpd_string B ON B.id=rocpd_api.args_id WHERE A.string='hipMalloc' ORDER BY ts asc"):
+        try:
+            if row[2] == '0':  #malloc
+                m = exp.match(row[1])
+                if m:
+                    size = int(m.group(2), 16)
+                    totalSize = totalSize + size
+                    sizes[m.group(1)] = size
+                    outfile.write(',{"pid":"0","name":"Allocated Memory","ph":"C","ts":%s,"args":{"depth":%s}}\n'%(row[0],totalSize))
+            else:              #free
+                m = exp2.match(row[1])
+                if m:
+                    try:    # Sometimes free addresses are not valid or listed
+                        size = sizes[m.group(1)]
+                        sizes[m.group(1)] = 0
+                        totalSize = totalSize - size;
+                        outfile.write(',{"pid":"0","name":"Allocated Memory","ph":"C","ts":%s,"args":{"depth":%s}}\n'%(row[0],totalSize))
+                    except KeyError:
+                        pass
+        except ValueError:
+            outfile.write("")
+    if T_end > 0:
+        outfile.write(',{"pid":"0","name":"Allocated Memory","ph":"C","ts":%s,"args":{"depth":%s}}\n'%(T_end,totalSize))
+    """
+    # Create "faux calling stack frame" on gpu ops traceS
+    stacks = {}  # Call stacks built from UserMarker entres.     Key is 'pid,tid'
+    currentFrame = {}  # "Current GPU frame" (id, name, start, end).    Key is 'pid,tid'
+    class GpuFrame:
+        def __init__(self):
+            self.id = 0
+            self.name = ""
+            self.start = 0
+            self.end = 0
+            self.gpus = []
+            self.totalOps = 0
+    # FIXME: include 'start' (in ns) so we can ORDER BY it and break ties?
+    for row in connection.execute(
+        "SELECT '0', start/1000.0, pid, tid, B.string as label, '','','', '' from rocpd_api INNER JOIN rocpd_string A on A.id = rocpd_api.apiName_id AND A.string = 'UserMarker' INNER JOIN rocpd_string B on B.id = rocpd_api.args_id AND rocpd_api.start/1000.0 != rocpd_api.end/1000.0 %s UNION ALL SELECT '1', end/1000.0, pid, tid, B.string as label, '','','', '' from rocpd_api INNER JOIN rocpd_string A on A.id = rocpd_api.apiName_id AND A.string = 'UserMarker' INNER JOIN rocpd_string B on B.id = rocpd_api.args_id AND rocpd_api.start/1000.0 != rocpd_api.end/1000.0 %s UNION ALL SELECT '2', rocpd_api.start/1000.0, pid, tid, '' as label, gpuId, queueId, rocpd_op.start/1000.0, rocpd_op.end/1000.0 from rocpd_api_ops INNER JOIN rocpd_api ON rocpd_api_ops.api_id = rocpd_api.id INNER JOIN rocpd_op ON rocpd_api_ops.op_id = rocpd_op.id %s ORDER BY start/1000.0 asc"
+        % (rangeStringApi, rangeStringApi, rangeStringApi)
+    ):
+        try:
+            key = (row[2], row[3])  # Key is 'pid,tid'
+            if row[0] == "0":  # Frame start
+                if key not in stacks:
+                    stacks[key] = []
+                stack = stacks[key].append((row[1], row[4]))
+                # print(f"0: new api frame: pid_tid={key} -> stack={stacks}")
+            elif row[0] == "1":  # Frame end
+                completed = stacks[key].pop()
+                # print(f"1: end api frame: pid_tid={key} -> stack={stacks}")
+            elif row[0] == "2":  # API + Op
+                if key in stacks and len(stacks[key]) > 0:
+                    frame = stacks[key][-1]
+                    # print(f"2: Op on {frame} ({len(stacks[key])})")
+                    gpuFrame = None
+                    if key not in currentFrame:  # First op under the current api frame
+                        gpuFrame = GpuFrame()
+                        gpuFrame.id = frame[0]
+                        gpuFrame.name = frame[1]
+                        gpuFrame.start = row[7]
+                        gpuFrame.end = row[8]
+                        gpuFrame.gpus.append((row[5], row[6]))
+                        gpuFrame.totalOps = 1
+                        # print(f"2a: new frame: {gpuFrame.gpus} {gpuFrame.start} {gpuFrame.end} {gpuFrame.end - gpuFrame.start}")
+                    else:
+                        gpuFrame = currentFrame[key]
+                        # Another op under the same frame -> union them (but only if they are butt together)
+                        if (
+                            gpuFrame.id == frame[0]
+                            and gpuFrame.name == frame[1]
+                            and (
+                                abs(row[7] - gpuFrame.end) < 200
+                                or abs(gpuFrame.start - row[8]) < 200
+                            )
+                        ):
+                            # if gpuFrame.id == frame[0] and gpuFrame.name == frame[1]:    # Another op under the same frame -> union them
+                            # if False:   # Turn off frame joining
+                            if row[7] < gpuFrame.start:
+                                gpuFrame.start = row[7]
+                            if row[8] > gpuFrame.end:
+                                gpuFrame.end = row[8]
+                            if (row[5], row[6]) not in gpuFrame.gpus:
+                                gpuFrame.gpus.append((row[5], row[6]))
+                            gpuFrame.totalOps = gpuFrame.totalOps + 1
+                            # print(f"2c: union frame: {gpuFrame.gpus} {gpuFrame.start} {gpuFrame.end} {gpuFrame.end - gpuFrame.start}")
+                        else:  # This is a new frame - dump the last and make new
+                            gpuFrame = currentFrame[key]
+                            for dest in gpuFrame.gpus:
+                                # print(f"2: OUTPUT: dest={dest} time={gpuFrame.start} -> {gpuFrame.end} Duration={gpuFrame.end - gpuFrame.start} TotalOps={gpuFrame.totalOps}")
+                                outfile.write(
+                                    ',{"pid":"%s","tid":"%s","name":"%s","ts":"%s","dur":"%s","ph":"X","args":{"desc":"%s"}}\n'
+                                    % (
+                                        dest[0],
+                                        dest[1],
+                                        gpuFrame.name.replace('"', ""),
+                                        gpuFrame.start - 1,
+                                        gpuFrame.end - gpuFrame.start + 1,
+                                        f"UserMarker frame: {gpuFrame.totalOps} ops",
+                                    )
+                                )
+                            currentFrame.pop(key)
+                            # make the first op under the new frame
+                            gpuFrame = GpuFrame()
+                            gpuFrame.id = frame[0]
+                            gpuFrame.name = frame[1]
+                            gpuFrame.start = row[7]
+                            gpuFrame.end = row[8]
+                            gpuFrame.gpus.append((row[5], row[6]))
+                            gpuFrame.totalOps = 1
+                            # print(f"2b: new frame: {gpuFrame.gpus} {gpuFrame.start} {gpuFrame.end} {gpuFrame.end - gpuFrame.start}")
+                    currentFrame[key] = gpuFrame
+        except ValueError:
+            outfile.write("")
+    outfile.write("]\n")
+    if format == "object":
+        outfile.write("} \n")
+    outfile.close()
+    connection.close()

sglang/srt/utils/slow_rank_detector.py ADDED Viewed

@@ -0,0 +1,71 @@
+import logging
+from typing import Any, Dict, List
+import torch
+import torch.distributed as dist
+import triton
+logger = logging.getLogger(__name__)
+def execute():
+    """Time every benchmark on this rank, gather all ranks' timings, and analyze them on rank 0."""
+    is_collector = dist.get_rank() == 0
+    if is_collector:
+        logger.info("[slow_rank_detector] Start benchmarking...")
+    # One timing per benchmark name, measured locally on this rank.
+    measured = {}
+    for name in _BENCH_NAMES:
+        measured[name] = _compute_local_metric(name)
+    gathered = [None] * dist.get_world_size()
+    # Only rank 0 passes a receive list; every other rank passes None.
+    dist.gather_object(measured, gathered if is_collector else None)
+    if is_collector:
+        _analyze_metrics(gathered)
+class _GemmExecutor:
+    """Zero-argument callable that multiplies two 8192x8192 bfloat16 CUDA matrices."""
+    def __init__(self):
+        shape = (8192, 8192)
+        self.lhs = torch.randn(shape, dtype=torch.bfloat16, device="cuda")
+        self.rhs = torch.randn(shape, dtype=torch.bfloat16, device="cuda")
+    def __call__(self):
+        # The product is not stored; this object is handed to the benchmark in _compute_local_metric.
+        torch.matmul(self.lhs, self.rhs)
+class _ElementwiseExecutor:
+    """Zero-argument callable that increments a large int32 CUDA tensor in place."""
+    def __init__(self):
+        num_elements = 128 * 1024**2
+        self.value = torch.randint(
+            0, 10000, (num_elements,), dtype=torch.int32, device="cuda"
+        )
+    def __call__(self):
+        # In-place add, same as `self.value += 1`.
+        self.value.add_(1)
+# Benchmark name -> executor class that is instantiated and timed for it.
+_EXECUTOR_CLS_OF_BENCH = dict(
+    gemm=_GemmExecutor,
+    elementwise=_ElementwiseExecutor,
+)
+# Benchmark names in the order they appear in the mapping above.
+_BENCH_NAMES = [*_EXECUTOR_CLS_OF_BENCH]
+def _compute_local_metric(bench_name):
+    """Build the executor registered under ``bench_name`` and return its mean time from ``do_bench_cudagraph``."""
+    executor_cls = _EXECUTOR_CLS_OF_BENCH[bench_name]
+    return triton.testing.do_bench_cudagraph(
+        executor_cls(), return_mode="mean", rep=20
+    )
+def _analyze_metrics(all_metrics: List[Dict[str, Any]]):
+    """Log each benchmark's per-rank relative speed and warn when the slowest rank is below 0.9 of the fastest.
+    ``all_metrics`` holds one dict per rank, keyed by benchmark name, with that rank's measured time.
+    """
+    for bench_name in _BENCH_NAMES:
+        per_rank_times = [rank_metrics[bench_name] for rank_metrics in all_metrics]
+        time_of_rank = torch.tensor(per_rank_times)
+        # Speed is the inverse of time; normalize so the fastest rank is 1.0.
+        inverse_time = 1 / time_of_rank
+        rel_speed_of_rank = inverse_time / inverse_time.max()
+        slowest_rel_speed = rel_speed_of_rank.min().item()
+        # The local names below are part of the log text via the `{name=}` form.
+        logger.info(
+            f"[slow_rank_detector] {bench_name=} {slowest_rel_speed=} {rel_speed_of_rank=} {time_of_rank=}"
+        )
+        has_slow_rank = slowest_rel_speed < 0.9
+        if has_slow_rank:
+            logger.warning(
+                "[slow_rank_detector] Some ranks are too slow compared with others"
+            )

sglang/srt/warmup.py CHANGED Viewed

@@ -1,20 +1,24 @@
+from __future__ import annotations
 import logging
-from typing import List
+from typing import TYPE_CHECKING, List
 import numpy as np
 import tqdm
 from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
 logger = logging.getLogger(__file__)
 _warmup_registry = {}
-def warmup(name: str) -> callable:
-    def decorator(fn: callable):
+def warmup(name: str):
+    def decorator(fn):
         _warmup_registry[name] = fn
         return fn

sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl

sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl