sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -11
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +474 -142
- sglang/compile_deep_gemm.py +3 -0
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +10 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +314 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +228 -92
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/qwen3_next.py +294 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +78 -37
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +373 -68
- sglang/srt/disaggregation/prefill.py +53 -49
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +842 -0
- sglang/srt/entrypoints/grpc_server.py +950 -0
- sglang/srt/entrypoints/http_server.py +179 -60
- sglang/srt/entrypoints/openai/protocol.py +265 -29
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +213 -122
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +289 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +17 -8
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +215 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +40 -8
- sglang/srt/layers/attention/flashinfer_backend.py +341 -204
- sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
- sglang/srt/layers/attention/mamba/mamba.py +577 -0
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +180 -18
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
- sglang/srt/layers/moe/ep_moe/layer.py +248 -333
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +83 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +29 -7
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +155 -60
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +191 -56
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +28 -33
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +44 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +55 -0
- sglang/srt/managers/schedule_batch.py +343 -212
- sglang/srt/managers/schedule_policy.py +145 -18
- sglang/srt/managers/scheduler.py +653 -273
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +579 -674
- sglang/srt/managers/tp_worker.py +96 -26
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +21 -22
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +9 -2
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +651 -80
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +227 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +93 -48
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +74 -46
- sglang/srt/model_executor/model_runner.py +455 -176
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +10 -4
- sglang/srt/model_loader/loader.py +319 -10
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +161 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +578 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +17 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/grok.py +5 -13
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mixtral.py +1 -3
- sglang/srt/models/mllama4.py +50 -4
- sglang/srt/models/nemotron_h.py +514 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +55 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +49 -26
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +1051 -285
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +98 -29
- sglang/srt/speculative/ngram_info.py +428 -0
- sglang/srt/speculative/ngram_worker.py +246 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +605 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +9 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +451 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +119 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_longbench_v2.py +332 -0
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +9 -19
- sglang/test/test_deterministic.py +313 -0
- sglang/test/test_deterministic_utils.py +81 -0
- sglang/test/test_disaggregation_utils.py +140 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +407 -8
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
Diff hunks follow for four of the changed files.

sglang/srt/model_executor/npu_graph_runner.py

```diff
@@ -19,8 +19,10 @@ import logging
 import threading
 from typing import TYPE_CHECKING, Optional, Union
 
+import numpy as np
 import torch
 
+from sglang.srt.configs.model_config import AttentionArch
 from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
 
 logger = logging.getLogger(__name__)
@@ -73,11 +75,16 @@ class NPUGraphRunner(CudaGraphRunner):
         self.positions[: self.raw_num_token].copy_(forward_batch.positions)
 
         # Replay
-        seq_lens = forward_batch.seq_lens.cpu().tolist() + [0] * (self.bs - self.raw_bs)
-        thread = threading.Thread(target=self._update_inputs, args=(seq_lens,))
-        thread.start()
-        self.graphs[self.bs].replay()
-        thread.join()
+        if self.model_runner.model_config.index_head_dim is None:
+            seq_lens = forward_batch.seq_lens.cpu().tolist() + [0] * (
+                self.bs - self.raw_bs
+            )
+            thread = threading.Thread(target=self._update_inputs, args=(seq_lens,))
+            thread.start()
+            self.graphs[self.bs].replay()
+            thread.join()
+        else:
+            self.graphs[self.bs].replay()
 
         output = self.output_buffers[self.bs]
         if isinstance(output, LogitsProcessorOutput):
```
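The replay path above overlaps host work with device work: while the captured graph replays, a worker thread updates the padded sequence lengths, and the join guarantees the update lands before outputs are read. A minimal, runnable sketch of the same pattern, with a stub graph object and hypothetical helper names standing in for the sglang internals:

```python
import threading

class _StubGraph:
    """Stand-in for a captured device graph; only mimics .replay()."""

    def replay(self) -> None:
        print("graph replayed")

def pad_seq_lens(seq_lens, capture_bs):
    # Graphs are captured at fixed batch sizes, so the real sequence
    # lengths are zero-padded up to the capture batch size, as above.
    return seq_lens + [0] * (capture_bs - len(seq_lens))

def replay_overlapped(graph, update_inputs, seq_lens):
    # Run the host-side input update on a worker thread while the captured
    # graph replays, then join before the outputs are read.
    t = threading.Thread(target=update_inputs, args=(seq_lens,))
    t.start()
    graph.replay()
    t.join()

if __name__ == "__main__":
    lens = pad_seq_lens([5, 3], capture_bs=4)  # -> [5, 3, 0, 0]
    replay_overlapped(_StubGraph(), lambda s: print("updated", s), lens)
```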
sglang/srt/model_loader/__init__.py

```diff
@@ -1,16 +1,22 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/model_loader/__init__.py
 
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
 from torch import nn
 
-from sglang.srt.configs.device_config import DeviceConfig
-from sglang.srt.configs.load_config import LoadConfig
-from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.model_loader.loader import BaseModelLoader, get_model_loader
 from sglang.srt.model_loader.utils import (
     get_architecture_class_name,
     get_model_architecture,
 )
 
+if TYPE_CHECKING:
+    from sglang.srt.configs.device_config import DeviceConfig
+    from sglang.srt.configs.load_config import LoadConfig
+    from sglang.srt.configs.model_config import ModelConfig
+
 
 def get_model(
     *,
@@ -18,7 +24,7 @@ def get_model(
     load_config: LoadConfig,
     device_config: DeviceConfig,
 ) -> nn.Module:
-    loader = get_model_loader(load_config)
+    loader = get_model_loader(load_config, model_config)
     return loader.load_model(
         model_config=model_config,
         device_config=device_config,
```
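Both model_loader modules now defer their config imports with the `from __future__ import annotations` plus `TYPE_CHECKING` idiom, so annotations no longer force runtime imports (avoiding import cost and circular-import risk). A self-contained illustration of the idiom, using `decimal.Decimal` as a stand-in for the config classes:

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported only by type checkers; no runtime import cost or cycles.
    from decimal import Decimal  # stand-in for DeviceConfig etc.

def describe(value: Decimal) -> str:
    # With the __future__ import, the annotation is a plain string at
    # runtime, so Decimal never has to be imported when this executes.
    return f"value={value}"

print(describe.__annotations__)  # {'value': 'Decimal', 'return': 'str'}
```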
sglang/srt/model_loader/loader.py

```diff
@@ -1,5 +1,7 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/model_executor/model_loader/loader.py
 
+from __future__ import annotations
+
 # ruff: noqa: SIM117
 import collections
 import concurrent
@@ -10,25 +12,50 @@ import json
 import logging
 import math
 import os
+import re
+import socket
+import threading
 import time
 from abc import ABC, abstractmethod
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
-from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, cast
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    Generator,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    cast,
+)
+from urllib.parse import urlparse
 
 import huggingface_hub
 import numpy as np
+import requests
 import safetensors.torch
 import torch
+
+# Try to import accelerate (optional dependency)
+try:
+    from accelerate import infer_auto_device_map, init_empty_weights
+    from accelerate.utils import get_max_memory
+
+    HAS_ACCELERATE = True
+except ImportError:
+    HAS_ACCELERATE = False
+    infer_auto_device_map = None
+    init_empty_weights = None
+    get_max_memory = None
+
 from huggingface_hub import HfApi, hf_hub_download
 from torch import nn
-from
-from transformers import AutoModelForCausalLM
+from transformers import AutoConfig, AutoModelForCausalLM
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 
-from sglang.srt.configs.device_config import DeviceConfig
 from sglang.srt.configs.load_config import LoadConfig, LoadFormat
-from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.connector import (
     ConnectorType,
     create_remote_connector,
@@ -39,14 +66,24 @@ from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
+from sglang.srt.layers.modelopt_utils import QUANT_CFG_CHOICES
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_loader.remote_instance_weight_loader_utils import (
+    trigger_transferring_weights_request,
+)
 from sglang.srt.model_loader.utils import (
     get_model_architecture,
     post_load_weights,
     set_default_torch_dtype,
 )
+
+# Constants for memory management
+DEFAULT_GPU_MEMORY_FRACTION_FOR_CALIBRATION = (
+    0.8  # Reserve 20% GPU memory headroom for ModelOpt calibration
+)
 from sglang.srt.model_loader.weight_utils import (
     _BAR_FORMAT,
+    default_weight_loader,
     download_safetensors_index_file_from_hf,
     download_weights_from_hf,
     filter_duplicate_safetensors_files,
@@ -70,7 +107,14 @@ from sglang.srt.utils import (
     set_weight_attrs,
 )
 
+if TYPE_CHECKING:
+    from sglang.srt.configs.device_config import DeviceConfig
+    from sglang.srt.configs.model_config import ModelConfig
+    from sglang.srt.layers.quantization.base_config import QuantizationConfig
+
 _is_npu = is_npu()
+# ModelOpt: QUANT_CFG_CHOICES is imported from modelopt_utils.py
+# which contains the complete mapping of quantization config choices
 
 
 @contextmanager
```
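The hunk above makes accelerate an optional dependency: the import is attempted once at module load, the outcome is recorded in `HAS_ACCELERATE`, and a targeted `ImportError` is raised later only when a ModelOpt code path actually needs it. A minimal sketch of the same pattern (`require_accelerate` is an illustrative helper, not an sglang API):

```python
# Optional-dependency pattern: import if present, otherwise record the
# absence and fail with a clear message only when the feature is used.
try:
    from accelerate import init_empty_weights  # optional dependency

    HAS_ACCELERATE = True
except ImportError:
    HAS_ACCELERATE = False
    init_empty_weights = None

def require_accelerate() -> None:
    # Called at the start of any code path that needs accelerate.
    if not HAS_ACCELERATE:
        raise ImportError(
            "accelerate is required for this feature; "
            "install it with: pip install accelerate"
        )
```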
```diff
@@ -183,7 +227,10 @@ def _initialize_model(
     if _is_npu:
         packed_modules_mapping.update(
             {
-                "visual": {"qkv_proj": ["qkv"]},
+                "visual": {
+                    "qkv_proj": ["qkv"],
+                    "gate_up_proj": ["gate_proj", "up_proj"],
+                },
                 "vision_model": {
                     "qkv_proj": ["q_proj", "k_proj", "v_proj"],
                     "proj": ["out_proj"],
```
```diff
@@ -451,12 +498,78 @@ class DefaultModelLoader(BaseModelLoader):
             model_config.model_path, model_config.revision, fall_back_to_pt=True
         )
 
+    def _load_modelopt_base_model(self, model_config: ModelConfig) -> nn.Module:
+        """Load and prepare the base model for ModelOpt quantization.
+
+        This method handles the common model loading logic shared between
+        DefaultModelLoader (conditional) and ModelOptModelLoader (dedicated).
+        """
+        if not HAS_ACCELERATE:
+            raise ImportError(
+                "accelerate is required for ModelOpt quantization. "
+                "Please install it with: pip install accelerate"
+            )
+
+        hf_config = AutoConfig.from_pretrained(
+            model_config.model_path, trust_remote_code=True
+        )
+        with init_empty_weights():
+            torch_dtype = getattr(hf_config, "torch_dtype", torch.float16)
+            model = AutoModelForCausalLM.from_config(
+                hf_config, torch_dtype=torch_dtype, trust_remote_code=True
+            )
+        max_memory = get_max_memory()
+        inferred_device_map = infer_auto_device_map(model, max_memory=max_memory)
+
+        on_cpu = "cpu" in inferred_device_map.values()
+        model_kwargs = {"torch_dtype": "auto"}
+        device_map = "auto"
+
+        if on_cpu:
+            for device in max_memory.keys():
+                if isinstance(device, int):
+                    max_memory[device] *= DEFAULT_GPU_MEMORY_FRACTION_FOR_CALIBRATION
+
+            logger.warning(
+                "Model does not fit to the GPU mem. "
+                f"We apply the following memory limit for calibration: \n{max_memory}\n"
+                f"If you hit GPU OOM issue, please adjust the memory fraction "
+                f"(currently {DEFAULT_GPU_MEMORY_FRACTION_FOR_CALIBRATION}) or "
+                "reduce the calibration `batch_size` manually."
+            )
+            model_kwargs["max_memory"] = max_memory
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_config.model_path,
+            device_map=device_map,
+            **model_kwargs,
+            trust_remote_code=True,
+        )
+        logger.info(f"ModelOpt quantization requested: {model_config.modelopt_quant}")
+
+        quant_choice_str = model_config.modelopt_quant
+        if not isinstance(quant_choice_str, str):
+            raise TypeError(
+                f"modelopt_quant must be a string preset key (e.g., 'fp8'), "
+                f"got {type(quant_choice_str)}"
+            )
+
+        return model
+
     def load_model(
         self,
         *,
         model_config: ModelConfig,
         device_config: DeviceConfig,
     ) -> nn.Module:
+
+        if hasattr(model_config, "modelopt_quant") and model_config.modelopt_quant:
+            # Load base model using shared method
+            model = self._load_modelopt_base_model(model_config)
+            # Note: DefaultModelLoader doesn't do additional quantization processing
+            # For full ModelOpt quantization, use ModelOptModelLoader
+            return model.eval()
+
         target_device = torch.device(device_config.device)
         with set_default_torch_dtype(model_config.dtype):
             with target_device:
@@ -465,9 +578,9 @@ class DefaultModelLoader(BaseModelLoader):
                     self.load_config,
                 )
 
-                self.load_weights_and_postprocess(
-                    model, self._get_all_weights(model_config, model), target_device
-                )
+            self.load_weights_and_postprocess(
+                model, self._get_all_weights(model_config, model), target_device
+            )
 
         return model.eval()
 
```
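The `_load_modelopt_base_model` helper above scales every GPU budget reported by accelerate's `get_max_memory()` by `DEFAULT_GPU_MEMORY_FRACTION_FOR_CALIBRATION` when the inferred device map spills to CPU, reserving 20% headroom for calibration. A small worked sketch of just that scaling rule, standalone and with made-up byte budgets:

```python
DEFAULT_GPU_MEMORY_FRACTION_FOR_CALIBRATION = 0.8

def scale_gpu_budgets(max_memory: dict) -> dict:
    # Integer keys are GPU ids; string keys like "cpu" are left untouched,
    # mirroring the isinstance(device, int) check in the diff above.
    return {
        dev: (cap * DEFAULT_GPU_MEMORY_FRACTION_FOR_CALIBRATION
              if isinstance(dev, int) else cap)
        for dev, cap in max_memory.items()
    }

budgets = {0: 80_000_000_000, 1: 80_000_000_000, "cpu": 500_000_000_000}
print(scale_gpu_budgets(budgets))  # GPUs drop to 64 GB each; cpu unchanged
```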
```diff
@@ -1366,6 +1479,105 @@ class GGUFModelLoader(BaseModelLoader):
         return model
 
 
+class RemoteInstanceModelLoader(BaseModelLoader):
+    """Model loader that can load Tensors from remote sglang instance."""
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        if load_config.model_loader_extra_config:
+            raise ValueError(
+                f"Model loader extra config is not supported for "
+                f"load format {load_config.load_format}"
+            )
+
+    def download_model(self, model_config: ModelConfig) -> None:
+        raise NotImplementedError
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+        logger.info("Loading weights from remote instance ...")
+        load_config = self.load_config
+
+        assert load_config.load_format == LoadFormat.REMOTE_INSTANCE, (
+            f"Model loader {self.load_config.load_format} is not supported for "
+            f"load format {load_config.load_format}"
+        )
+
+        model_weights = f"instance://{load_config.remote_instance_weight_loader_seed_instance_ip}:{load_config.remote_instance_weight_loader_send_weights_group_ports[load_config.tp_rank]}"
+
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
+                model = _initialize_model(model_config, self.load_config)
+
+        with create_remote_connector(model_weights, device_config.device) as client:
+            connector_type = get_connector_type(client)
+            if connector_type == ConnectorType.INSTANCE:
+                self.load_model_from_remote_instance(
+                    model, client, model_config, device_config
+                )
+            else:
+                raise ValueError(
+                    f"Unsupported connector type {connector_type} for "
+                    f"remote tensor model loading."
+                )
+        return model.eval()
+
+    def load_model_from_remote_instance(
+        self, model, client, model_config: ModelConfig, device_config: DeviceConfig
+    ) -> nn.Module:
+        load_config = self.load_config
+        instance_ip = socket.gethostbyname(socket.gethostname())
+        start_build_group_tic = time.time()
+        client.build_group(
+            gpu_id=device_config.gpu_id,
+            tp_rank=load_config.tp_rank,
+            instance_ip=instance_ip,
+        )
+        torch.cuda.synchronize()
+        end_build_group_tic = time.time()
+        logger.debug(
+            f"finish building group for remote instance, time used: {(end_build_group_tic - start_build_group_tic):.4f}s"
+        )
+
+        if load_config.tp_rank == 0:
+            t = threading.Thread(
+                target=trigger_transferring_weights_request,
+                args=(
+                    load_config.remote_instance_weight_loader_seed_instance_ip,
+                    load_config.remote_instance_weight_loader_seed_instance_service_port,
+                    load_config.remote_instance_weight_loader_send_weights_group_ports,
+                    instance_ip,
+                ),
+            )
+            t.start()
+
+        start_get_weights_tic = time.time()
+        with set_default_torch_dtype(model_config.dtype):
+            for _, tensor in model.named_parameters():
+                torch.distributed.broadcast(
+                    tensor.data,
+                    src=0,
+                    group=client._model_update_group,
+                )
+            torch.cuda.synchronize()
+
+            if hasattr(model, "post_load_weights"):
+                model.post_load_weights()
+        end_get_weights_tic = time.time()
+        logger.debug(
+            f"finish getting all weights from remote instance, time used: {(end_get_weights_tic - start_get_weights_tic):.4f}s"
+        )
+        # destroy the process group after loading weights
+        torch.distributed.distributed_c10d.destroy_process_group(
+            client._model_update_group
+        )
+        torch.cuda.empty_cache()
+
+
 class RemoteModelLoader(BaseModelLoader):
     """Model loader that can load Tensors from remote database."""
 
```
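In `load_model_from_remote_instance` above, the destination instance receives every parameter via `torch.distributed.broadcast` from the seed instance (rank 0 of a world-size-2 group), overwriting tensors in place. A minimal receive-side sketch, under the assumption that the process group has already been initialized and `group` is that send-weights group:

```python
import torch
import torch.distributed as dist

def receive_weights_from_seed(model: torch.nn.Module, group) -> None:
    # One broadcast per tensor; the iteration order of named_parameters()
    # must match on both ranks or weights land in the wrong tensors.
    for _, param in model.named_parameters():
        dist.broadcast(param.data, src=0, group=group)
    # Ensure all device copies have landed before the model is used.
    torch.cuda.synchronize()
```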
```diff
@@ -1543,9 +1755,103 @@ def load_model_with_cpu_quantization(
     return model.eval()
 
 
-def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
+class ModelOptModelLoader(DefaultModelLoader):
+    """
+    Model loader that applies NVIDIA Model Optimizer quantization
+    """
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        # Any ModelOpt specific initialization if needed
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+
+        logger.info("ModelOptModelLoader: Loading base model...")
+
+        # Use shared method from parent class to load base model
+        model = self._load_modelopt_base_model(model_config)
+
+        # Import ModelOpt modules (already done in _load_modelopt_base_model, but needed here for quantization)
+        try:
+            import modelopt.torch.quantization as mtq
+            from modelopt.torch.utils.dataset_utils import create_forward_loop
+        except ImportError:
+            logger.error(
+                "NVIDIA Model Optimizer (modelopt) library not found. "
+                "Please install it to use 'modelopt_quant' feature."
+            )
+            raise
+
+        quant_choice_str = model_config.modelopt_quant
+
+        quant_cfg_name = QUANT_CFG_CHOICES.get(quant_choice_str)
+        if not quant_cfg_name:
+            raise ValueError(
+                f"Invalid modelopt_quant choice: '{quant_choice_str}'. "
+                f"Available choices in QUANT_CFG_CHOICES: {list(QUANT_CFG_CHOICES.keys())}. "
+                "Ensure QUANT_CFG_CHOICES is correctly defined with mappings to "
+                "attribute names of config objects in modelopt.torch.quantization."
+            )
+
+        try:
+            # getattr will fetch the config object, e.g., mtq.FP8_DEFAULT_CFG
+            quant_cfg = getattr(mtq, quant_cfg_name)
+        except AttributeError:
+            raise AttributeError(
+                f"ModelOpt quantization config attribute '{quant_cfg_name}' "
+                f"(from choice '{quant_choice_str}') not found in modelopt.torch.quantization module. "
+                "Please verify QUANT_CFG_CHOICES and the ModelOpt library."
+            )
+
+        # For now, assume no calibration. Calibration setup is a separate, more complex step.
+        use_calibration = False  # This would ideally be a configurable parameter
+        calib_dataloader = None  # This would need to be provided/configured
+
+        calibrate_loop = (
+            create_forward_loop(dataloader=calib_dataloader)
+            if use_calibration
+            else None
+        )
+
+        if use_calibration and calib_dataloader is None:
+            logger.warning(
+                "ModelOpt calibration requested but no calib_dataloader provided. "
+                "Proceeding without calibration. Quantization accuracy may be affected."
+            )
+
+        logger.info(
+            f"Quantizing model with ModelOpt using config attribute: mtq.{quant_cfg_name}"
+        )
+
+        try:
+            model = mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
+            logger.info("Model successfully quantized with ModelOpt.")
+        except Exception as e:
+            logger.error(f"Error during ModelOpt mtq.quantize call: {e}")
+            raise
+        mtq.print_quant_summary(model)
+
+        return model.eval()
+
+
+def get_model_loader(
+    load_config: LoadConfig, model_config: Optional[ModelConfig] = None
+) -> BaseModelLoader:
     """Get a model loader based on the load format."""
 
+    if (
+        model_config
+        and hasattr(model_config, "modelopt_quant")
+        and model_config.modelopt_quant
+    ):
+        logger.info("Using ModelOptModelLoader due to 'modelopt_quant' config.")
+        return ModelOptModelLoader(load_config)
+
     if isinstance(load_config.load_format, type):
         return load_config.load_format(load_config)
@@ -1567,4 +1873,7 @@ def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
     if load_config.load_format == LoadFormat.REMOTE:
         return RemoteModelLoader(load_config)
 
+    if load_config.load_format == LoadFormat.REMOTE_INSTANCE:
+        return RemoteInstanceModelLoader(load_config)
+
     return DefaultModelLoader(load_config)
```
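`get_model_loader` now takes an optional `model_config` and checks `modelopt_quant` before falling back to load-format dispatch. A compact, runnable sketch of that selection order, using illustrative stand-in config classes rather than sglang's:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class LoadCfg:            # stand-in for LoadConfig
    load_format: str = "auto"

@dataclass
class ModelCfg:           # stand-in for ModelConfig
    modelopt_quant: Optional[str] = None

def pick_loader(load_cfg: LoadCfg, model_cfg: Optional[ModelCfg] = None) -> str:
    # modelopt_quant wins over every load format, as in the diff above.
    if model_cfg is not None and model_cfg.modelopt_quant:
        return "ModelOptModelLoader"
    if load_cfg.load_format == "remote":
        return "RemoteModelLoader"
    if load_cfg.load_format == "remote_instance":
        return "RemoteInstanceModelLoader"
    return "DefaultModelLoader"

assert pick_loader(LoadCfg(), ModelCfg("fp8")) == "ModelOptModelLoader"
assert pick_loader(LoadCfg("remote_instance")) == "RemoteInstanceModelLoader"
```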
sglang/srt/model_loader/remote_instance_weight_loader_utils.py (new file)

```diff
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+from typing import List
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+
+def trigger_init_weights_send_group_for_remote_instance_request(
+    remote_instance_weight_loader_seed_instance_ip: str,
+    remote_instance_weight_loader_seed_instance_service_port: int,
+    remote_instance_weight_loader_send_weights_group_ports: List[int],
+    remote_instance_weight_loader_client_id: str,
+):
+    seed_instance_service_url = f"http://{remote_instance_weight_loader_seed_instance_ip}:{remote_instance_weight_loader_seed_instance_service_port}"
+    # Only support loading weights from instance with same parallelism strategy.
+    # Per TP rank pair between seed and dst instances will build a communication group for sending weights.
+    # i.e. seed TP 0 <-> dst TP 0, seed TP 1 <-> dst TP 1, etc.
+    # Each communication group will have a world size 2.
+    try:
+        requests.post(
+            f"{seed_instance_service_url}/init_weights_send_group_for_remote_instance",
+            json={
+                "master_address": remote_instance_weight_loader_seed_instance_ip,
+                "ports": (
+                    ",".join(
+                        str(p)
+                        for p in remote_instance_weight_loader_send_weights_group_ports
+                    )
+                ),
+                "group_rank": 0,
+                "world_size": 2,
+                "group_name": f"send_weights_{remote_instance_weight_loader_client_id}",
+                "backend": "nccl",
+            },
+        )
+    except Exception as e:
+        logger.error(
+            f"Failed to trigger init_weights_send_group_for_remote_instance_request to seed instance {seed_instance_service_url}: {e}."
+        )
+        raise
+
+
+def trigger_transferring_weights_request(
+    remote_instance_weight_loader_seed_instance_ip: str,
+    remote_instance_weight_loader_seed_instance_service_port: int,
+    remote_instance_weight_loader_send_weights_group_ports: List[int],
+    remote_instance_weight_loader_client_id: str,
+):
+    seed_instance_service_url = f"http://{remote_instance_weight_loader_seed_instance_ip}:{remote_instance_weight_loader_seed_instance_service_port}"
+    try:
+        requests.post(
+            f"{seed_instance_service_url}/send_weights_to_remote_instance",
+            json={
+                "master_address": remote_instance_weight_loader_seed_instance_ip,
+                "ports": (
+                    ",".join(
+                        str(p)
+                        for p in remote_instance_weight_loader_send_weights_group_ports
+                    )
+                ),
+                "group_name": f"send_weights_{remote_instance_weight_loader_client_id}",
+            },
+        )
+    except Exception as e:
+        logger.error(f"Failed to trigger send weights to remote instance request: {e}")
+        raise
```
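A hypothetical driver for these helpers, from the destination instance's point of view: first ask the seed instance to open the rank-0 ends of the per-TP-rank send-weights groups, then request the transfer that feeds the broadcast loop in `RemoteInstanceModelLoader`. All addresses and ports below are placeholders, and both instances must be running for the HTTP calls to succeed:

```python
from sglang.srt.model_loader.remote_instance_weight_loader_utils import (
    trigger_init_weights_send_group_for_remote_instance_request,
    trigger_transferring_weights_request,
)

SEED_IP = "10.0.0.1"          # seed instance that already holds the weights
SEED_SERVICE_PORT = 30000     # its HTTP service port (placeholder)
GROUP_PORTS = [29500, 29501]  # one send-weights group port per TP rank
CLIENT_ID = "10.0.0.2"        # identifies this destination instance

# 1) Ask the seed to join rank 0 of each world-size-2 NCCL group.
trigger_init_weights_send_group_for_remote_instance_request(
    SEED_IP, SEED_SERVICE_PORT, GROUP_PORTS, CLIENT_ID
)
# 2) Once the destination ranks have joined, request the transfer; each
#    parameter is then torch.distributed.broadcast from the seed side.
trigger_transferring_weights_request(
    SEED_IP, SEED_SERVICE_PORT, GROUP_PORTS, CLIENT_ID
)
```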