sglang 0.5.2rc2__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +130 -59
- sglang/srt/entrypoints/openai/protocol.py +112 -4
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +204 -55
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -6
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +190 -55
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +144 -17
- sglang/srt/managers/scheduler.py +502 -209
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +320 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +82 -40
- sglang/srt/model_executor/model_runner.py +432 -157
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +966 -267
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +99 -28
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +433 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/RECORD +375 -245
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
@@ -22,16 +22,17 @@ from typing import List, Optional, Set, Union
|
|
22
22
|
import torch
|
23
23
|
from transformers import PretrainedConfig
|
24
24
|
|
25
|
-
from sglang.srt.
|
25
|
+
from sglang.srt.environ import envs
|
26
|
+
from sglang.srt.layers.quantization import QUANTIZATION_METHODS
|
27
|
+
from sglang.srt.server_args import ServerArgs
|
28
|
+
from sglang.srt.utils import is_hip, retry
|
29
|
+
from sglang.srt.utils.hf_transformers_utils import (
|
26
30
|
get_config,
|
27
31
|
get_context_length,
|
28
32
|
get_generation_config,
|
29
33
|
get_hf_text_config,
|
30
34
|
get_sparse_attention_config,
|
31
35
|
)
|
32
|
-
from sglang.srt.layers.quantization import QUANTIZATION_METHODS
|
33
|
-
from sglang.srt.server_args import ServerArgs
|
34
|
-
from sglang.srt.utils import get_bool_env_var, is_hip
|
35
36
|
from sglang.utils import is_in_ci
|
36
37
|
|
37
38
|
logger = logging.getLogger(__name__)
|
@@ -48,6 +49,30 @@ class ModelImpl(str, Enum):
|
|
48
49
|
TRANSFORMERS = "transformers"
|
49
50
|
|
50
51
|
|
52
|
+
def is_deepseek_nsa(config: PretrainedConfig) -> bool:
|
53
|
+
return (
|
54
|
+
config.architectures is not None
|
55
|
+
and config.architectures[0]
|
56
|
+
in ["DeepseekV3ForCausalLM", "DeepseekV32ForCausalLM"]
|
57
|
+
and getattr(config, "index_topk", None) is not None
|
58
|
+
)
|
59
|
+
|
60
|
+
|
61
|
+
def get_nsa_index_head_dim(config: PretrainedConfig) -> int:
|
62
|
+
assert is_deepseek_nsa(config)
|
63
|
+
return config.index_head_dim
|
64
|
+
|
65
|
+
|
66
|
+
def get_nsa_index_topk(config: PretrainedConfig) -> int:
|
67
|
+
assert is_deepseek_nsa(config)
|
68
|
+
return config.index_topk
|
69
|
+
|
70
|
+
|
71
|
+
def get_nsa_index_n_heads(config: PretrainedConfig) -> int:
|
72
|
+
assert is_deepseek_nsa(config)
|
73
|
+
return config.index_n_heads
|
74
|
+
|
75
|
+
|
51
76
|
class ModelConfig:
|
52
77
|
def __init__(
|
53
78
|
self,
|
@@ -69,14 +94,15 @@ class ModelConfig:
|
|
69
94
|
self.model_path = model_path
|
70
95
|
self.revision = revision
|
71
96
|
self.quantization = quantization
|
97
|
+
self.is_draft_model = is_draft_model
|
72
98
|
self.model_impl = model_impl
|
73
99
|
|
74
|
-
|
100
|
+
# Get hf config
|
101
|
+
self._maybe_pull_model_tokenizer_from_remote()
|
75
102
|
self.model_override_args = json.loads(model_override_args)
|
76
103
|
kwargs = {}
|
77
104
|
if override_config_file and override_config_file.strip():
|
78
105
|
kwargs["_configuration_file"] = override_config_file.strip()
|
79
|
-
|
80
106
|
self.hf_config = get_config(
|
81
107
|
self.model_path,
|
82
108
|
trust_remote_code=trust_remote_code,
|
@@ -84,7 +110,7 @@ class ModelConfig:
|
|
84
110
|
model_override_args=self.model_override_args,
|
85
111
|
**kwargs,
|
86
112
|
)
|
87
|
-
|
113
|
+
self.hf_text_config = get_hf_text_config(self.hf_config)
|
88
114
|
self.hf_generation_config = get_generation_config(
|
89
115
|
self.model_path,
|
90
116
|
trust_remote_code=trust_remote_code,
|
@@ -92,7 +118,25 @@ class ModelConfig:
|
|
92
118
|
**kwargs,
|
93
119
|
)
|
94
120
|
|
95
|
-
|
121
|
+
# Set enable_multimodal
|
122
|
+
if enable_multimodal is None:
|
123
|
+
mm_disabled_models = [
|
124
|
+
"Gemma3ForConditionalGeneration",
|
125
|
+
"Llama4ForConditionalGeneration",
|
126
|
+
"Step3VLForConditionalGeneration",
|
127
|
+
]
|
128
|
+
if self.hf_config.architectures[0] in mm_disabled_models:
|
129
|
+
enable_multimodal = False
|
130
|
+
logger.info(
|
131
|
+
f"Multimodal is disabled for {self.hf_config.model_type}. To enable it, set --enable-multimodal."
|
132
|
+
)
|
133
|
+
else:
|
134
|
+
enable_multimodal = True
|
135
|
+
|
136
|
+
# Config draft model
|
137
|
+
self._config_draft_model()
|
138
|
+
|
139
|
+
# Check model type
|
96
140
|
self.attention_chunk_size = getattr(
|
97
141
|
self.hf_text_config, "attention_chunk_size", None
|
98
142
|
)
|
@@ -108,20 +152,70 @@ class ModelConfig:
|
|
108
152
|
self.hf_config.architectures, self.hf_text_config.num_hidden_layers
|
109
153
|
)
|
110
154
|
)
|
155
|
+
self.is_generation = is_generation_model(
|
156
|
+
self.hf_config.architectures, is_embedding
|
157
|
+
)
|
158
|
+
self.is_multimodal = enable_multimodal and is_multimodal_model(
|
159
|
+
self.hf_config.architectures
|
160
|
+
)
|
161
|
+
self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
|
162
|
+
self.hf_config.architectures
|
163
|
+
)
|
164
|
+
self.is_image_gen = enable_multimodal and is_image_gen_model(
|
165
|
+
self.hf_config.architectures
|
166
|
+
)
|
167
|
+
self.is_audio_model = enable_multimodal and is_audio_model(
|
168
|
+
self.hf_config.architectures
|
169
|
+
)
|
170
|
+
self.is_multimodal_chunked_prefill_supported = (
|
171
|
+
enable_multimodal
|
172
|
+
and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
|
173
|
+
)
|
174
|
+
self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
|
175
|
+
self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
|
111
176
|
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
177
|
+
# Derive context length and model shapes
|
178
|
+
self._derive_context_length(context_length)
|
179
|
+
self._derive_model_shapes()
|
180
|
+
|
181
|
+
# Verify quantization
|
182
|
+
self._verify_quantization()
|
183
|
+
|
184
|
+
# Verify dual-chunk attention config
|
185
|
+
self._verify_dual_chunk_attention_config()
|
186
|
+
|
187
|
+
# Cache attributes
|
188
|
+
self.hf_eos_token_id = self._get_hf_eos_token_id()
|
189
|
+
|
190
|
+
# multimodal
|
191
|
+
self.image_token_id = getattr(
|
192
|
+
self.hf_config, "image_token_id", None
|
193
|
+
) or getattr(self.hf_config, "image_token_index", None)
|
194
|
+
|
195
|
+
@staticmethod
|
196
|
+
def from_server_args(
|
197
|
+
server_args: ServerArgs,
|
198
|
+
model_path: str = None,
|
199
|
+
model_revision: str = None,
|
200
|
+
**kwargs,
|
201
|
+
):
|
202
|
+
return ModelConfig(
|
203
|
+
model_path=model_path or server_args.model_path,
|
204
|
+
trust_remote_code=server_args.trust_remote_code,
|
205
|
+
revision=model_revision or server_args.revision,
|
206
|
+
context_length=server_args.context_length,
|
207
|
+
model_override_args=server_args.json_model_override_args,
|
208
|
+
is_embedding=server_args.is_embedding,
|
209
|
+
enable_multimodal=server_args.enable_multimodal,
|
210
|
+
dtype=server_args.dtype,
|
211
|
+
quantization=server_args.quantization,
|
212
|
+
hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
|
213
|
+
model_impl=server_args.model_impl,
|
214
|
+
**kwargs,
|
215
|
+
)
|
216
|
+
|
217
|
+
def _config_draft_model(self):
|
218
|
+
is_draft_model = self.is_draft_model
|
125
219
|
|
126
220
|
if (
|
127
221
|
is_draft_model
|
@@ -141,37 +235,25 @@ class ModelConfig:
|
|
141
235
|
|
142
236
|
if is_draft_model and self.hf_config.architectures[0] == "MiMoForCausalLM":
|
143
237
|
self.hf_config.architectures[0] = "MiMoMTP"
|
238
|
+
if is_draft_model and self.hf_config.architectures[0] in [
|
239
|
+
"BailingMoeV2ForCausalLM",
|
240
|
+
"BailingMoeForCausalLM",
|
241
|
+
]:
|
242
|
+
self.hf_config.architectures[0] = "BailingMoeForCausalLMNextN"
|
144
243
|
if (
|
145
244
|
is_draft_model
|
146
245
|
and self.hf_config.architectures[0] == "Ernie4_5_MoeForCausalLM"
|
147
246
|
):
|
148
247
|
self.hf_config.architectures[0] = "Ernie4_5_MoeForCausalLMMTP"
|
149
248
|
|
150
|
-
|
151
|
-
|
152
|
-
self.hf_config.
|
153
|
-
)
|
154
|
-
self.is_multimodal = enable_multimodal and is_multimodal_model(
|
155
|
-
self.hf_config.architectures
|
156
|
-
)
|
157
|
-
self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
|
158
|
-
self.hf_config.architectures
|
159
|
-
)
|
160
|
-
self.is_image_gen = enable_multimodal and is_image_gen_model(
|
161
|
-
self.hf_config.architectures
|
162
|
-
)
|
163
|
-
self.is_audio_model = enable_multimodal and is_audio_model(
|
164
|
-
self.hf_config.architectures
|
165
|
-
)
|
166
|
-
self.is_multimodal_chunked_prefill_supported = (
|
167
|
-
enable_multimodal
|
168
|
-
and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
|
169
|
-
)
|
170
|
-
self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
|
171
|
-
self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
|
249
|
+
if is_draft_model and self.hf_config.architectures[0] == "Qwen3NextForCausalLM":
|
250
|
+
self.hf_config.architectures[0] = "Qwen3NextForCausalLMMTP"
|
251
|
+
self.hf_config.num_nextn_predict_layers = 1
|
172
252
|
|
173
|
-
|
253
|
+
def _derive_context_length(self, context_length: int):
|
254
|
+
is_draft_model = self.is_draft_model
|
174
255
|
derived_context_len = get_context_length(self.hf_text_config)
|
256
|
+
|
175
257
|
if context_length is not None:
|
176
258
|
if context_length > derived_context_len:
|
177
259
|
reason = "Target model's" if is_draft_model else "User-specified"
|
@@ -180,11 +262,16 @@ class ModelConfig:
|
|
180
262
|
f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config."
|
181
263
|
)
|
182
264
|
if (
|
183
|
-
|
265
|
+
envs.SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN.get()
|
184
266
|
or is_in_ci() # FIXME: fix this special case
|
185
267
|
):
|
186
268
|
logger.warning(msg)
|
187
269
|
self.context_len = context_length
|
270
|
+
if is_draft_model:
|
271
|
+
self.hf_text_config.max_position_embeddings = context_length
|
272
|
+
logger.warning(
|
273
|
+
f"Overriding the draft model's max_position_embeddings to {context_length}."
|
274
|
+
)
|
188
275
|
else:
|
189
276
|
raise ValueError(
|
190
277
|
f"{msg} To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
|
@@ -194,6 +281,10 @@ class ModelConfig:
|
|
194
281
|
else:
|
195
282
|
self.context_len = derived_context_len
|
196
283
|
|
284
|
+
# Transfer context_len to HuggingFace config so models can access it
|
285
|
+
self.hf_config.context_len = self.context_len
|
286
|
+
|
287
|
+
def _derive_model_shapes(self):
|
197
288
|
# Unify the config keys for hf_text_config
|
198
289
|
self.head_dim = getattr(
|
199
290
|
self.hf_text_config,
|
@@ -204,10 +295,12 @@ class ModelConfig:
|
|
204
295
|
# FIXME: temporary special judge for MLA architecture
|
205
296
|
if (
|
206
297
|
"DeepseekV2ForCausalLM" in self.hf_config.architectures
|
298
|
+
or "DeepseekV32ForCausalLM" in self.hf_config.architectures
|
207
299
|
or "DeepseekV3ForCausalLM" in self.hf_config.architectures
|
208
300
|
or "DeepseekV3ForCausalLMNextN" in self.hf_config.architectures
|
209
301
|
or "LongcatFlashForCausalLM" in self.hf_config.architectures
|
210
302
|
or "LongcatFlashForCausalLMNextN" in self.hf_config.architectures
|
303
|
+
or "DotsVLMForCausalLM" in self.hf_config.architectures
|
211
304
|
):
|
212
305
|
self.head_dim = 256
|
213
306
|
self.attention_arch = AttentionArch.MLA
|
@@ -215,6 +308,11 @@ class ModelConfig:
|
|
215
308
|
self.qk_nope_head_dim = self.hf_config.qk_nope_head_dim
|
216
309
|
self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
|
217
310
|
self.v_head_dim = self.hf_config.v_head_dim
|
311
|
+
self.index_head_dim = (
|
312
|
+
get_nsa_index_head_dim(self.hf_config)
|
313
|
+
if is_deepseek_nsa(self.hf_config)
|
314
|
+
else None
|
315
|
+
)
|
218
316
|
|
219
317
|
# Handle rope scaling with yarn
|
220
318
|
self.scaling = 1 / math.sqrt(self.qk_nope_head_dim + self.qk_rope_head_dim)
|
@@ -287,37 +385,6 @@ class ModelConfig:
|
|
287
385
|
)
|
288
386
|
self.vocab_size = self.hf_text_config.vocab_size
|
289
387
|
|
290
|
-
# Verify quantization
|
291
|
-
self._verify_quantization()
|
292
|
-
|
293
|
-
# Verify dual-chunk attention config
|
294
|
-
self._verify_dual_chunk_attention_config()
|
295
|
-
|
296
|
-
# Cache attributes
|
297
|
-
self.hf_eos_token_id = self.get_hf_eos_token_id()
|
298
|
-
|
299
|
-
# multimodal
|
300
|
-
self.image_token_id = getattr(
|
301
|
-
self.hf_config, "image_token_id", None
|
302
|
-
) or getattr(self.hf_config, "image_token_index", None)
|
303
|
-
|
304
|
-
@staticmethod
|
305
|
-
def from_server_args(server_args: ServerArgs, model_path: str = None, **kwargs):
|
306
|
-
return ModelConfig(
|
307
|
-
model_path=model_path or server_args.model_path,
|
308
|
-
trust_remote_code=server_args.trust_remote_code,
|
309
|
-
revision=server_args.revision,
|
310
|
-
context_length=server_args.context_length,
|
311
|
-
model_override_args=server_args.json_model_override_args,
|
312
|
-
is_embedding=server_args.is_embedding,
|
313
|
-
enable_multimodal=server_args.enable_multimodal,
|
314
|
-
dtype=server_args.dtype,
|
315
|
-
quantization=server_args.quantization,
|
316
|
-
hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
|
317
|
-
model_impl=server_args.model_impl,
|
318
|
-
**kwargs,
|
319
|
-
)
|
320
|
-
|
321
388
|
def get_total_num_attention_heads(self) -> int:
|
322
389
|
return self.num_attention_heads
|
323
390
|
|
@@ -412,11 +479,38 @@ class ModelConfig:
|
|
412
479
|
is_local = os.path.exists(self.model_path)
|
413
480
|
modelopt_quant_config = {"quant_method": "modelopt"}
|
414
481
|
if not is_local:
|
415
|
-
|
482
|
+
import huggingface_hub
|
483
|
+
|
484
|
+
try:
|
485
|
+
from huggingface_hub import HfApi
|
486
|
+
|
487
|
+
hf_api = HfApi()
|
488
|
+
|
489
|
+
def check_hf_quant_config():
|
490
|
+
return hf_api.file_exists(
|
491
|
+
self.model_path, "hf_quant_config.json"
|
492
|
+
)
|
493
|
+
|
494
|
+
# Retry HF API call up to 3 times
|
495
|
+
file_exists = retry(
|
496
|
+
check_hf_quant_config,
|
497
|
+
max_retry=2,
|
498
|
+
initial_delay=1.0,
|
499
|
+
max_delay=5.0,
|
500
|
+
)
|
501
|
+
|
502
|
+
if file_exists:
|
503
|
+
quant_cfg = modelopt_quant_config
|
504
|
+
|
505
|
+
except huggingface_hub.errors.OfflineModeIsEnabled:
|
506
|
+
logger.warning(
|
507
|
+
"Offline mode is enabled, skipping hf_quant_config.json check"
|
508
|
+
)
|
509
|
+
except Exception as e:
|
510
|
+
logger.warning(
|
511
|
+
f"Failed to check hf_quant_config.json: {self.model_path} {e}"
|
512
|
+
)
|
416
513
|
|
417
|
-
hf_api = HfApi()
|
418
|
-
if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
|
419
|
-
quant_cfg = modelopt_quant_config
|
420
514
|
elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
|
421
515
|
quant_config_file = os.path.join(
|
422
516
|
self.model_path, "hf_quant_config.json"
|
@@ -543,7 +637,7 @@ class ModelConfig:
|
|
543
637
|
"sparse_attention_enabled"
|
544
638
|
] = True
|
545
639
|
|
546
|
-
def get_hf_eos_token_id(self) -> Optional[Set[int]]:
|
640
|
+
def _get_hf_eos_token_id(self) -> Optional[Set[int]]:
|
547
641
|
eos_ids = getattr(self.hf_config, "eos_token_id", None)
|
548
642
|
if eos_ids is not None:
|
549
643
|
# it can be either int or list of int
|
@@ -563,7 +657,7 @@ class ModelConfig:
|
|
563
657
|
eos_ids = eos_ids | generation_eos_ids
|
564
658
|
return eos_ids
|
565
659
|
|
566
|
-
def maybe_pull_model_tokenizer_from_remote(self) -> None:
|
660
|
+
def _maybe_pull_model_tokenizer_from_remote(self) -> None:
|
567
661
|
"""
|
568
662
|
Pull the model config files to a temporary
|
569
663
|
directory in case of remote.
|
@@ -706,12 +800,17 @@ multimodal_model_archs = [
|
|
706
800
|
"Qwen2AudioForConditionalGeneration",
|
707
801
|
"Qwen2VLForConditionalGeneration",
|
708
802
|
"Qwen2_5_VLForConditionalGeneration",
|
803
|
+
"Qwen3VLForConditionalGeneration",
|
804
|
+
"Qwen3VLMoeForConditionalGeneration",
|
709
805
|
"KimiVLForConditionalGeneration",
|
710
806
|
"InternVLChatModel",
|
711
807
|
"InternS1ForConditionalGeneration",
|
712
808
|
"Phi4MMForCausalLM",
|
713
809
|
"VILAForConditionalGeneration",
|
714
810
|
"Step3VLForConditionalGeneration",
|
811
|
+
"DotsVLMForCausalLM",
|
812
|
+
"DotsOCRForCausalLM",
|
813
|
+
"Sarashina2VisionForCausalLM",
|
715
814
|
]
|
716
815
|
|
717
816
|
|