sglang 0.5.2rc1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/interpreter.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +192 -113
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +132 -57
- sglang/srt/entrypoints/openai/protocol.py +115 -7
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +207 -58
- sglang/srt/entrypoints/openai/serving_completions.py +17 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +10 -4
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +49 -4
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +24 -1
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +106 -82
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +53 -7
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -7
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +225 -57
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +77 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +78 -49
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +215 -314
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +358 -404
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +147 -19
- sglang/srt/managers/scheduler.py +501 -304
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +119 -40
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +321 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +15 -21
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +58 -34
- sglang/srt/mem_cache/hiradix_cache.py +227 -80
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -223
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +268 -63
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +198 -30
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +519 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +55 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +98 -57
- sglang/srt/model_executor/model_runner.py +433 -158
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +833 -152
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +14 -5
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +124 -14
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +26 -5
- sglang/srt/models/qwen3_moe.py +71 -12
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +10 -3
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +6 -0
- sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +1030 -254
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +253 -136
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +445 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +22 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/RECORD +392 -258
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
- /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,9 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from typing import TYPE_CHECKING
|
4
|
-
|
5
3
|
import torch
|
6
4
|
|
7
5
|
from sglang.srt.mem_cache.allocator import PagedTokenToKVPoolAllocator
|
8
|
-
|
9
|
-
if TYPE_CHECKING:
|
10
|
-
from sglang.srt.mem_cache.memory_pool import KVCache
|
6
|
+
from sglang.srt.utils import get_num_new_pages
|
11
7
|
|
12
8
|
|
13
9
|
def alloc_extend_kernel_ascend(
|
@@ -69,7 +65,9 @@ class AscendPagedTokenToKVPoolAllocator(PagedTokenToKVPoolAllocator):
|
|
69
65
|
def alloc_extend(
|
70
66
|
self,
|
71
67
|
prefix_lens: torch.Tensor,
|
68
|
+
prefix_lens_cpu: torch.Tensor,
|
72
69
|
seq_lens: torch.Tensor,
|
70
|
+
seq_lens_cpu: torch.Tensor,
|
73
71
|
last_loc: torch.Tensor,
|
74
72
|
extend_num_tokens: int,
|
75
73
|
):
|
@@ -79,42 +77,54 @@ class AscendPagedTokenToKVPoolAllocator(PagedTokenToKVPoolAllocator):
|
|
79
77
|
)
|
80
78
|
|
81
79
|
num_new_pages = (
|
82
|
-
(
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
.item()
|
88
|
-
)
|
89
|
-
if self.need_sort and num_new_pages > len(self.free_pages):
|
80
|
+
(seq_lens + self.page_size - 1) // self.page_size
|
81
|
+
- (prefix_lens + self.page_size - 1) // self.page_size
|
82
|
+
).sum()
|
83
|
+
num_new_pages_item = num_new_pages.item()
|
84
|
+
if self.need_sort and num_new_pages_item > len(self.free_pages):
|
90
85
|
self.merge_and_sort_free()
|
91
86
|
|
92
|
-
if
|
87
|
+
if num_new_pages_item > len(self.free_pages):
|
93
88
|
return None
|
94
89
|
|
95
90
|
out_indices = torch.empty(
|
96
|
-
(extend_num_tokens,), dtype=torch.
|
91
|
+
(extend_num_tokens,), dtype=torch.int64, device=self.device
|
97
92
|
)
|
98
93
|
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
94
|
+
if num_new_pages_item < 200:
|
95
|
+
import sgl_kernel_npu
|
96
|
+
|
97
|
+
torch.ops.npu.alloc_extend(
|
98
|
+
prefix_lens,
|
99
|
+
seq_lens,
|
100
|
+
last_loc,
|
101
|
+
self.free_pages,
|
102
|
+
self.page_size,
|
103
|
+
out_indices,
|
104
|
+
num_new_pages,
|
105
|
+
)
|
106
|
+
|
107
|
+
else:
|
108
|
+
alloc_extend_kernel_ascend(
|
109
|
+
prefix_lens,
|
110
|
+
seq_lens,
|
111
|
+
last_loc,
|
112
|
+
self.free_pages,
|
113
|
+
out_indices,
|
114
|
+
self.page_size,
|
115
|
+
self.device,
|
116
|
+
)
|
108
117
|
|
109
118
|
if self.debug_mode:
|
110
119
|
assert len(torch.unique(out_indices)) == len(out_indices)
|
111
120
|
|
112
|
-
self.free_pages = self.free_pages[
|
121
|
+
self.free_pages = self.free_pages[num_new_pages_item:]
|
113
122
|
return out_indices
|
114
123
|
|
115
124
|
def alloc_decode(
|
116
125
|
self,
|
117
126
|
seq_lens: torch.Tensor,
|
127
|
+
seq_lens_cpu: torch.Tensor,
|
118
128
|
last_loc: torch.Tensor,
|
119
129
|
):
|
120
130
|
if self.debug_mode:
|
@@ -122,8 +132,11 @@ class AscendPagedTokenToKVPoolAllocator(PagedTokenToKVPoolAllocator):
|
|
122
132
|
(last_loc + 2) % self.page_size == seq_lens % self.page_size
|
123
133
|
)
|
124
134
|
|
125
|
-
|
126
|
-
|
135
|
+
num_new_pages = get_num_new_pages(
|
136
|
+
seq_lens=seq_lens_cpu,
|
137
|
+
page_size=self.page_size,
|
138
|
+
decode=True,
|
139
|
+
)
|
127
140
|
|
128
141
|
if num_new_pages > len(self.free_pages):
|
129
142
|
self.merge_and_sort_free()
|
@@ -131,6 +144,7 @@ class AscendPagedTokenToKVPoolAllocator(PagedTokenToKVPoolAllocator):
|
|
131
144
|
if num_new_pages > len(self.free_pages):
|
132
145
|
return None
|
133
146
|
|
147
|
+
need_new_pages = (seq_lens % self.page_size == 1).int()
|
134
148
|
end_new_pages = torch.cumsum(need_new_pages, 0)
|
135
149
|
start_new_pages = end_new_pages - need_new_pages
|
136
150
|
if num_new_pages == 0:
|
@@ -28,6 +28,13 @@ class ChunkCache(BasePrefixCache):
|
|
28
28
|
self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
|
29
29
|
self.page_size = page_size
|
30
30
|
|
31
|
+
# NOTE (csy): this is to determine if a cache has prefix matching feature.
|
32
|
+
# Chunk cache always return True to indicate no prefix matching.
|
33
|
+
# TODO (csy): Using a prefix cache trait to replace this
|
34
|
+
@property
|
35
|
+
def disable(self):
|
36
|
+
return True
|
37
|
+
|
31
38
|
def reset(self):
|
32
39
|
pass
|
33
40
|
|
@@ -38,7 +45,7 @@ class ChunkCache(BasePrefixCache):
|
|
38
45
|
last_host_node=None,
|
39
46
|
)
|
40
47
|
|
41
|
-
def cache_finished_req(self, req: Req):
|
48
|
+
def cache_finished_req(self, req: Req, insert: bool = True):
|
42
49
|
kv_indices = self.req_to_token_pool.req_to_token[
|
43
50
|
req.req_pool_idx,
|
44
51
|
# For decode server: if req.output_ids is empty, we want to free all req.origin_input_ids
|
@@ -0,0 +1,23 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from abc import ABC, abstractmethod
|
4
|
+
from typing import TYPE_CHECKING, List, Tuple, Union
|
5
|
+
|
6
|
+
if TYPE_CHECKING:
|
7
|
+
from sglang.srt.mem_cache.radix_cache import TreeNode
|
8
|
+
|
9
|
+
|
10
|
+
class EvictionStrategy(ABC):
|
11
|
+
@abstractmethod
|
12
|
+
def get_priority(self, node: "TreeNode") -> Union[float, Tuple]:
|
13
|
+
pass
|
14
|
+
|
15
|
+
|
16
|
+
class LRUStrategy(EvictionStrategy):
|
17
|
+
def get_priority(self, node: "TreeNode") -> float:
|
18
|
+
return node.last_access_time
|
19
|
+
|
20
|
+
|
21
|
+
class LFUStrategy(EvictionStrategy):
|
22
|
+
def get_priority(self, node: "TreeNode") -> Tuple[int, float]:
|
23
|
+
return (node.hit_count, node.last_access_time)
|
@@ -7,6 +7,8 @@ from typing import Any, List, Optional
|
|
7
7
|
|
8
8
|
import torch
|
9
9
|
|
10
|
+
from sglang.srt.mem_cache.memory_pool_host import HostKVCache
|
11
|
+
|
10
12
|
logger = logging.getLogger(__name__)
|
11
13
|
|
12
14
|
|
@@ -27,19 +29,51 @@ class HiCacheStorageConfig:
|
|
27
29
|
tp_rank: int
|
28
30
|
tp_size: int
|
29
31
|
is_mla_model: bool
|
32
|
+
is_page_first_layout: bool
|
30
33
|
model_name: Optional[str]
|
31
34
|
extra_config: Optional[dict] = None
|
32
35
|
|
33
36
|
|
37
|
+
@dataclass
|
38
|
+
class HiCacheStorageExtraInfo:
|
39
|
+
extra_info: Optional[dict] = None
|
40
|
+
|
41
|
+
|
34
42
|
class HiCacheStorage(ABC):
|
35
43
|
"""
|
36
44
|
HiCacheStorage is a class that provides a generic key-value interface for storing and retrieving KV cache.
|
37
45
|
It abstracts the underlying storage mechanism, allowing different implementations to be used.
|
38
46
|
"""
|
39
47
|
|
40
|
-
# todo, potentially pass model and TP configs into storage backend
|
41
48
|
# todo, the page size of storage backend does not have to be the same as the same as host memory pool
|
42
49
|
|
50
|
+
def register_mem_pool_host(self, mem_pool_host: HostKVCache):
|
51
|
+
self.mem_pool_host = mem_pool_host
|
52
|
+
|
53
|
+
def batch_get_v1(
|
54
|
+
self,
|
55
|
+
keys: List[str],
|
56
|
+
host_indices: torch.Tensor,
|
57
|
+
extra_info: Optional[HiCacheStorageExtraInfo] = None,
|
58
|
+
) -> List[bool]:
|
59
|
+
"""
|
60
|
+
Retrieve values for multiple keys.
|
61
|
+
Returns a list of tensors or None for each key.
|
62
|
+
"""
|
63
|
+
pass
|
64
|
+
|
65
|
+
def batch_set_v1(
|
66
|
+
self,
|
67
|
+
keys: List[str],
|
68
|
+
host_indices: torch.Tensor,
|
69
|
+
extra_info: Optional[HiCacheStorageExtraInfo] = None,
|
70
|
+
) -> List[bool]:
|
71
|
+
"""
|
72
|
+
Retrieve values for multiple keys.
|
73
|
+
Returns a list of tensors or None for each key.
|
74
|
+
"""
|
75
|
+
pass
|
76
|
+
|
43
77
|
@abstractmethod
|
44
78
|
def get(
|
45
79
|
self,
|
@@ -53,6 +87,7 @@ class HiCacheStorage(ABC):
|
|
53
87
|
"""
|
54
88
|
pass
|
55
89
|
|
90
|
+
# TODO: Deprecate
|
56
91
|
@abstractmethod
|
57
92
|
def batch_get(
|
58
93
|
self,
|
@@ -80,6 +115,7 @@ class HiCacheStorage(ABC):
|
|
80
115
|
"""
|
81
116
|
pass
|
82
117
|
|
118
|
+
# TODO: Deprecate
|
83
119
|
@abstractmethod
|
84
120
|
def batch_set(
|
85
121
|
self,
|
@@ -102,20 +138,7 @@ class HiCacheStorage(ABC):
|
|
102
138
|
"""
|
103
139
|
pass
|
104
140
|
|
105
|
-
|
106
|
-
def delete(self, key: str) -> bool:
|
107
|
-
"""
|
108
|
-
Delete the entry associated with the given key.
|
109
|
-
"""
|
110
|
-
pass
|
111
|
-
|
112
|
-
@abstractmethod
|
113
|
-
def clear(self) -> bool:
|
114
|
-
"""
|
115
|
-
Clear all entries in the storage.
|
116
|
-
"""
|
117
|
-
pass
|
118
|
-
|
141
|
+
# TODO: Use a finer-grained return type (e.g., List[bool])
|
119
142
|
def batch_exists(self, keys: List[str]) -> int:
|
120
143
|
"""
|
121
144
|
Check if the keys exist in the storage.
|
@@ -127,6 +150,12 @@ class HiCacheStorage(ABC):
|
|
127
150
|
return i
|
128
151
|
return len(keys)
|
129
152
|
|
153
|
+
def clear(self) -> None:
|
154
|
+
pass
|
155
|
+
|
156
|
+
def get_stats(self):
|
157
|
+
return None
|
158
|
+
|
130
159
|
|
131
160
|
class HiCacheFile(HiCacheStorage):
|
132
161
|
|
@@ -135,18 +164,24 @@ class HiCacheFile(HiCacheStorage):
|
|
135
164
|
):
|
136
165
|
self.file_path = os.getenv("SGLANG_HICACHE_FILE_BACKEND_STORAGE_DIR", file_path)
|
137
166
|
|
138
|
-
tp_rank, tp_size,
|
167
|
+
tp_rank, tp_size, model_name, is_mla_model = (
|
139
168
|
storage_config.tp_rank,
|
140
169
|
storage_config.tp_size,
|
170
|
+
storage_config.model_name,
|
141
171
|
storage_config.is_mla_model,
|
142
172
|
)
|
143
|
-
|
173
|
+
model_name = "-".join(model_name.split("/")) if model_name else ""
|
174
|
+
if is_mla_model:
|
175
|
+
self.config_suffix = f"_{model_name}"
|
176
|
+
else:
|
177
|
+
self.config_suffix = f"_{model_name}_{tp_rank}_{tp_size}"
|
178
|
+
|
144
179
|
if not os.path.exists(self.file_path) and tp_rank == 0:
|
145
180
|
os.makedirs(self.file_path)
|
146
181
|
logger.info(f"Created HiCacheFile storage directory at {self.file_path}")
|
147
182
|
|
148
183
|
def _get_suffixed_key(self, key: str) -> str:
|
149
|
-
return key + self.
|
184
|
+
return key + self.config_suffix
|
150
185
|
|
151
186
|
def get(
|
152
187
|
self,
|
@@ -157,13 +192,11 @@ class HiCacheFile(HiCacheStorage):
|
|
157
192
|
key = self._get_suffixed_key(key)
|
158
193
|
tensor_path = os.path.join(self.file_path, f"{key}.bin")
|
159
194
|
try:
|
160
|
-
|
161
|
-
with open(tensor_path, "rb") as f:
|
162
|
-
target_location.
|
163
|
-
|
164
|
-
|
165
|
-
.untyped_storage()
|
166
|
-
)
|
195
|
+
expected = target_location.numel() * target_location.element_size()
|
196
|
+
with open(tensor_path, "rb", buffering=0) as f:
|
197
|
+
buf = memoryview(target_location.view(torch.uint8).contiguous().numpy())
|
198
|
+
if f.readinto(buf) != expected:
|
199
|
+
raise IOError(f"Short read for {key}")
|
167
200
|
return target_location
|
168
201
|
except FileNotFoundError:
|
169
202
|
logger.warning(f"Failed to fetch {key} from HiCacheFile storage.")
|
@@ -219,15 +252,6 @@ class HiCacheFile(HiCacheStorage):
|
|
219
252
|
tensor_path = os.path.join(self.file_path, f"{key}.bin")
|
220
253
|
return os.path.exists(tensor_path)
|
221
254
|
|
222
|
-
def delete(self, key: str) -> None:
|
223
|
-
key = self._get_suffixed_key(key)
|
224
|
-
tensor_path = os.path.join(self.file_path, f"{key}.bin")
|
225
|
-
try:
|
226
|
-
os.remove(tensor_path)
|
227
|
-
except FileNotFoundError:
|
228
|
-
logger.warning(f"Key {key} does not exist. Cannot delete.")
|
229
|
-
return
|
230
|
-
|
231
255
|
def clear(self) -> bool:
|
232
256
|
try:
|
233
257
|
for filename in os.listdir(self.file_path):
|