sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only.
- sglang/bench_one_batch.py +47 -28
- sglang/bench_one_batch_server.py +41 -25
- sglang/bench_serving.py +330 -156
- sglang/check_env.py +1 -1
- sglang/compile_deep_gemm.py +6 -2
- sglang/global_config.py +1 -25
- sglang/lang/api.py +6 -0
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +13 -0
- sglang/launch_server.py +8 -15
- sglang/profiler.py +18 -1
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
- sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
- sglang/srt/compilation/backend.py +437 -0
- sglang/srt/compilation/compilation_config.py +20 -0
- sglang/srt/compilation/compilation_counter.py +47 -0
- sglang/srt/compilation/compile.py +210 -0
- sglang/srt/compilation/compiler_interface.py +503 -0
- sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
- sglang/srt/compilation/fix_functionalization.py +134 -0
- sglang/srt/compilation/fx_utils.py +83 -0
- sglang/srt/compilation/inductor_pass.py +140 -0
- sglang/srt/compilation/pass_manager.py +66 -0
- sglang/srt/compilation/piecewise_context_manager.py +40 -0
- sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/deepseek_ocr.py +262 -0
- sglang/srt/configs/deepseekvl2.py +194 -96
- sglang/srt/configs/dots_vlm.py +2 -7
- sglang/srt/configs/falcon_h1.py +13 -64
- sglang/srt/configs/load_config.py +25 -2
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +134 -23
- sglang/srt/configs/modelopt_config.py +30 -0
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/olmo3.py +105 -0
- sglang/srt/configs/points_v15_chat.py +29 -0
- sglang/srt/configs/qwen3_next.py +11 -47
- sglang/srt/configs/qwen3_omni.py +613 -0
- sglang/srt/configs/qwen3_vl.py +0 -10
- sglang/srt/connector/remote_instance.py +1 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/constrained/llguidance_backend.py +5 -0
- sglang/srt/constrained/outlines_backend.py +1 -1
- sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
- sglang/srt/constrained/utils.py +12 -0
- sglang/srt/constrained/xgrammar_backend.py +20 -11
- sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
- sglang/srt/disaggregation/base/conn.py +17 -4
- sglang/srt/disaggregation/common/conn.py +4 -2
- sglang/srt/disaggregation/decode.py +123 -31
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +11 -3
- sglang/srt/disaggregation/mooncake/conn.py +157 -19
- sglang/srt/disaggregation/nixl/conn.py +69 -24
- sglang/srt/disaggregation/prefill.py +96 -270
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
- sglang/srt/distributed/device_communicators/pynccl.py +24 -12
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
- sglang/srt/distributed/naive_distributed.py +5 -4
- sglang/srt/distributed/parallel_state.py +70 -19
- sglang/srt/elastic_ep/elastic_ep.py +74 -0
- sglang/srt/entrypoints/context.py +3 -2
- sglang/srt/entrypoints/engine.py +66 -66
- sglang/srt/entrypoints/grpc_server.py +431 -234
- sglang/srt/entrypoints/harmony_utils.py +2 -2
- sglang/srt/entrypoints/http_server.py +120 -8
- sglang/srt/entrypoints/http_server_engine.py +1 -7
- sglang/srt/entrypoints/openai/protocol.py +225 -37
- sglang/srt/entrypoints/openai/serving_base.py +49 -2
- sglang/srt/entrypoints/openai/serving_chat.py +29 -74
- sglang/srt/entrypoints/openai/serving_classify.py +204 -0
- sglang/srt/entrypoints/openai/serving_completions.py +15 -1
- sglang/srt/entrypoints/openai/serving_responses.py +5 -2
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +42 -4
- sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
- sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
- sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
- sglang/srt/eplb/expert_distribution.py +3 -4
- sglang/srt/eplb/expert_location_dispatch.py +2 -2
- sglang/srt/eplb/expert_location_updater.py +2 -2
- sglang/srt/function_call/base_format_detector.py +17 -18
- sglang/srt/function_call/function_call_parser.py +18 -14
- sglang/srt/function_call/glm4_moe_detector.py +1 -5
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/function_call/json_array_parser.py +0 -2
- sglang/srt/function_call/utils.py +2 -2
- sglang/srt/grpc/compile_proto.py +3 -3
- sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
- sglang/srt/grpc/health_servicer.py +189 -0
- sglang/srt/grpc/scheduler_launcher.py +181 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
- sglang/srt/layers/activation.py +4 -1
- sglang/srt/layers/attention/aiter_backend.py +3 -3
- sglang/srt/layers/attention/ascend_backend.py +17 -1
- sglang/srt/layers/attention/attention_registry.py +43 -23
- sglang/srt/layers/attention/base_attn_backend.py +20 -1
- sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
- sglang/srt/layers/attention/fla/chunk.py +0 -1
- sglang/srt/layers/attention/fla/chunk_o.py +1 -1
- sglang/srt/layers/attention/fla/index.py +0 -2
- sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
- sglang/srt/layers/attention/fla/utils.py +0 -3
- sglang/srt/layers/attention/fla/wy_fast.py +0 -2
- sglang/srt/layers/attention/flashattention_backend.py +12 -8
- sglang/srt/layers/attention/flashinfer_backend.py +248 -21
- sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
- sglang/srt/layers/attention/flashmla_backend.py +2 -2
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
- sglang/srt/layers/attention/intel_amx_backend.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
- sglang/srt/layers/attention/mamba/mamba.py +189 -241
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
- sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
- sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
- sglang/srt/layers/attention/nsa/utils.py +0 -1
- sglang/srt/layers/attention/nsa_backend.py +404 -90
- sglang/srt/layers/attention/triton_backend.py +208 -34
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
- sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
- sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
- sglang/srt/layers/attention/utils.py +11 -7
- sglang/srt/layers/attention/vision.py +3 -3
- sglang/srt/layers/attention/xpu_backend.py +1028 -0
- sglang/srt/layers/communicator.py +11 -7
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
- sglang/srt/layers/dp_attention.py +17 -0
- sglang/srt/layers/layernorm.py +45 -15
- sglang/srt/layers/linear.py +9 -1
- sglang/srt/layers/logits_processor.py +147 -17
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_moe.py +0 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
- sglang/srt/layers/moe/ep_moe/layer.py +119 -397
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
- sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/moe_runner/triton.py +3 -1
- sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
- sglang/srt/layers/moe/router.py +51 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
- sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
- sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
- sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
- sglang/srt/layers/moe/topk.py +3 -2
- sglang/srt/layers/moe/utils.py +17 -1
- sglang/srt/layers/quantization/__init__.py +2 -53
- sglang/srt/layers/quantization/awq.py +183 -6
- sglang/srt/layers/quantization/awq_triton.py +29 -0
- sglang/srt/layers/quantization/base_config.py +20 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
- sglang/srt/layers/quantization/fp8.py +84 -18
- sglang/srt/layers/quantization/fp8_kernel.py +55 -10
- sglang/srt/layers/quantization/fp8_utils.py +42 -14
- sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
- sglang/srt/layers/quantization/gptq.py +0 -1
- sglang/srt/layers/quantization/int8_kernel.py +18 -2
- sglang/srt/layers/quantization/marlin_utils.py +12 -0
- sglang/srt/layers/quantization/modelopt_quant.py +125 -100
- sglang/srt/layers/quantization/mxfp4.py +5 -30
- sglang/srt/layers/quantization/petit.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
- sglang/srt/layers/quantization/unquant.py +1 -4
- sglang/srt/layers/quantization/utils.py +0 -1
- sglang/srt/layers/quantization/w4afp8.py +51 -20
- sglang/srt/layers/quantization/w8a8_int8.py +30 -24
- sglang/srt/layers/radix_attention.py +59 -9
- sglang/srt/layers/rotary_embedding.py +673 -16
- sglang/srt/layers/sampler.py +36 -16
- sglang/srt/layers/sparse_pooler.py +98 -0
- sglang/srt/layers/utils.py +0 -1
- sglang/srt/layers/vocab_parallel_embedding.py +4 -1
- sglang/srt/lora/backend/triton_backend.py +0 -1
- sglang/srt/lora/eviction_policy.py +139 -0
- sglang/srt/lora/lora_manager.py +24 -9
- sglang/srt/lora/lora_registry.py +1 -1
- sglang/srt/lora/mem_pool.py +40 -16
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
- sglang/srt/managers/cache_controller.py +48 -17
- sglang/srt/managers/data_parallel_controller.py +146 -42
- sglang/srt/managers/detokenizer_manager.py +40 -13
- sglang/srt/managers/io_struct.py +66 -16
- sglang/srt/managers/mm_utils.py +20 -18
- sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
- sglang/srt/managers/overlap_utils.py +96 -19
- sglang/srt/managers/schedule_batch.py +241 -511
- sglang/srt/managers/schedule_policy.py +15 -2
- sglang/srt/managers/scheduler.py +399 -499
- sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
- sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
- sglang/srt/managers/scheduler_pp_mixin.py +341 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
- sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
- sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
- sglang/srt/managers/tokenizer_manager.py +378 -90
- sglang/srt/managers/tp_worker.py +212 -161
- sglang/srt/managers/utils.py +78 -2
- sglang/srt/mem_cache/allocator.py +7 -2
- sglang/srt/mem_cache/allocator_ascend.py +2 -2
- sglang/srt/mem_cache/base_prefix_cache.py +2 -2
- sglang/srt/mem_cache/chunk_cache.py +13 -2
- sglang/srt/mem_cache/common.py +480 -0
- sglang/srt/mem_cache/evict_policy.py +16 -1
- sglang/srt/mem_cache/hicache_storage.py +4 -1
- sglang/srt/mem_cache/hiradix_cache.py +16 -3
- sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
- sglang/srt/mem_cache/memory_pool.py +435 -219
- sglang/srt/mem_cache/memory_pool_host.py +0 -1
- sglang/srt/mem_cache/multimodal_cache.py +0 -1
- sglang/srt/mem_cache/radix_cache.py +53 -19
- sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
- sglang/srt/mem_cache/storage/backend_factory.py +2 -2
- sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
- sglang/srt/mem_cache/swa_radix_cache.py +92 -26
- sglang/srt/metrics/collector.py +31 -0
- sglang/srt/metrics/func_timer.py +1 -1
- sglang/srt/model_executor/cuda_graph_runner.py +43 -5
- sglang/srt/model_executor/forward_batch_info.py +28 -23
- sglang/srt/model_executor/model_runner.py +379 -139
- sglang/srt/model_executor/npu_graph_runner.py +2 -3
- sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +424 -27
- sglang/srt/model_loader/utils.py +0 -1
- sglang/srt/model_loader/weight_utils.py +47 -28
- sglang/srt/models/apertus.py +2 -3
- sglang/srt/models/arcee.py +2 -2
- sglang/srt/models/bailing_moe.py +13 -52
- sglang/srt/models/bailing_moe_nextn.py +3 -4
- sglang/srt/models/bert.py +1 -1
- sglang/srt/models/deepseek_nextn.py +19 -3
- sglang/srt/models/deepseek_ocr.py +1516 -0
- sglang/srt/models/deepseek_v2.py +273 -98
- sglang/srt/models/dots_ocr.py +0 -2
- sglang/srt/models/dots_vlm.py +0 -1
- sglang/srt/models/dots_vlm_vit.py +1 -1
- sglang/srt/models/falcon_h1.py +13 -19
- sglang/srt/models/gemma3_mm.py +16 -0
- sglang/srt/models/gemma3n_mm.py +1 -2
- sglang/srt/models/glm4_moe.py +14 -37
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +2 -1
- sglang/srt/models/glm4v_moe.py +5 -5
- sglang/srt/models/gpt_oss.py +5 -5
- sglang/srt/models/grok.py +10 -23
- sglang/srt/models/hunyuan.py +2 -7
- sglang/srt/models/interns1.py +0 -1
- sglang/srt/models/kimi_vl.py +1 -7
- sglang/srt/models/kimi_vl_moonvit.py +3 -1
- sglang/srt/models/llama.py +2 -2
- sglang/srt/models/llama_eagle3.py +1 -1
- sglang/srt/models/longcat_flash.py +5 -22
- sglang/srt/models/longcat_flash_nextn.py +3 -14
- sglang/srt/models/mimo.py +2 -13
- sglang/srt/models/mimo_mtp.py +1 -2
- sglang/srt/models/minicpmo.py +7 -5
- sglang/srt/models/mixtral.py +1 -4
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/mllama4.py +13 -3
- sglang/srt/models/nemotron_h.py +511 -0
- sglang/srt/models/olmo2.py +31 -4
- sglang/srt/models/opt.py +5 -5
- sglang/srt/models/phi.py +1 -1
- sglang/srt/models/phi4mm.py +1 -1
- sglang/srt/models/phimoe.py +0 -1
- sglang/srt/models/pixtral.py +0 -3
- sglang/srt/models/points_v15_chat.py +186 -0
- sglang/srt/models/qwen.py +0 -1
- sglang/srt/models/qwen2_5_vl.py +3 -3
- sglang/srt/models/qwen2_audio.py +2 -15
- sglang/srt/models/qwen2_moe.py +15 -12
- sglang/srt/models/qwen2_vl.py +5 -2
- sglang/srt/models/qwen3_moe.py +19 -35
- sglang/srt/models/qwen3_next.py +7 -12
- sglang/srt/models/qwen3_next_mtp.py +3 -4
- sglang/srt/models/qwen3_omni_moe.py +661 -0
- sglang/srt/models/qwen3_vl.py +37 -33
- sglang/srt/models/qwen3_vl_moe.py +57 -185
- sglang/srt/models/roberta.py +55 -3
- sglang/srt/models/sarashina2_vision.py +0 -1
- sglang/srt/models/step3_vl.py +3 -5
- sglang/srt/models/utils.py +11 -1
- sglang/srt/multimodal/processors/base_processor.py +6 -2
- sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
- sglang/srt/multimodal/processors/dots_vlm.py +0 -1
- sglang/srt/multimodal/processors/glm4v.py +1 -5
- sglang/srt/multimodal/processors/internvl.py +0 -2
- sglang/srt/multimodal/processors/janus_pro.py +0 -1
- sglang/srt/multimodal/processors/mllama4.py +0 -8
- sglang/srt/multimodal/processors/phi4mm.py +0 -1
- sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
- sglang/srt/multimodal/processors/qwen_vl.py +75 -16
- sglang/srt/multimodal/processors/step3_vl.py +1 -1
- sglang/srt/parser/conversation.py +41 -0
- sglang/srt/parser/reasoning_parser.py +0 -1
- sglang/srt/sampling/custom_logit_processor.py +77 -2
- sglang/srt/sampling/sampling_batch_info.py +17 -22
- sglang/srt/sampling/sampling_params.py +70 -2
- sglang/srt/server_args.py +577 -73
- sglang/srt/server_args_config_parser.py +1 -1
- sglang/srt/single_batch_overlap.py +38 -28
- sglang/srt/speculative/base_spec_worker.py +34 -0
- sglang/srt/speculative/draft_utils.py +226 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
- sglang/srt/speculative/eagle_info.py +57 -18
- sglang/srt/speculative/eagle_info_v2.py +458 -0
- sglang/srt/speculative/eagle_utils.py +138 -0
- sglang/srt/speculative/eagle_worker.py +83 -280
- sglang/srt/speculative/eagle_worker_v2.py +702 -0
- sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
- sglang/srt/speculative/ngram_worker.py +12 -11
- sglang/srt/speculative/spec_info.py +2 -0
- sglang/srt/speculative/spec_utils.py +38 -3
- sglang/srt/speculative/standalone_worker.py +4 -14
- sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
- sglang/srt/two_batch_overlap.py +28 -14
- sglang/srt/utils/__init__.py +1 -1
- sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
- sglang/srt/utils/common.py +192 -47
- sglang/srt/utils/hf_transformers_utils.py +40 -17
- sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
- sglang/srt/{offloader.py → utils/offloader.py} +4 -4
- sglang/srt/utils/profile_merger.py +199 -0
- sglang/test/attention/test_flashattn_backend.py +1 -1
- sglang/test/attention/test_flashattn_mla_backend.py +0 -1
- sglang/test/attention/test_prefix_chunk_info.py +0 -2
- sglang/test/attention/test_trtllm_mla_backend.py +221 -53
- sglang/test/few_shot_gsm8k_engine.py +2 -4
- sglang/test/kit_matched_stop.py +157 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +41 -0
- sglang/test/runners.py +2 -0
- sglang/test/send_one.py +42 -7
- sglang/test/simple_eval_common.py +3 -0
- sglang/test/simple_eval_gpqa.py +0 -1
- sglang/test/simple_eval_humaneval.py +0 -3
- sglang/test/simple_eval_longbench_v2.py +344 -0
- sglang/test/test_block_fp8.py +1 -2
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
- sglang/test/test_cutlass_moe.py +1 -2
- sglang/test/test_cutlass_w4a8_moe.py +10 -20
- sglang/test/test_deterministic.py +232 -99
- sglang/test/test_deterministic_utils.py +73 -0
- sglang/test/test_disaggregation_utils.py +81 -0
- sglang/test/test_marlin_moe.py +0 -1
- sglang/test/test_utils.py +85 -20
- sglang/version.py +1 -1
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
- sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
- sglang/srt/speculative/build_eagle_tree.py +0 -427
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
- /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
- /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/srt/models/llama_eagle3.py
CHANGED

@@ -27,7 +27,7 @@ from transformers import LlamaConfig

 from sglang.srt.distributed import get_pp_group
 from sglang.srt.layers.layernorm import RMSNorm
-from sglang.srt.layers.linear import QKVParallelLinear
+from sglang.srt.layers.linear import QKVParallelLinear
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.vocab_parallel_embedding import (
sglang/srt/models/longcat_flash.py
CHANGED

@@ -32,14 +32,10 @@

 import concurrent.futures
 import logging
-import
-from enum import IntEnum, auto
-from typing import Any, Dict, Iterable, Optional, Tuple, Union
+from typing import Iterable, Optional, Tuple

 import torch
-import torch.nn.functional as F
 from torch import nn
-from tqdm import tqdm

 from sglang.srt.configs import LongcatFlashConfig
 from sglang.srt.distributed import (
@@ -48,9 +44,8 @@ from sglang.srt.distributed import (
 )
 from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
 from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
-from sglang.srt.
+from sglang.srt.layers import deep_gemm_wrapper
 from sglang.srt.layers.activation import SiluAndMul
-from sglang.srt.layers.amx_utils import PackWeightMethod
 from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
 from sglang.srt.layers.dp_attention import (
     get_attention_tp_rank,
@@ -68,7 +63,6 @@ from sglang.srt.layers.moe.ep_moe.kernels import zero_experts_compute_triton
 from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, get_moe_impl_class
 from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
 from sglang.srt.layers.moe.topk import StandardTopKOutput, TopK
-from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
 from sglang.srt.layers.quantization.fp8_utils import (
@@ -85,26 +79,21 @@ from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA
+from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils import (
     BumpAllocator,
-    LazyValue,
     add_prefix,
     bind_or_assign,
     cpu_has_amx_support,
     get_bool_env_var,
     get_device_sm,
-    get_int_env_var,
     is_cpu,
     is_cuda,
-    is_flashinfer_available,
     is_hip,
-    is_non_idle_and_non_empty,
     is_npu,
-    is_sm100_supported,
 )

 _is_hip = is_hip()
@@ -117,13 +106,7 @@ _is_cpu = is_cpu()
 _device_sm = get_device_sm()

 if _is_cuda:
-    from sgl_kernel import (
-        awq_dequantize,
-        bmm_fp8,
-        dsv3_fused_a_gemm,
-        dsv3_router_gemm,
-        merge_state_v2,
-    )
+    from sgl_kernel import awq_dequantize
 elif _is_cpu and _is_cpu_amx_available:
     pass
 elif _is_hip:
@@ -595,7 +578,7 @@ class LongcatFlashForCausalLM(nn.Module):
             config.hidden_size,
             quant_config=quant_config,
             prefix=add_prefix("lm_head", prefix),
-            use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
         )
         self.logits_processor = LogitsProcessor(config)
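A recurring change in these hunks is the migration from the untyped `global_server_args_dict` (imported from `sglang.srt.managers.schedule_batch`) to the typed accessor `get_global_server_args()` from `sglang.srt.server_args`, as in the `use_attn_tp_group` line above. Below is a minimal sketch of that accessor pattern, assuming a module-level singleton; apart from `get_global_server_args` and the two flag names taken from this diff, everything here is illustrative rather than sglang's actual implementation.

```python
# Illustrative sketch of a typed server-args singleton; not the actual
# sglang implementation.
from dataclasses import dataclass
from typing import Optional


@dataclass
class ServerArgs:
    enable_dp_lm_head: bool = False
    enable_multimodal: bool = True


_global_server_args: Optional[ServerArgs] = None


def set_global_server_args(args: ServerArgs) -> None:
    # Called once at startup, after CLI parsing.
    global _global_server_args
    _global_server_args = args


def get_global_server_args() -> ServerArgs:
    # Attribute access (args.enable_dp_lm_head) replaces the old
    # string-keyed dict lookup.
    if _global_server_args is None:
        raise RuntimeError("server args have not been initialized")
    return _global_server_args
```

The main gain is tooling: attribute access like `args.enable_dp_lm_head` can be checked statically, while a misspelled dict key only fails at runtime.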
sglang/srt/models/longcat_flash_nextn.py
CHANGED

@@ -32,17 +32,14 @@

 import concurrent.futures
 import logging
-import
-from enum import IntEnum, auto
-from typing import Any, Dict, Iterable, Optional, Tuple, Union
+from typing import Iterable, Optional, Tuple

 import torch
-import torch.nn.functional as F
 from torch import nn
-from tqdm import tqdm

 from sglang.srt.configs import LongcatFlashConfig
 from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.layers import deep_gemm_wrapper
 from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
 from sglang.srt.layers.dp_attention import (
     get_attention_tp_rank,
@@ -52,7 +49,6 @@ from sglang.srt.layers.dp_attention import (
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import ReplicatedLinear
 from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
 from sglang.srt.layers.quantization.fp8_utils import (
@@ -75,7 +71,6 @@ from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA
 from sglang.srt.models.longcat_flash import LongcatFlashForCausalLM, LongcatFlashMLP
 from sglang.srt.utils import (
     BumpAllocator,
-    LazyValue,
     add_prefix,
     bind_or_assign,
     cpu_has_amx_support,
@@ -97,13 +92,7 @@ _is_cpu = is_cpu()
 _device_sm = get_device_sm()

 if _is_cuda:
-    from sgl_kernel import (
-        awq_dequantize,
-        bmm_fp8,
-        dsv3_fused_a_gemm,
-        dsv3_router_gemm,
-        merge_state_v2,
-    )
+    from sgl_kernel import awq_dequantize
 elif _is_cpu and _is_cpu_amx_available:
     pass
 elif _is_hip:
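Both longcat files also shrink the CUDA-only `sgl_kernel` import list down to `awq_dequantize`, inside the existing `if _is_cuda: ... elif _is_hip:` ladder that keeps the module importable on non-CUDA builds. A generic, self-contained sketch of that guarded-import pattern; the fallback stub is hypothetical and not part of sglang:

```python
# Generic sketch of a platform-guarded import; the fallback stub is hypothetical.
import importlib.util

if importlib.util.find_spec("sgl_kernel") is not None:
    from sgl_kernel import awq_dequantize  # fused CUDA kernel
else:
    def awq_dequantize(*args, **kwargs):
        # Keeps the module importable on CPU/HIP builds; only callers that
        # actually need AWQ dequantization hit this error.
        raise NotImplementedError("sgl_kernel is not available on this platform")
```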
sglang/srt/models/mimo.py
CHANGED
@@ -1,28 +1,17 @@
 # Adapted from qwen2.py

-from
-from typing import Any, Dict, Iterable, Optional, Tuple
+from typing import Iterable, Optional, Tuple

 import torch
 from torch import nn

-from sglang.srt.distributed import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-    split_tensor_along_last_dim,
-    tensor_model_parallel_all_gather,
-)
-from sglang.srt.layers.layernorm import RMSNorm
-from sglang.srt.layers.linear import QKVParallelLinear, RowParallelLinear
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.pooler import Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
-from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
-from sglang.srt.models.qwen2 import Qwen2DecoderLayer,
+from sglang.srt.models.qwen2 import Qwen2DecoderLayer, Qwen2Model
 from sglang.srt.utils import add_prefix

 MiMoConfig = None
sglang/srt/models/mimo_mtp.py
CHANGED
@@ -1,7 +1,6 @@
 # Adapted from https://github.com/vllm-project/vllm/pull/17433/files and deepseek_nextn.py

-from
-from typing import Any, Dict, Iterable, Optional, Tuple
+from typing import Iterable, Optional, Tuple

 import torch
 from torch import nn
sglang/srt/models/minicpmo.py
CHANGED
@@ -43,7 +43,6 @@ from sglang.srt.managers.mm_utils import (
     general_mm_embed_routine,
 )
 from sglang.srt.managers.schedule_batch import (
-    Modality,
     MultimodalDataItem,
     MultimodalInputs,
     flatten_nested_list,
@@ -59,8 +58,6 @@ from sglang.srt.utils import logger
 try:
     from transformers import LogitsWarper
     from vector_quantize_pytorch import GroupedResidualFSQ
-    from vocos import Vocos
-    from vocos.pretrained import instantiate_class

     _tts_deps = True
 except:
@@ -795,8 +792,10 @@ class ConditionalChatTTS(PreTrainedModel):
         force_no_stop=False,
         min_new_token=10,
         max_new_token=50,
-        logits_warpers: List[LogitsWarper] = [],
-        logits_processors: List[CustomRepetitionPenaltyLogitsProcessorRepeat] = [],
+        logits_warpers: Optional[List[LogitsWarper]] = None,
+        logits_processors: Optional[
+            List[CustomRepetitionPenaltyLogitsProcessorRepeat]
+        ] = None,
         show_tqdm=False,
     ):
         """Generate audio codes in streaming setting or non-streaming setting.
@@ -825,6 +824,9 @@ class ConditionalChatTTS(PreTrainedModel):
         assert input_ids.shape[0] == 1
         assert past_key_values is not None

+        logits_warpers = logits_warpers or []
+        logits_processors = logits_processors or []
+
         # fix: this should not be `input_ids.shape[1]`
         # start_idx = input_ids.shape[1]
         start_idx = (
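The signature change above is the standard fix for Python's mutable-default-argument pitfall: a `[]` default is evaluated once at function definition time and shared across every call, so the new code defaults to `None` and falls back to a fresh list inside the body. A self-contained illustration with made-up names:

```python
# Minimal sketch of the pitfall this hunk fixes; names are illustrative.
from typing import List, Optional

def bad_append(item: int, acc: List[int] = []) -> List[int]:
    # The default list is created once at definition time and shared
    # across calls, so state leaks between invocations.
    acc.append(item)
    return acc

def good_append(item: int, acc: Optional[List[int]] = None) -> List[int]:
    # A fresh list is created per call when no argument is passed.
    acc = acc or []
    acc.append(item)
    return acc

assert bad_append(1) == [1]
assert bad_append(2) == [1, 2]   # surprising: the default list persisted
assert good_append(1) == [1]
assert good_append(2) == [2]     # independent calls stay independent
```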
sglang/srt/models/mixtral.py
CHANGED
@@ -24,7 +24,6 @@ from torch import nn
 from transformers import MixtralConfig

 from sglang.srt.distributed import (
-    get_moe_expert_parallel_world_size,
     get_pp_group,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
@@ -36,7 +35,6 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.moe.ep_moe.layer import EPMoE
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
@@ -94,8 +92,7 @@ class MixtralMoE(nn.Module):
             renormalize=True,
         )

-
-        self.experts = MoEImpl(
+        self.experts = FusedMoE(
             num_experts=num_experts,
             top_k=top_k,
             layer_id=layer_id,
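Judging by the imports removed alongside it (`EPMoE` and `get_moe_expert_parallel_world_size`), the deleted `MoEImpl` indirection presumably selected an MoE implementation based on the expert-parallel world size, roughly like the hypothetical reconstruction below; after this change `FusedMoE` is used unconditionally:

```python
# Hypothetical reconstruction of the removed dispatch, for context only;
# the actual pre-0.5.4 code may have differed.
MoEImpl = (
    EPMoE
    if get_moe_expert_parallel_world_size() > 1
    else FusedMoE
)
```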
sglang/srt/models/mllama.py
CHANGED
@@ -901,7 +901,7 @@ class MllamaForConditionalGeneration(nn.Module):
                 img = pixel_values[0, j]
                 num_tiles = img.shape[0]
                 batched_images[i, j, :num_tiles] = img
-                batched_ar_ids[i, j] = mm_input.mm_items[0].
+                batched_ar_ids[i, j] = mm_input.mm_items[0].aspect_ratio_ids[0, j]

                 batched_ar_mask[i, j, :num_tiles] = mm_input.mm_items[
                     0
sglang/srt/models/mllama4.py
CHANGED
@@ -2,6 +2,7 @@ import json as json_lib
 import logging
 import math
 import os
+import re
 from collections.abc import Iterable
 from typing import List, Optional, Set, Tuple

@@ -30,9 +31,9 @@ from sglang.srt.managers.schedule_batch import (
     Modality,
     MultimodalDataItem,
     MultimodalInputs,
-    global_server_args_dict,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils import is_cpu

 _is_cpu = is_cpu()
@@ -422,6 +423,11 @@ class Llama4ForConditionalGeneration(nn.Module):
         "gate_up_proj": ["gate_proj", "up_proj"],
     }

+    # Pattern to match language model layers only (skip vision_model and multi_modal_projector)
+    lora_pattern = re.compile(
+        r"^language_model\.model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)"
+    )
+
     def __init__(
         self,
         config: Llama4Config,
@@ -442,7 +448,7 @@ class Llama4ForConditionalGeneration(nn.Module):
         )

         self.has_vision = (
-            self.has_vision_weights and global_server_args_dict["enable_multimodal"]
+            self.has_vision_weights and get_global_server_args().enable_multimodal
         )

         if self.has_vision:
@@ -555,6 +561,10 @@ class Llama4ForConditionalGeneration(nn.Module):

         return projected_vision_flat

+    def should_apply_lora(self, module_name: str) -> bool:
+        """Skip vision model and multi_modal_projector for LoRA."""
+        return bool(self.lora_pattern.match(module_name))
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -700,7 +710,7 @@ class Llama4ForConditionalGeneration(nn.Module):
         """Handle scale parameter remapping. Returns True if handled."""
         if "scale" in name and "expert" not in name:
             remapped_name = maybe_remap_kv_scale_name(name, params_dict)
-            return remapped_name
+            return remapped_name != name
         return False

     def _handle_stacked_params(