sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +47 -28
- sglang/bench_one_batch_server.py +41 -25
- sglang/bench_serving.py +330 -156
- sglang/check_env.py +1 -1
- sglang/compile_deep_gemm.py +6 -2
- sglang/global_config.py +1 -25
- sglang/lang/api.py +6 -0
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +13 -0
- sglang/launch_server.py +8 -15
- sglang/profiler.py +18 -1
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
- sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
- sglang/srt/compilation/backend.py +437 -0
- sglang/srt/compilation/compilation_config.py +20 -0
- sglang/srt/compilation/compilation_counter.py +47 -0
- sglang/srt/compilation/compile.py +210 -0
- sglang/srt/compilation/compiler_interface.py +503 -0
- sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
- sglang/srt/compilation/fix_functionalization.py +134 -0
- sglang/srt/compilation/fx_utils.py +83 -0
- sglang/srt/compilation/inductor_pass.py +140 -0
- sglang/srt/compilation/pass_manager.py +66 -0
- sglang/srt/compilation/piecewise_context_manager.py +40 -0
- sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/deepseek_ocr.py +262 -0
- sglang/srt/configs/deepseekvl2.py +194 -96
- sglang/srt/configs/dots_vlm.py +2 -7
- sglang/srt/configs/falcon_h1.py +13 -64
- sglang/srt/configs/load_config.py +25 -2
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +134 -23
- sglang/srt/configs/modelopt_config.py +30 -0
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/olmo3.py +105 -0
- sglang/srt/configs/points_v15_chat.py +29 -0
- sglang/srt/configs/qwen3_next.py +11 -47
- sglang/srt/configs/qwen3_omni.py +613 -0
- sglang/srt/configs/qwen3_vl.py +0 -10
- sglang/srt/connector/remote_instance.py +1 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/constrained/llguidance_backend.py +5 -0
- sglang/srt/constrained/outlines_backend.py +1 -1
- sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
- sglang/srt/constrained/utils.py +12 -0
- sglang/srt/constrained/xgrammar_backend.py +20 -11
- sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
- sglang/srt/disaggregation/base/conn.py +17 -4
- sglang/srt/disaggregation/common/conn.py +4 -2
- sglang/srt/disaggregation/decode.py +123 -31
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +11 -3
- sglang/srt/disaggregation/mooncake/conn.py +157 -19
- sglang/srt/disaggregation/nixl/conn.py +69 -24
- sglang/srt/disaggregation/prefill.py +96 -270
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
- sglang/srt/distributed/device_communicators/pynccl.py +24 -12
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
- sglang/srt/distributed/naive_distributed.py +5 -4
- sglang/srt/distributed/parallel_state.py +70 -19
- sglang/srt/elastic_ep/elastic_ep.py +74 -0
- sglang/srt/entrypoints/context.py +3 -2
- sglang/srt/entrypoints/engine.py +66 -66
- sglang/srt/entrypoints/grpc_server.py +431 -234
- sglang/srt/entrypoints/harmony_utils.py +2 -2
- sglang/srt/entrypoints/http_server.py +120 -8
- sglang/srt/entrypoints/http_server_engine.py +1 -7
- sglang/srt/entrypoints/openai/protocol.py +225 -37
- sglang/srt/entrypoints/openai/serving_base.py +49 -2
- sglang/srt/entrypoints/openai/serving_chat.py +29 -74
- sglang/srt/entrypoints/openai/serving_classify.py +204 -0
- sglang/srt/entrypoints/openai/serving_completions.py +15 -1
- sglang/srt/entrypoints/openai/serving_responses.py +5 -2
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +42 -4
- sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
- sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
- sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
- sglang/srt/eplb/expert_distribution.py +3 -4
- sglang/srt/eplb/expert_location_dispatch.py +2 -2
- sglang/srt/eplb/expert_location_updater.py +2 -2
- sglang/srt/function_call/base_format_detector.py +17 -18
- sglang/srt/function_call/function_call_parser.py +18 -14
- sglang/srt/function_call/glm4_moe_detector.py +1 -5
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/function_call/json_array_parser.py +0 -2
- sglang/srt/function_call/utils.py +2 -2
- sglang/srt/grpc/compile_proto.py +3 -3
- sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
- sglang/srt/grpc/health_servicer.py +189 -0
- sglang/srt/grpc/scheduler_launcher.py +181 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
- sglang/srt/layers/activation.py +4 -1
- sglang/srt/layers/attention/aiter_backend.py +3 -3
- sglang/srt/layers/attention/ascend_backend.py +17 -1
- sglang/srt/layers/attention/attention_registry.py +43 -23
- sglang/srt/layers/attention/base_attn_backend.py +20 -1
- sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
- sglang/srt/layers/attention/fla/chunk.py +0 -1
- sglang/srt/layers/attention/fla/chunk_o.py +1 -1
- sglang/srt/layers/attention/fla/index.py +0 -2
- sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
- sglang/srt/layers/attention/fla/utils.py +0 -3
- sglang/srt/layers/attention/fla/wy_fast.py +0 -2
- sglang/srt/layers/attention/flashattention_backend.py +12 -8
- sglang/srt/layers/attention/flashinfer_backend.py +248 -21
- sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
- sglang/srt/layers/attention/flashmla_backend.py +2 -2
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
- sglang/srt/layers/attention/intel_amx_backend.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
- sglang/srt/layers/attention/mamba/mamba.py +189 -241
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
- sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
- sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
- sglang/srt/layers/attention/nsa/utils.py +0 -1
- sglang/srt/layers/attention/nsa_backend.py +404 -90
- sglang/srt/layers/attention/triton_backend.py +208 -34
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
- sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
- sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
- sglang/srt/layers/attention/utils.py +11 -7
- sglang/srt/layers/attention/vision.py +3 -3
- sglang/srt/layers/attention/xpu_backend.py +1028 -0
- sglang/srt/layers/communicator.py +11 -7
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
- sglang/srt/layers/dp_attention.py +17 -0
- sglang/srt/layers/layernorm.py +45 -15
- sglang/srt/layers/linear.py +9 -1
- sglang/srt/layers/logits_processor.py +147 -17
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_moe.py +0 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
- sglang/srt/layers/moe/ep_moe/layer.py +119 -397
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
- sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/moe_runner/triton.py +3 -1
- sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
- sglang/srt/layers/moe/router.py +51 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
- sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
- sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
- sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
- sglang/srt/layers/moe/topk.py +3 -2
- sglang/srt/layers/moe/utils.py +17 -1
- sglang/srt/layers/quantization/__init__.py +2 -53
- sglang/srt/layers/quantization/awq.py +183 -6
- sglang/srt/layers/quantization/awq_triton.py +29 -0
- sglang/srt/layers/quantization/base_config.py +20 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
- sglang/srt/layers/quantization/fp8.py +84 -18
- sglang/srt/layers/quantization/fp8_kernel.py +55 -10
- sglang/srt/layers/quantization/fp8_utils.py +42 -14
- sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
- sglang/srt/layers/quantization/gptq.py +0 -1
- sglang/srt/layers/quantization/int8_kernel.py +18 -2
- sglang/srt/layers/quantization/marlin_utils.py +12 -0
- sglang/srt/layers/quantization/modelopt_quant.py +125 -100
- sglang/srt/layers/quantization/mxfp4.py +5 -30
- sglang/srt/layers/quantization/petit.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
- sglang/srt/layers/quantization/unquant.py +1 -4
- sglang/srt/layers/quantization/utils.py +0 -1
- sglang/srt/layers/quantization/w4afp8.py +51 -20
- sglang/srt/layers/quantization/w8a8_int8.py +30 -24
- sglang/srt/layers/radix_attention.py +59 -9
- sglang/srt/layers/rotary_embedding.py +673 -16
- sglang/srt/layers/sampler.py +36 -16
- sglang/srt/layers/sparse_pooler.py +98 -0
- sglang/srt/layers/utils.py +0 -1
- sglang/srt/layers/vocab_parallel_embedding.py +4 -1
- sglang/srt/lora/backend/triton_backend.py +0 -1
- sglang/srt/lora/eviction_policy.py +139 -0
- sglang/srt/lora/lora_manager.py +24 -9
- sglang/srt/lora/lora_registry.py +1 -1
- sglang/srt/lora/mem_pool.py +40 -16
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
- sglang/srt/managers/cache_controller.py +48 -17
- sglang/srt/managers/data_parallel_controller.py +146 -42
- sglang/srt/managers/detokenizer_manager.py +40 -13
- sglang/srt/managers/io_struct.py +66 -16
- sglang/srt/managers/mm_utils.py +20 -18
- sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
- sglang/srt/managers/overlap_utils.py +96 -19
- sglang/srt/managers/schedule_batch.py +241 -511
- sglang/srt/managers/schedule_policy.py +15 -2
- sglang/srt/managers/scheduler.py +399 -499
- sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
- sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
- sglang/srt/managers/scheduler_pp_mixin.py +341 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
- sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
- sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
- sglang/srt/managers/tokenizer_manager.py +378 -90
- sglang/srt/managers/tp_worker.py +212 -161
- sglang/srt/managers/utils.py +78 -2
- sglang/srt/mem_cache/allocator.py +7 -2
- sglang/srt/mem_cache/allocator_ascend.py +2 -2
- sglang/srt/mem_cache/base_prefix_cache.py +2 -2
- sglang/srt/mem_cache/chunk_cache.py +13 -2
- sglang/srt/mem_cache/common.py +480 -0
- sglang/srt/mem_cache/evict_policy.py +16 -1
- sglang/srt/mem_cache/hicache_storage.py +4 -1
- sglang/srt/mem_cache/hiradix_cache.py +16 -3
- sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
- sglang/srt/mem_cache/memory_pool.py +435 -219
- sglang/srt/mem_cache/memory_pool_host.py +0 -1
- sglang/srt/mem_cache/multimodal_cache.py +0 -1
- sglang/srt/mem_cache/radix_cache.py +53 -19
- sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
- sglang/srt/mem_cache/storage/backend_factory.py +2 -2
- sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
- sglang/srt/mem_cache/swa_radix_cache.py +92 -26
- sglang/srt/metrics/collector.py +31 -0
- sglang/srt/metrics/func_timer.py +1 -1
- sglang/srt/model_executor/cuda_graph_runner.py +43 -5
- sglang/srt/model_executor/forward_batch_info.py +28 -23
- sglang/srt/model_executor/model_runner.py +379 -139
- sglang/srt/model_executor/npu_graph_runner.py +2 -3
- sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +424 -27
- sglang/srt/model_loader/utils.py +0 -1
- sglang/srt/model_loader/weight_utils.py +47 -28
- sglang/srt/models/apertus.py +2 -3
- sglang/srt/models/arcee.py +2 -2
- sglang/srt/models/bailing_moe.py +13 -52
- sglang/srt/models/bailing_moe_nextn.py +3 -4
- sglang/srt/models/bert.py +1 -1
- sglang/srt/models/deepseek_nextn.py +19 -3
- sglang/srt/models/deepseek_ocr.py +1516 -0
- sglang/srt/models/deepseek_v2.py +273 -98
- sglang/srt/models/dots_ocr.py +0 -2
- sglang/srt/models/dots_vlm.py +0 -1
- sglang/srt/models/dots_vlm_vit.py +1 -1
- sglang/srt/models/falcon_h1.py +13 -19
- sglang/srt/models/gemma3_mm.py +16 -0
- sglang/srt/models/gemma3n_mm.py +1 -2
- sglang/srt/models/glm4_moe.py +14 -37
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +2 -1
- sglang/srt/models/glm4v_moe.py +5 -5
- sglang/srt/models/gpt_oss.py +5 -5
- sglang/srt/models/grok.py +10 -23
- sglang/srt/models/hunyuan.py +2 -7
- sglang/srt/models/interns1.py +0 -1
- sglang/srt/models/kimi_vl.py +1 -7
- sglang/srt/models/kimi_vl_moonvit.py +3 -1
- sglang/srt/models/llama.py +2 -2
- sglang/srt/models/llama_eagle3.py +1 -1
- sglang/srt/models/longcat_flash.py +5 -22
- sglang/srt/models/longcat_flash_nextn.py +3 -14
- sglang/srt/models/mimo.py +2 -13
- sglang/srt/models/mimo_mtp.py +1 -2
- sglang/srt/models/minicpmo.py +7 -5
- sglang/srt/models/mixtral.py +1 -4
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/mllama4.py +13 -3
- sglang/srt/models/nemotron_h.py +511 -0
- sglang/srt/models/olmo2.py +31 -4
- sglang/srt/models/opt.py +5 -5
- sglang/srt/models/phi.py +1 -1
- sglang/srt/models/phi4mm.py +1 -1
- sglang/srt/models/phimoe.py +0 -1
- sglang/srt/models/pixtral.py +0 -3
- sglang/srt/models/points_v15_chat.py +186 -0
- sglang/srt/models/qwen.py +0 -1
- sglang/srt/models/qwen2_5_vl.py +3 -3
- sglang/srt/models/qwen2_audio.py +2 -15
- sglang/srt/models/qwen2_moe.py +15 -12
- sglang/srt/models/qwen2_vl.py +5 -2
- sglang/srt/models/qwen3_moe.py +19 -35
- sglang/srt/models/qwen3_next.py +7 -12
- sglang/srt/models/qwen3_next_mtp.py +3 -4
- sglang/srt/models/qwen3_omni_moe.py +661 -0
- sglang/srt/models/qwen3_vl.py +37 -33
- sglang/srt/models/qwen3_vl_moe.py +57 -185
- sglang/srt/models/roberta.py +55 -3
- sglang/srt/models/sarashina2_vision.py +0 -1
- sglang/srt/models/step3_vl.py +3 -5
- sglang/srt/models/utils.py +11 -1
- sglang/srt/multimodal/processors/base_processor.py +6 -2
- sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
- sglang/srt/multimodal/processors/dots_vlm.py +0 -1
- sglang/srt/multimodal/processors/glm4v.py +1 -5
- sglang/srt/multimodal/processors/internvl.py +0 -2
- sglang/srt/multimodal/processors/janus_pro.py +0 -1
- sglang/srt/multimodal/processors/mllama4.py +0 -8
- sglang/srt/multimodal/processors/phi4mm.py +0 -1
- sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
- sglang/srt/multimodal/processors/qwen_vl.py +75 -16
- sglang/srt/multimodal/processors/step3_vl.py +1 -1
- sglang/srt/parser/conversation.py +41 -0
- sglang/srt/parser/reasoning_parser.py +0 -1
- sglang/srt/sampling/custom_logit_processor.py +77 -2
- sglang/srt/sampling/sampling_batch_info.py +17 -22
- sglang/srt/sampling/sampling_params.py +70 -2
- sglang/srt/server_args.py +577 -73
- sglang/srt/server_args_config_parser.py +1 -1
- sglang/srt/single_batch_overlap.py +38 -28
- sglang/srt/speculative/base_spec_worker.py +34 -0
- sglang/srt/speculative/draft_utils.py +226 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
- sglang/srt/speculative/eagle_info.py +57 -18
- sglang/srt/speculative/eagle_info_v2.py +458 -0
- sglang/srt/speculative/eagle_utils.py +138 -0
- sglang/srt/speculative/eagle_worker.py +83 -280
- sglang/srt/speculative/eagle_worker_v2.py +702 -0
- sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
- sglang/srt/speculative/ngram_worker.py +12 -11
- sglang/srt/speculative/spec_info.py +2 -0
- sglang/srt/speculative/spec_utils.py +38 -3
- sglang/srt/speculative/standalone_worker.py +4 -14
- sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
- sglang/srt/two_batch_overlap.py +28 -14
- sglang/srt/utils/__init__.py +1 -1
- sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
- sglang/srt/utils/common.py +192 -47
- sglang/srt/utils/hf_transformers_utils.py +40 -17
- sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
- sglang/srt/{offloader.py → utils/offloader.py} +4 -4
- sglang/srt/utils/profile_merger.py +199 -0
- sglang/test/attention/test_flashattn_backend.py +1 -1
- sglang/test/attention/test_flashattn_mla_backend.py +0 -1
- sglang/test/attention/test_prefix_chunk_info.py +0 -2
- sglang/test/attention/test_trtllm_mla_backend.py +221 -53
- sglang/test/few_shot_gsm8k_engine.py +2 -4
- sglang/test/kit_matched_stop.py +157 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +41 -0
- sglang/test/runners.py +2 -0
- sglang/test/send_one.py +42 -7
- sglang/test/simple_eval_common.py +3 -0
- sglang/test/simple_eval_gpqa.py +0 -1
- sglang/test/simple_eval_humaneval.py +0 -3
- sglang/test/simple_eval_longbench_v2.py +344 -0
- sglang/test/test_block_fp8.py +1 -2
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
- sglang/test/test_cutlass_moe.py +1 -2
- sglang/test/test_cutlass_w4a8_moe.py +10 -20
- sglang/test/test_deterministic.py +232 -99
- sglang/test/test_deterministic_utils.py +73 -0
- sglang/test/test_disaggregation_utils.py +81 -0
- sglang/test/test_marlin_moe.py +0 -1
- sglang/test/test_utils.py +85 -20
- sglang/version.py +1 -1
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
- sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
- sglang/srt/speculative/build_eagle_tree.py +0 -427
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
- /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
- /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/srt/models/dots_ocr.py
CHANGED
@@ -6,7 +6,6 @@ from typing import Iterable, List, Optional, Tuple

 import torch
 import torch.nn as nn
-from transformers.activations import ACT2FN

 from sglang.srt.configs import DotsOCRConfig
 from sglang.srt.layers.logits_processor import LogitsProcessor
@@ -22,7 +21,6 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.dots_vlm_vit import DotsVisionTransformer
 from sglang.srt.models.qwen2 import Qwen2ForCausalLM
 from sglang.srt.utils import add_prefix
-from sglang.srt.utils.hf_transformers_utils import get_processor

 logger = logging.getLogger(__name__)

sglang/srt/models/dots_vlm.py
CHANGED
@@ -23,7 +23,6 @@ import torch
 from torch import nn

 from sglang.srt.configs.dots_vlm import DotsVLMConfig
-from sglang.srt.distributed import parallel_state
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.managers.mm_utils import (
     MultiModalityDataPaddingPatternMultimodalTokens,
@@ -323,7 +323,7 @@ class DotsVisionTransformer(PreTrainedModel):
             dim=0,
             dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
         )
-        cu_seqlens =
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])

         for blk in self.blocks:
             hidden_states = blk(
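The `cu_seqlens` change above (the same fix appears in sglang/srt/models/glm4v.py further down) prepends a zero so the cumulative per-image patch counts can be used directly as [start, end) boundaries. A minimal sketch of the pattern, using made-up patch counts rather than real model data:

    import torch

    seq_lens = torch.tensor([3, 5, 2])                      # hypothetical per-image patch counts
    cu_seqlens = seq_lens.cumsum(dim=0, dtype=torch.int32)  # tensor([3, 8, 10])
    cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
    # tensor([0, 3, 8, 10]): cu_seqlens[i] and cu_seqlens[i + 1] now give the
    # [start, end) range of image i, and cu_seqlens[1:] - cu_seqlens[:-1]
    # recovers the original lengths [3, 5, 2].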
sglang/srt/models/falcon_h1.py
CHANGED
@@ -1,4 +1,3 @@
-import enum
 import logging
 from typing import Any, Iterable, List, Optional, Set, Tuple

@@ -8,6 +7,10 @@ from torch import nn
 from sglang.srt.configs.falcon_h1 import FalconH1Config
 from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.attention.hybrid_linear_attn_backend import (
+    HybridLinearAttnBackend,
+    Mamba2AttnBackend,
+)
 from sglang.srt.layers.attention.mamba.mamba import MambaMixer2
 from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
 from sglang.srt.layers.dp_attention import (
@@ -29,9 +32,9 @@ from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils import add_prefix, is_cuda, make_layers

 logger = logging.getLogger(__name__)
@@ -184,18 +187,12 @@ class FalconH1HybridAttentionDecoderLayer(nn.Module):
         )

         self.mamba = MambaMixer2(
+            cache_params=config.mamba2_cache_params,
             hidden_size=config.hidden_size,
-            ssm_state_size=config.mamba_d_state,
-            conv_kernel_size=config.mamba_d_conv,
-            intermediate_size=self.d_ssm,
             use_conv_bias=config.mamba_conv_bias,
             use_bias=config.mamba_proj_bias,
             n_groups=config.mamba_n_groups,
-            num_heads=config.mamba_n_heads,
-            layer_id=layer_id,
-            head_dim=config.mamba_d_head,
             rms_norm_eps=config.rms_norm_eps,
-            chunk_size=config.mamba_chunk_size,
             activation=config.hidden_act,
             use_rms_norm=config.mamba_rms_norm,
             prefix=f"{prefix}.mixer",
@@ -339,12 +336,16 @@ class FalconH1HybridAttentionDecoderLayer(nn.Module):
         )
         attention_hidden_states = attention_hidden_states * self.attn_out_multiplier

+        attn_backend = forward_batch.attn_backend
+        assert isinstance(attn_backend, HybridLinearAttnBackend)
+        assert isinstance(attn_backend.linear_attn_backend, Mamba2AttnBackend)
         # Mamba block
         mamba_hidden_states = torch.empty_like(hidden_states)
-
+        attn_backend.linear_attn_backend.forward(
+            self.mamba,
             hidden_states * self.ssm_in_multiplier,
             mamba_hidden_states,
-
+            layer_id=self.layer_id,
             mup_vector=self.mup_vector,
         )
         mamba_hidden_states = mamba_hidden_states * self.ssm_out_multiplier
@@ -448,13 +449,6 @@ class FalconH1Model(nn.Module):
         return hidden_states


-class HybridLayerType(enum.Enum):
-    full_attention = "attention"
-    swa_attention = "swa_attention"
-    linear_attention = "linear_attention"
-    mamba2 = "mamba"
-
-
 class FalconH1ForCausalLM(nn.Module):
     fall_back_to_pt_during_load = False

@@ -481,7 +475,7 @@ class FalconH1ForCausalLM(nn.Module):
             quant_config=quant_config,
             org_num_embeddings=config.vocab_size,
             prefix=add_prefix("lm_head", prefix),
-            use_attn_tp_group=
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
         )
         self.lm_head = self.lm_head.float()
         self.lm_head_multiplier = config.lm_head_multiplier
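The forward-pass hunk above changes who drives the Mamba mixer: the decoder layer now hands its MambaMixer2 module to the linear-attention half of the hybrid backend, which fills a caller-provided output buffer. A toy sketch of that calling convention with stand-in classes (the real signatures live in sglang.srt.layers.attention.hybrid_linear_attn_backend and differ in detail):

    import torch
    import torch.nn as nn

    class Mamba2AttnBackendStub:
        """Stand-in: runs the mixer and fills `out` in place."""
        def forward(self, mixer, hidden_states, out, layer_id, mup_vector=None):
            out.copy_(mixer(hidden_states))

    class HybridLinearAttnBackendStub:
        """Stand-in for the hybrid backend that owns a linear-attention sub-backend."""
        def __init__(self, linear_attn_backend):
            self.linear_attn_backend = linear_attn_backend

    backend = HybridLinearAttnBackendStub(Mamba2AttnBackendStub())
    hidden_states = torch.randn(4, 16)
    mamba_hidden_states = torch.empty_like(hidden_states)
    backend.linear_attn_backend.forward(
        nn.Identity(),        # stands in for the layer's MambaMixer2 module
        hidden_states,
        mamba_hidden_states,
        layer_id=0,
    )
    assert torch.equal(mamba_hidden_states, hidden_states)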
sglang/srt/models/gemma3_mm.py
CHANGED
@@ -16,6 +16,7 @@
 # https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/gemma3_mm.py

 import logging
+import re
 from functools import lru_cache
 from typing import Dict, Iterable, List, Optional, Set, Tuple, TypedDict

@@ -154,6 +155,10 @@ class Gemma3ForConditionalGeneration(PreTrainedModel):
     embedding_modules = {}
     embedding_padding_modules = []
     supports_lora = True
+    # Pattern to match language model layers only (skip vision_tower and multi_modal_projector)
+    lora_pattern = re.compile(
+        r"^language_model\.model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)"
+    )

     def __init__(
         self,
@@ -165,6 +170,13 @@ class Gemma3ForConditionalGeneration(PreTrainedModel):
         self.config = config
         self.quant_config = quant_config

+        # For LoRA compatibility: expose text_config attributes at top level
+        # This allows LoRA code to work without special multimodal handling
+        if not hasattr(config, "num_hidden_layers"):
+            config.num_hidden_layers = config.text_config.num_hidden_layers
+        if not hasattr(config, "hidden_size"):
+            config.hidden_size = config.text_config.hidden_size
+
         self.vision_tower = SiglipVisionModel(
             config=config.vision_config,
             quant_config=quant_config,
@@ -380,6 +392,10 @@ class Gemma3ForConditionalGeneration(PreTrainedModel):

         return hs

+    def should_apply_lora(self, module_name: str) -> bool:
+        """Skip vision tower and multi_modal_projector for LoRA."""
+        return bool(self.lora_pattern.match(module_name))
+
     def tie_weights(self):
         return self.language_model.tie_weights()

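The Gemma3 change above gates LoRA on a module-name regex so adapters only touch language-model projections. A small runnable sketch of the filter (regex copied from the diff; the example module names are hypothetical):

    import re

    lora_pattern = re.compile(
        r"^language_model\.model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)"
    )

    def should_apply_lora(module_name: str) -> bool:
        return bool(lora_pattern.match(module_name))

    print(should_apply_lora("language_model.model.layers.3.self_attn.qkv_proj"))    # True
    print(should_apply_lora("language_model.model.layers.12.mlp.gate_up_proj"))     # True
    print(should_apply_lora("vision_tower.vision_model.encoder.layers.0.mlp.fc1"))  # False
    print(should_apply_lora("multi_modal_projector.mm_input_projection"))           # False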
sglang/srt/models/gemma3n_mm.py
CHANGED
@@ -14,8 +14,7 @@ from transformers import (
 )
 from transformers.models.auto.modeling_auto import AutoModel

-from sglang.srt.layers.
-from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.linear import RowParallelLinear
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
sglang/srt/models/glm4_moe.py
CHANGED
@@ -27,7 +27,6 @@ from sglang.srt.distributed import (
     get_pp_group,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
-    parallel_state,
     tensor_model_parallel_all_reduce,
 )
 from sglang.srt.layers.activation import SiluAndMul
@@ -44,30 +43,23 @@ from sglang.srt.layers.dp_attention import (
 )
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
-    ColumnParallelLinear,
     MergedColumnParallelLinear,
     QKVParallelLinear,
-    ReplicatedLinear,
     RowParallelLinear,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.moe import
+from sglang.srt.layers.moe import get_moe_a2a_backend
 from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
 from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
 from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
-from sglang.srt.layers.quantization.fp8_kernel import (
-    is_fp8_fnuz,
-    per_tensor_quant_mla_fp8,
-    per_token_group_quant_mla_deep_gemm_masked_fp8,
-)
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
@@ -77,21 +69,17 @@ from sglang.srt.models.deepseek_v2 import (
     DeepseekV2Model,
     DeepseekV2MoE,
 )
-from sglang.srt.
+from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils import (
     BumpAllocator,
     LazyValue,
     add_prefix,
-    bind_or_assign,
     cpu_has_amx_support,
     get_bool_env_var,
     get_device_sm,
-    get_int_env_var,
     is_cpu,
     is_cuda,
-    is_flashinfer_available,
     is_hip,
-    is_non_idle_and_non_empty,
     log_info_on_rank0,
     use_intel_amx_backend,
 )
@@ -395,7 +383,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
         self.n_shared_experts = config.n_shared_experts
         self.num_fused_shared_experts = (
             0
-            if
+            if get_global_server_args().disable_shared_experts_fusion
             else config.n_shared_experts
         )
         self.config = config
@@ -432,7 +420,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
         self.experts = get_moe_impl_class(quant_config)(
             num_experts=config.n_routed_experts
             + self.num_fused_shared_experts
-            +
+            + get_global_server_args().ep_num_redundant_experts,
             num_fused_shared_experts=self.num_fused_shared_experts,
             top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
             hidden_size=config.hidden_size,
@@ -471,12 +459,12 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):

         self.top_k = config.num_experts_per_tok

-        if get_moe_a2a_backend().is_deepep():
+        if get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mooncake():
             # TODO: we will support tp < ep in the future
             self.ep_size = get_moe_expert_parallel_world_size()
             self.num_experts = (
                 config.n_routed_experts
-                +
+                + get_global_server_args().ep_num_redundant_experts
             )
             self.renormalize = config.norm_topk_prob
             self.topk_group = config.topk_group
@@ -487,20 +475,9 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
             else None
         )

-
-
-
-                permute_fusion=True,
-                num_experts=self.num_experts,
-                num_local_experts=config.n_routed_experts // self.tp_size,
-                hidden_size=config.hidden_size,
-                params_dtype=config.torch_dtype,
-                deepep_mode=get_deepep_mode(),
-                async_finish=True,
-                return_recv_hook=True,
-            )
-
-        self._enable_deepep_moe = get_moe_a2a_backend().is_deepep()
+        self._enable_a2a_moe = (
+            get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mooncake()
+        )

     def forward_normal_dual_stream(
         self,
@@ -664,7 +641,7 @@ class Glm4MoeDecoderLayer(DeepseekV2DecoderLayer):
             layer_scatter_modes=self.layer_scatter_modes,
             input_layernorm=self.input_layernorm,
             post_attention_layernorm=self.post_attention_layernorm,
-            allow_reduce_scatter=
+            allow_reduce_scatter=False,
         )

     def forward(
@@ -758,7 +735,7 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
             config.hidden_size,
             quant_config=quant_config,
             prefix=add_prefix("lm_head", prefix),
-            use_attn_tp_group=
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
         )
         self.logits_processor = LogitsProcessor(config)

@@ -774,7 +751,7 @@
         self, architecture: str = "Glm4MoeForCausalLM"
     ):
         self.num_fused_shared_experts = 0
-        if
+        if get_global_server_args().disable_shared_experts_fusion:
             return

         # Only Deepseek V3/R1 can use shared experts fusion optimization now.
@@ -790,7 +767,7 @@
         disable_reason = "Deepseek and GLM-4.5 or GLM-4.6 can not use shared experts fusion optimization under expert parallelism."

         if disable_reason is not None:
-
+            get_global_server_args().disable_shared_experts_fusion = True
             self.num_fused_shared_experts = 0
             log_info_on_rank0(
                 logger,
sglang/srt/models/glm4_moe_nextn.py
CHANGED
@@ -30,9 +30,9 @@ from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.models.glm4_moe import Glm4MoeDecoderLayer, Glm4MoeForCausalLM
+from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils import BumpAllocator, add_prefix

 logger = logging.getLogger(__name__)
@@ -145,7 +145,7 @@ class Glm4MoeForCausalLMNextN(Glm4MoeForCausalLM):
             config.hidden_size,
             quant_config=quant_config,
             prefix=add_prefix("model.shared_head.head", prefix),
-            use_attn_tp_group=
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
         )
         self.logits_processor = LogitsProcessor(config)

sglang/srt/models/glm4v.py
CHANGED
@@ -9,6 +9,7 @@ from transformers.models.glm4v.configuration_glm4v import Glm4vConfig, Glm4vVisi

 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.attention import vision_utils
+from sglang.srt.layers.dp_attention import get_attention_tp_size
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
     ColumnParallelLinear,
@@ -434,7 +435,7 @@ class Glm4vVisionModel(nn.Module):
         cu_seqlens = torch.repeat_interleave(
             grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
         ).cumsum(dim=0, dtype=torch.int32)
-        cu_seqlens =
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])

         seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
         x = self.embeddings(
sglang/srt/models/glm4v_moe.py
CHANGED
@@ -16,10 +16,10 @@ from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 from sglang.srt.layers.pooler import Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
-from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.glm4_moe import Glm4MoeModel
 from sglang.srt.models.glm4v import Glm4vForConditionalGeneration, Glm4vVisionModel
+from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils import add_prefix, is_cuda, log_info_on_rank0
 from sglang.srt.utils.hf_transformers_utils import get_processor

@@ -47,7 +47,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
         self.determine_num_fused_shared_experts("Glm4MoeForCausalLM")
         self.num_fused_shared_experts = (
             0
-            if
+            if get_global_server_args().disable_shared_experts_fusion
             else config.n_shared_experts
         )

@@ -68,7 +68,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
             config.hidden_size,
             quant_config=quant_config,
             prefix=add_prefix("lm_head", prefix),
-            use_attn_tp_group=
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
         )
         self.logits_processor = LogitsProcessor(config)
         self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
@@ -81,7 +81,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
         self, architecture: str = "Glm4MoeForCausalLM"
     ):
         self.num_fused_shared_experts = 0
-        if
+        if get_global_server_args().disable_shared_experts_fusion:
             return

         # Only Deepseek V3/R1 can use shared experts fusion optimization now.
@@ -97,7 +97,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
         disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization under expert parallelism."

         if disable_reason is not None:
-
+            get_global_server_args().disable_shared_experts_fusion = True
             self.num_fused_shared_experts = 0
             log_info_on_rank0(
                 logger,
sglang/srt/models/gpt_oss.py
CHANGED
@@ -63,13 +63,13 @@ from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.utils import (
     create_fused_set_kv_buffer_arg,
     enable_fused_set_kv_buffer,
 )
+from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils import (
     LazyValue,
     add_prefix,
@@ -85,7 +85,7 @@ _is_sm100_supported = is_cuda() and is_sm100_supported()


 if _is_cuda:
-    from sgl_kernel import FusedSetKVBufferArg
+    from sgl_kernel import FusedSetKVBufferArg  # noqa: F401


 class GptOssConfig(PretrainedConfig):
@@ -138,7 +138,7 @@ class GptOssSparseMoeBlock(nn.Module):
         }
         self.experts = experts_type(
             num_experts=config.num_local_experts
-            +
+            + get_global_server_args().ep_num_redundant_experts,
             top_k=config.num_experts_per_tok,
             layer_id=layer_id,
             hidden_size=config.hidden_size,
@@ -259,7 +259,7 @@ class GptOssAttention(nn.Module):

         # Choose dtype of sinks based on attention backend: trtllm_mha requires float32,
         # others can use bfloat16
-        attn_backend =
+        attn_backend = get_global_server_args().attention_backend
         sinks_dtype = torch.float32 if attn_backend == "trtllm_mha" else torch.bfloat16
         self.sinks = nn.Parameter(
             torch.empty(self.num_heads, dtype=sinks_dtype), requires_grad=False
@@ -591,7 +591,7 @@ class GptOssForCausalLM(nn.Module):
             config.hidden_size,
             # quant_config=quant_config,
             prefix=add_prefix("lm_head", prefix),
-            use_attn_tp_group=
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
         )
         self.logits_processor = LogitsProcessor(config)
         self.capture_aux_hidden_states = False
sglang/srt/models/grok.py
CHANGED
@@ -28,7 +28,6 @@ from torch import nn
 from transformers import PretrainedConfig

 from sglang.srt.distributed import (
-    get_moe_expert_parallel_world_size,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
@@ -36,7 +35,6 @@ from sglang.srt.distributed import (
 )
 from sglang.srt.layers.activation import GeluAndMul
 from sglang.srt.layers.elementwise import (
-    experts_combine_triton,
     fused_dual_residual_rmsnorm,
     fused_rmsnorm,
     gelu_and_mul_triton,
@@ -49,7 +47,6 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.moe.ep_moe.layer import EPMoE
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 from sglang.srt.layers.moe.router import fused_moe_router_shim
 from sglang.srt.layers.moe.topk import TopK
@@ -65,10 +62,10 @@ from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.loader import DefaultModelLoader
 from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils import add_prefix, dispose_tensor, dump_to_file

 logger = logging.getLogger(__name__)
@@ -76,9 +73,6 @@ logger = logging.getLogger(__name__)

 # Dump tensors for debugging
 debug_tensor_dump_output_folder = None
-debug_tensor_dump_prefill_only = False
-# Skip all the other tensor dumps, only dump the target logits
-debug_tensor_dump_only_target_logprobs = False
 debug_tensor_dump_inject = False
 debug_tensor_dump_layers = None
 debug_tensor_dump_test = False
@@ -176,17 +170,7 @@ class Grok1MoE(nn.Module):
             custom_routing_function=custom_routing_function,
         )

-
-        if get_moe_expert_parallel_world_size() > 1:
-            MoEImpl = EPMoE
-        else:
-            MoEImpl = FusedMoE
-        kwargs["reduce_results"] = reduce_results
-        kwargs["use_presharded_weights"] = use_presharded_weights
-        kwargs["inplace"] = inplace
-        kwargs["no_combine"] = no_combine
-
-        self.experts = MoEImpl(
+        self.experts = FusedMoE(
             num_experts=num_experts,
             top_k=top_k,
             layer_id=layer_id,
@@ -195,7 +179,10 @@
             params_dtype=params_dtype,
             quant_config=quant_config,
             activation="gelu",
-
+            reduce_results=reduce_results,
+            use_presharded_weights=use_presharded_weights,
+            inplace=inplace,
+            no_combine=no_combine,
         )

     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -877,10 +864,10 @@ class Grok1ForCausalLM(nn.Module):

         # Dump tensors for debugging
         global debug_tensor_dump_output_folder, debug_tensor_dump_inject
-        debug_tensor_dump_output_folder =
-
-
-        debug_tensor_dump_inject =
+        debug_tensor_dump_output_folder = (
+            get_global_server_args().debug_tensor_dump_output_folder
+        )
+        debug_tensor_dump_inject = get_global_server_args().debug_tensor_dump_inject
         warnings.filterwarnings("ignore", category=FutureWarning)

         if get_tensor_model_parallel_rank() == 0:
sglang/srt/models/hunyuan.py
CHANGED
@@ -12,18 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only HunYuan model compatible with HuggingFace weights."""
-import logging
 import re
-from
-from enum import Enum, auto
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, Optional, Tuple

 import torch
 from torch import nn
 from transformers import PretrainedConfig

 from sglang.srt.distributed import (
-    get_pp_group,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
@@ -46,7 +42,6 @@ from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.sampler import Sampler
 from sglang.srt.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE,
     ParallelLMHead,
     VocabParallelEmbedding,
 )
@@ -56,7 +51,7 @@ from sglang.srt.model_loader.weight_utils import (
     kv_cache_scales_loader,
     maybe_remap_kv_scale_name,
 )
-from sglang.srt.utils import
+from sglang.srt.utils import is_hip

 expert_distribution_recorder = ExpertDistributionRecorder()

sglang/srt/models/interns1.py
CHANGED
@@ -5,7 +5,6 @@ from torch import nn
 from transformers import PretrainedConfig

 from sglang.srt.layers.attention import vision_utils
-from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
 from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.managers.mm_utils import (
sglang/srt/models/kimi_vl.py
CHANGED
@@ -43,10 +43,8 @@

 import copy
 import logging
-import math
-from collections.abc import Mapping
 from dataclasses import dataclass
-from typing import
+from typing import Iterable, List, Optional, Tuple

 import torch
 from torch import nn
@@ -56,10 +54,6 @@ from sglang.srt.configs import KimiVLConfig
 from sglang.srt.configs.deepseekvl2 import DeepseekV2Config
 from sglang.srt.configs.kimi_vl import KimiVLConfig
 from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
-from sglang.srt.distributed import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
 from sglang.srt.layers.activation import QuickGELU
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
sglang/srt/models/kimi_vl_moonvit.py
CHANGED
@@ -49,7 +49,7 @@ from typing import List, Optional, Sequence, Tuple, Union
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformers.activations import ACT2FN
+from transformers.activations import ACT2FN
 from transformers.modeling_utils import PreTrainedModel

 try:
@@ -596,6 +596,8 @@ class MoonVitPretrainedModel(PreTrainedModel):
     _supports_sdpa = True

     def __init__(self, config: MoonViTConfig, *inputs, **kwargs):
+        from transformers.activations import GELUTanh
+
         super().__init__(config, *inputs, **kwargs)
         config = deepcopy(config)
         self.merge_kernel_size = config.merge_kernel_size
sglang/srt/models/llama.py
CHANGED
@@ -45,13 +45,13 @@ from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
 from sglang.srt.model_loader.weight_utils import (
     default_weight_loader,
     kv_cache_scales_loader,
     maybe_remap_kv_scale_name,
 )
+from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils import add_prefix, make_layers
 from sglang.utils import get_exception_traceback

@@ -433,7 +433,7 @@ class LlamaForCausalLM(nn.Module):
             config.hidden_size,
             quant_config=quant_config,
             prefix=add_prefix("lm_head", prefix),
-            use_attn_tp_group=
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
        )
         self.logits_processor = LogitsProcessor(config)
         self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)