sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +54 -37
 - sglang/bench_one_batch_server.py +340 -34
 - sglang/bench_serving.py +340 -159
 - sglang/check_env.py +1 -1
 - sglang/compile_deep_gemm.py +6 -2
 - sglang/global_config.py +1 -25
 - sglang/lang/api.py +6 -0
 - sglang/lang/backend/runtime_endpoint.py +1 -1
 - sglang/lang/interpreter.py +1 -0
 - sglang/lang/ir.py +13 -0
 - sglang/launch_server.py +9 -2
 - sglang/profiler.py +20 -3
 - sglang/srt/_custom_ops.py +1 -1
 - sglang/srt/batch_invariant_ops/__init__.py +27 -0
 - sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
 - sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
 - sglang/srt/compilation/backend.py +437 -0
 - sglang/srt/compilation/compilation_config.py +20 -0
 - sglang/srt/compilation/compilation_counter.py +47 -0
 - sglang/srt/compilation/compile.py +210 -0
 - sglang/srt/compilation/compiler_interface.py +503 -0
 - sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
 - sglang/srt/compilation/fix_functionalization.py +134 -0
 - sglang/srt/compilation/fx_utils.py +83 -0
 - sglang/srt/compilation/inductor_pass.py +140 -0
 - sglang/srt/compilation/pass_manager.py +66 -0
 - sglang/srt/compilation/piecewise_context_manager.py +40 -0
 - sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
 - sglang/srt/configs/__init__.py +8 -0
 - sglang/srt/configs/deepseek_ocr.py +262 -0
 - sglang/srt/configs/deepseekvl2.py +194 -96
 - sglang/srt/configs/dots_ocr.py +64 -0
 - sglang/srt/configs/dots_vlm.py +2 -7
 - sglang/srt/configs/falcon_h1.py +309 -0
 - sglang/srt/configs/load_config.py +33 -2
 - sglang/srt/configs/mamba_utils.py +117 -0
 - sglang/srt/configs/model_config.py +284 -118
 - sglang/srt/configs/modelopt_config.py +30 -0
 - sglang/srt/configs/nemotron_h.py +286 -0
 - sglang/srt/configs/olmo3.py +105 -0
 - sglang/srt/configs/points_v15_chat.py +29 -0
 - sglang/srt/configs/qwen3_next.py +11 -47
 - sglang/srt/configs/qwen3_omni.py +613 -0
 - sglang/srt/configs/qwen3_vl.py +576 -0
 - sglang/srt/connector/remote_instance.py +1 -1
 - sglang/srt/constrained/base_grammar_backend.py +6 -1
 - sglang/srt/constrained/llguidance_backend.py +5 -0
 - sglang/srt/constrained/outlines_backend.py +1 -1
 - sglang/srt/constrained/outlines_jump_forward.py +1 -1
 - sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
 - sglang/srt/constrained/utils.py +12 -0
 - sglang/srt/constrained/xgrammar_backend.py +26 -15
 - sglang/srt/debug_utils/dumper.py +10 -3
 - sglang/srt/disaggregation/ascend/conn.py +2 -2
 - sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
 - sglang/srt/disaggregation/base/conn.py +17 -4
 - sglang/srt/disaggregation/common/conn.py +268 -98
 - sglang/srt/disaggregation/decode.py +172 -39
 - sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
 - sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
 - sglang/srt/disaggregation/fake/conn.py +11 -3
 - sglang/srt/disaggregation/mooncake/conn.py +203 -555
 - sglang/srt/disaggregation/nixl/conn.py +217 -63
 - sglang/srt/disaggregation/prefill.py +113 -270
 - sglang/srt/disaggregation/utils.py +36 -5
 - sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
 - sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
 - sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
 - sglang/srt/distributed/device_communicators/pynccl.py +24 -12
 - sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
 - sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
 - sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
 - sglang/srt/distributed/naive_distributed.py +5 -4
 - sglang/srt/distributed/parallel_state.py +203 -97
 - sglang/srt/elastic_ep/elastic_ep.py +74 -0
 - sglang/srt/entrypoints/context.py +3 -2
 - sglang/srt/entrypoints/engine.py +85 -65
 - sglang/srt/entrypoints/grpc_server.py +632 -305
 - sglang/srt/entrypoints/harmony_utils.py +2 -2
 - sglang/srt/entrypoints/http_server.py +169 -17
 - sglang/srt/entrypoints/http_server_engine.py +1 -7
 - sglang/srt/entrypoints/openai/protocol.py +327 -34
 - sglang/srt/entrypoints/openai/serving_base.py +74 -8
 - sglang/srt/entrypoints/openai/serving_chat.py +202 -118
 - sglang/srt/entrypoints/openai/serving_classify.py +204 -0
 - sglang/srt/entrypoints/openai/serving_completions.py +20 -4
 - sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
 - sglang/srt/entrypoints/openai/serving_responses.py +47 -2
 - sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
 - sglang/srt/environ.py +323 -0
 - sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
 - sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
 - sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
 - sglang/srt/eplb/expert_distribution.py +3 -4
 - sglang/srt/eplb/expert_location.py +30 -5
 - sglang/srt/eplb/expert_location_dispatch.py +2 -2
 - sglang/srt/eplb/expert_location_updater.py +2 -2
 - sglang/srt/function_call/base_format_detector.py +17 -18
 - sglang/srt/function_call/function_call_parser.py +21 -16
 - sglang/srt/function_call/glm4_moe_detector.py +4 -8
 - sglang/srt/function_call/gpt_oss_detector.py +24 -1
 - sglang/srt/function_call/json_array_parser.py +61 -0
 - sglang/srt/function_call/kimik2_detector.py +17 -4
 - sglang/srt/function_call/utils.py +98 -7
 - sglang/srt/grpc/compile_proto.py +245 -0
 - sglang/srt/grpc/grpc_request_manager.py +915 -0
 - sglang/srt/grpc/health_servicer.py +189 -0
 - sglang/srt/grpc/scheduler_launcher.py +181 -0
 - sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
 - sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
 - sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
 - sglang/srt/layers/activation.py +11 -7
 - sglang/srt/layers/attention/aiter_backend.py +17 -18
 - sglang/srt/layers/attention/ascend_backend.py +125 -10
 - sglang/srt/layers/attention/attention_registry.py +226 -0
 - sglang/srt/layers/attention/base_attn_backend.py +32 -4
 - sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
 - sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
 - sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
 - sglang/srt/layers/attention/fla/chunk.py +0 -1
 - sglang/srt/layers/attention/fla/chunk_o.py +1 -1
 - sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
 - sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
 - sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
 - sglang/srt/layers/attention/fla/index.py +0 -2
 - sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
 - sglang/srt/layers/attention/fla/utils.py +0 -3
 - sglang/srt/layers/attention/fla/wy_fast.py +0 -2
 - sglang/srt/layers/attention/flashattention_backend.py +52 -15
 - sglang/srt/layers/attention/flashinfer_backend.py +357 -212
 - sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
 - sglang/srt/layers/attention/flashmla_backend.py +9 -7
 - sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
 - sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
 - sglang/srt/layers/attention/intel_amx_backend.py +1 -1
 - sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
 - sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
 - sglang/srt/layers/attention/mamba/mamba.py +514 -1
 - sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
 - sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
 - sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
 - sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
 - sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
 - sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
 - sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
 - sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
 - sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
 - sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
 - sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
 - sglang/srt/layers/attention/nsa/transform_index.py +144 -0
 - sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
 - sglang/srt/layers/attention/nsa/utils.py +23 -0
 - sglang/srt/layers/attention/nsa_backend.py +1201 -0
 - sglang/srt/layers/attention/tbo_backend.py +6 -6
 - sglang/srt/layers/attention/torch_flex_backend.py +325 -0
 - sglang/srt/layers/attention/triton_backend.py +249 -42
 - sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
 - sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
 - sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
 - sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
 - sglang/srt/layers/attention/utils.py +11 -7
 - sglang/srt/layers/attention/vision.py +61 -3
 - sglang/srt/layers/attention/wave_backend.py +4 -4
 - sglang/srt/layers/attention/xpu_backend.py +1028 -0
 - sglang/srt/layers/communicator.py +19 -7
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
 - sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
 - sglang/srt/layers/dp_attention.py +28 -1
 - sglang/srt/layers/elementwise.py +3 -1
 - sglang/srt/layers/layernorm.py +47 -15
 - sglang/srt/layers/linear.py +30 -5
 - sglang/srt/layers/logits_processor.py +161 -18
 - sglang/srt/layers/modelopt_utils.py +11 -0
 - sglang/srt/layers/moe/cutlass_moe.py +0 -2
 - sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
 - sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
 - sglang/srt/layers/moe/ep_moe/layer.py +243 -448
 - sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
 - sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
 - sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
 - sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
 - sglang/srt/layers/moe/moe_runner/runner.py +3 -0
 - sglang/srt/layers/moe/moe_runner/triton.py +3 -1
 - sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
 - sglang/srt/layers/moe/router.py +51 -15
 - sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
 - sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
 - sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
 - sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
 - sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
 - sglang/srt/layers/moe/topk.py +3 -2
 - sglang/srt/layers/moe/utils.py +27 -1
 - sglang/srt/layers/parameter.py +23 -6
 - sglang/srt/layers/quantization/__init__.py +2 -53
 - sglang/srt/layers/quantization/awq.py +183 -6
 - sglang/srt/layers/quantization/awq_triton.py +29 -0
 - sglang/srt/layers/quantization/base_config.py +20 -1
 - sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
 - sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
 - sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
 - sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
 - sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
 - sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
 - sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
 - sglang/srt/layers/quantization/fp8.py +86 -20
 - sglang/srt/layers/quantization/fp8_kernel.py +55 -10
 - sglang/srt/layers/quantization/fp8_utils.py +43 -15
 - sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
 - sglang/srt/layers/quantization/gptq.py +0 -1
 - sglang/srt/layers/quantization/int8_kernel.py +18 -2
 - sglang/srt/layers/quantization/marlin_utils.py +12 -0
 - sglang/srt/layers/quantization/modelopt_quant.py +141 -81
 - sglang/srt/layers/quantization/mxfp4.py +17 -34
 - sglang/srt/layers/quantization/petit.py +1 -1
 - sglang/srt/layers/quantization/quark/quark.py +3 -1
 - sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
 - sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
 - sglang/srt/layers/quantization/unquant.py +1 -4
 - sglang/srt/layers/quantization/utils.py +0 -1
 - sglang/srt/layers/quantization/w4afp8.py +51 -24
 - sglang/srt/layers/quantization/w8a8_int8.py +45 -27
 - sglang/srt/layers/radix_attention.py +59 -9
 - sglang/srt/layers/rotary_embedding.py +750 -46
 - sglang/srt/layers/sampler.py +84 -16
 - sglang/srt/layers/sparse_pooler.py +98 -0
 - sglang/srt/layers/utils.py +23 -1
 - sglang/srt/layers/vocab_parallel_embedding.py +4 -1
 - sglang/srt/lora/backend/base_backend.py +3 -3
 - sglang/srt/lora/backend/chunked_backend.py +348 -0
 - sglang/srt/lora/backend/triton_backend.py +9 -4
 - sglang/srt/lora/eviction_policy.py +139 -0
 - sglang/srt/lora/lora.py +7 -5
 - sglang/srt/lora/lora_manager.py +33 -7
 - sglang/srt/lora/lora_registry.py +1 -1
 - sglang/srt/lora/mem_pool.py +41 -17
 - sglang/srt/lora/triton_ops/__init__.py +4 -0
 - sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
 - sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
 - sglang/srt/lora/utils.py +7 -5
 - sglang/srt/managers/cache_controller.py +83 -152
 - sglang/srt/managers/data_parallel_controller.py +156 -87
 - sglang/srt/managers/detokenizer_manager.py +51 -24
 - sglang/srt/managers/io_struct.py +223 -129
 - sglang/srt/managers/mm_utils.py +49 -10
 - sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
 - sglang/srt/managers/multimodal_processor.py +1 -2
 - sglang/srt/managers/overlap_utils.py +130 -0
 - sglang/srt/managers/schedule_batch.py +340 -529
 - sglang/srt/managers/schedule_policy.py +158 -18
 - sglang/srt/managers/scheduler.py +665 -620
 - sglang/srt/managers/scheduler_input_blocker.py +1 -1
 - sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
 - sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
 - sglang/srt/managers/scheduler_pp_mixin.py +341 -0
 - sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
 - sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
 - sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
 - sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
 - sglang/srt/managers/tokenizer_manager.py +462 -226
 - sglang/srt/managers/tp_worker.py +217 -156
 - sglang/srt/managers/utils.py +79 -47
 - sglang/srt/mem_cache/allocator.py +21 -22
 - sglang/srt/mem_cache/allocator_ascend.py +42 -28
 - sglang/srt/mem_cache/base_prefix_cache.py +3 -3
 - sglang/srt/mem_cache/chunk_cache.py +20 -2
 - sglang/srt/mem_cache/common.py +480 -0
 - sglang/srt/mem_cache/evict_policy.py +38 -0
 - sglang/srt/mem_cache/hicache_storage.py +44 -2
 - sglang/srt/mem_cache/hiradix_cache.py +134 -34
 - sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
 - sglang/srt/mem_cache/memory_pool.py +602 -208
 - sglang/srt/mem_cache/memory_pool_host.py +134 -183
 - sglang/srt/mem_cache/multimodal_cache.py +0 -1
 - sglang/srt/mem_cache/radix_cache.py +263 -78
 - sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
 - sglang/srt/mem_cache/storage/__init__.py +10 -0
 - sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
 - sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
 - sglang/srt/mem_cache/storage/backend_factory.py +223 -0
 - sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
 - sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
 - sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
 - sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
 - sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
 - sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
 - sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
 - sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
 - sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
 - sglang/srt/mem_cache/swa_radix_cache.py +115 -58
 - sglang/srt/metrics/collector.py +113 -120
 - sglang/srt/metrics/func_timer.py +3 -8
 - sglang/srt/metrics/utils.py +8 -1
 - sglang/srt/model_executor/cpu_graph_runner.py +2 -2
 - sglang/srt/model_executor/cuda_graph_runner.py +81 -36
 - sglang/srt/model_executor/forward_batch_info.py +40 -50
 - sglang/srt/model_executor/model_runner.py +507 -319
 - sglang/srt/model_executor/npu_graph_runner.py +11 -5
 - sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
 - sglang/srt/model_loader/__init__.py +1 -1
 - sglang/srt/model_loader/loader.py +438 -37
 - sglang/srt/model_loader/utils.py +0 -1
 - sglang/srt/model_loader/weight_utils.py +200 -27
 - sglang/srt/models/apertus.py +2 -3
 - sglang/srt/models/arcee.py +2 -2
 - sglang/srt/models/bailing_moe.py +40 -56
 - sglang/srt/models/bailing_moe_nextn.py +3 -4
 - sglang/srt/models/bert.py +1 -1
 - sglang/srt/models/deepseek_nextn.py +25 -4
 - sglang/srt/models/deepseek_ocr.py +1516 -0
 - sglang/srt/models/deepseek_v2.py +793 -235
 - sglang/srt/models/dots_ocr.py +171 -0
 - sglang/srt/models/dots_vlm.py +0 -1
 - sglang/srt/models/dots_vlm_vit.py +1 -1
 - sglang/srt/models/falcon_h1.py +570 -0
 - sglang/srt/models/gemma3_causal.py +0 -2
 - sglang/srt/models/gemma3_mm.py +17 -1
 - sglang/srt/models/gemma3n_mm.py +2 -3
 - sglang/srt/models/glm4_moe.py +17 -40
 - sglang/srt/models/glm4_moe_nextn.py +4 -4
 - sglang/srt/models/glm4v.py +3 -2
 - sglang/srt/models/glm4v_moe.py +6 -6
 - sglang/srt/models/gpt_oss.py +12 -35
 - sglang/srt/models/grok.py +10 -23
 - sglang/srt/models/hunyuan.py +2 -7
 - sglang/srt/models/interns1.py +0 -1
 - sglang/srt/models/kimi_vl.py +1 -7
 - sglang/srt/models/kimi_vl_moonvit.py +4 -2
 - sglang/srt/models/llama.py +6 -2
 - sglang/srt/models/llama_eagle3.py +1 -1
 - sglang/srt/models/longcat_flash.py +6 -23
 - sglang/srt/models/longcat_flash_nextn.py +4 -15
 - sglang/srt/models/mimo.py +2 -13
 - sglang/srt/models/mimo_mtp.py +1 -2
 - sglang/srt/models/minicpmo.py +7 -5
 - sglang/srt/models/mixtral.py +1 -4
 - sglang/srt/models/mllama.py +1 -1
 - sglang/srt/models/mllama4.py +27 -6
 - sglang/srt/models/nemotron_h.py +511 -0
 - sglang/srt/models/olmo2.py +31 -4
 - sglang/srt/models/opt.py +5 -5
 - sglang/srt/models/phi.py +1 -1
 - sglang/srt/models/phi4mm.py +1 -1
 - sglang/srt/models/phimoe.py +0 -1
 - sglang/srt/models/pixtral.py +0 -3
 - sglang/srt/models/points_v15_chat.py +186 -0
 - sglang/srt/models/qwen.py +0 -1
 - sglang/srt/models/qwen2.py +0 -7
 - sglang/srt/models/qwen2_5_vl.py +5 -5
 - sglang/srt/models/qwen2_audio.py +2 -15
 - sglang/srt/models/qwen2_moe.py +70 -4
 - sglang/srt/models/qwen2_vl.py +6 -3
 - sglang/srt/models/qwen3.py +18 -3
 - sglang/srt/models/qwen3_moe.py +50 -38
 - sglang/srt/models/qwen3_next.py +43 -21
 - sglang/srt/models/qwen3_next_mtp.py +3 -4
 - sglang/srt/models/qwen3_omni_moe.py +661 -0
 - sglang/srt/models/qwen3_vl.py +791 -0
 - sglang/srt/models/qwen3_vl_moe.py +343 -0
 - sglang/srt/models/registry.py +15 -3
 - sglang/srt/models/roberta.py +55 -3
 - sglang/srt/models/sarashina2_vision.py +268 -0
 - sglang/srt/models/solar.py +505 -0
 - sglang/srt/models/starcoder2.py +357 -0
 - sglang/srt/models/step3_vl.py +3 -5
 - sglang/srt/models/torch_native_llama.py +9 -2
 - sglang/srt/models/utils.py +61 -0
 - sglang/srt/multimodal/processors/base_processor.py +21 -9
 - sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
 - sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
 - sglang/srt/multimodal/processors/dots_vlm.py +2 -4
 - sglang/srt/multimodal/processors/glm4v.py +1 -5
 - sglang/srt/multimodal/processors/internvl.py +20 -10
 - sglang/srt/multimodal/processors/janus_pro.py +0 -1
 - sglang/srt/multimodal/processors/mllama4.py +0 -8
 - sglang/srt/multimodal/processors/phi4mm.py +0 -1
 - sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
 - sglang/srt/multimodal/processors/qwen_vl.py +83 -17
 - sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
 - sglang/srt/multimodal/processors/step3_vl.py +1 -1
 - sglang/srt/parser/conversation.py +41 -0
 - sglang/srt/parser/jinja_template_utils.py +6 -0
 - sglang/srt/parser/reasoning_parser.py +0 -1
 - sglang/srt/sampling/custom_logit_processor.py +77 -2
 - sglang/srt/sampling/sampling_batch_info.py +36 -23
 - sglang/srt/sampling/sampling_params.py +75 -0
 - sglang/srt/server_args.py +1300 -338
 - sglang/srt/server_args_config_parser.py +146 -0
 - sglang/srt/single_batch_overlap.py +161 -0
 - sglang/srt/speculative/base_spec_worker.py +34 -0
 - sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
 - sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
 - sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
 - sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
 - sglang/srt/speculative/cpp_ngram/param.h +125 -0
 - sglang/srt/speculative/cpp_ngram/queue.h +71 -0
 - sglang/srt/speculative/draft_utils.py +226 -0
 - sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
 - sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
 - sglang/srt/speculative/eagle_info.py +786 -0
 - sglang/srt/speculative/eagle_info_v2.py +458 -0
 - sglang/srt/speculative/eagle_utils.py +113 -1270
 - sglang/srt/speculative/eagle_worker.py +120 -285
 - sglang/srt/speculative/eagle_worker_v2.py +702 -0
 - sglang/srt/speculative/ngram_info.py +433 -0
 - sglang/srt/speculative/ngram_worker.py +246 -0
 - sglang/srt/speculative/spec_info.py +49 -0
 - sglang/srt/speculative/spec_utils.py +641 -0
 - sglang/srt/speculative/standalone_worker.py +4 -14
 - sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
 - sglang/srt/tracing/trace.py +32 -6
 - sglang/srt/two_batch_overlap.py +35 -18
 - sglang/srt/utils/__init__.py +2 -0
 - sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
 - sglang/srt/{utils.py → utils/common.py} +583 -113
 - sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
 - sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
 - sglang/srt/{offloader.py → utils/offloader.py} +4 -4
 - sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
 - sglang/srt/utils/profile_merger.py +199 -0
 - sglang/srt/utils/rpd_utils.py +452 -0
 - sglang/srt/utils/slow_rank_detector.py +71 -0
 - sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
 - sglang/srt/warmup.py +8 -4
 - sglang/srt/weight_sync/utils.py +1 -1
 - sglang/test/attention/test_flashattn_backend.py +1 -1
 - sglang/test/attention/test_flashattn_mla_backend.py +0 -1
 - sglang/test/attention/test_prefix_chunk_info.py +0 -2
 - sglang/test/attention/test_trtllm_mla_backend.py +221 -53
 - sglang/test/few_shot_gsm8k_engine.py +2 -4
 - sglang/test/get_logits_ut.py +57 -0
 - sglang/test/kit_matched_stop.py +157 -0
 - sglang/test/longbench_v2/__init__.py +1 -0
 - sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
 - sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
 - sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
 - sglang/test/run_eval.py +120 -11
 - sglang/test/runners.py +3 -1
 - sglang/test/send_one.py +42 -7
 - sglang/test/simple_eval_common.py +8 -2
 - sglang/test/simple_eval_gpqa.py +0 -1
 - sglang/test/simple_eval_humaneval.py +0 -3
 - sglang/test/simple_eval_longbench_v2.py +344 -0
 - sglang/test/simple_eval_mmmu_vlm.py +441 -0
 - sglang/test/test_block_fp8.py +3 -4
 - sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
 - sglang/test/test_cutlass_moe.py +1 -2
 - sglang/test/test_cutlass_w4a8_moe.py +10 -20
 - sglang/test/test_deterministic.py +430 -0
 - sglang/test/test_deterministic_utils.py +73 -0
 - sglang/test/test_disaggregation_utils.py +93 -1
 - sglang/test/test_marlin_moe.py +0 -1
 - sglang/test/test_programs.py +1 -1
 - sglang/test/test_utils.py +432 -16
 - sglang/utils.py +10 -1
 - sglang/version.py +1 -1
 - {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
 - {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
 - sglang/srt/entrypoints/grpc_request_manager.py +0 -580
 - sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
 - sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
 - sglang/srt/mem_cache/lora_radix_cache.py +0 -421
 - sglang/srt/speculative/build_eagle_tree.py +0 -427
 - sglang/test/test_block_fp8_ep.py +0 -358
 - /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
 - /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
 - /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
 - /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
 - {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
 - {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
 - {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
 
    
        sglang/srt/models/glm4_moe.py
    CHANGED
    
    | 
         @@ -12,7 +12,7 @@ 
     | 
|
| 
       12 
12 
     | 
    
         
             
            # limitations under the License.
         
     | 
| 
       13 
13 
     | 
    
         
             
            # ==============================================================================
         
     | 
| 
       14 
14 
     | 
    
         | 
| 
       15 
     | 
    
         
            -
            """Inference-only GLM-4.5 model compatible with HuggingFace weights"""
         
     | 
| 
      
 15 
     | 
    
         
            +
            """Inference-only GLM-4.5, GLM-4.6 model compatible with HuggingFace weights"""
         
     | 
| 
       16 
16 
     | 
    
         | 
| 
       17 
17 
     | 
    
         
             
            import logging
         
     | 
| 
       18 
18 
     | 
    
         
             
            from typing import Any, Dict, Iterable, Optional, Tuple
         
     | 
| 
         @@ -27,7 +27,6 @@ from sglang.srt.distributed import ( 
     | 
|
| 
       27 
27 
     | 
    
         
             
                get_pp_group,
         
     | 
| 
       28 
28 
     | 
    
         
             
                get_tensor_model_parallel_rank,
         
     | 
| 
       29 
29 
     | 
    
         
             
                get_tensor_model_parallel_world_size,
         
     | 
| 
       30 
     | 
    
         
            -
                parallel_state,
         
     | 
| 
       31 
30 
     | 
    
         
             
                tensor_model_parallel_all_reduce,
         
     | 
| 
       32 
31 
     | 
    
         
             
            )
         
     | 
| 
       33 
32 
     | 
    
         
             
            from sglang.srt.layers.activation import SiluAndMul
         
     | 
| 
         @@ -44,30 +43,23 @@ from sglang.srt.layers.dp_attention import ( 
     | 
|
| 
       44 
43 
     | 
    
         
             
            )
         
     | 
| 
       45 
44 
     | 
    
         
             
            from sglang.srt.layers.layernorm import RMSNorm
         
     | 
| 
       46 
45 
     | 
    
         
             
            from sglang.srt.layers.linear import (
         
     | 
| 
       47 
     | 
    
         
            -
                ColumnParallelLinear,
         
     | 
| 
       48 
46 
     | 
    
         
             
                MergedColumnParallelLinear,
         
     | 
| 
       49 
47 
     | 
    
         
             
                QKVParallelLinear,
         
     | 
| 
       50 
     | 
    
         
            -
                ReplicatedLinear,
         
     | 
| 
       51 
48 
     | 
    
         
             
                RowParallelLinear,
         
     | 
| 
       52 
49 
     | 
    
         
             
            )
         
     | 
| 
       53 
50 
     | 
    
         
             
            from sglang.srt.layers.logits_processor import LogitsProcessor
         
     | 
| 
       54 
     | 
    
         
            -
            from sglang.srt.layers.moe import  
     | 
| 
      
 51 
     | 
    
         
            +
            from sglang.srt.layers.moe import get_moe_a2a_backend
         
     | 
| 
       55 
52 
     | 
    
         
             
            from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
         
     | 
| 
       56 
53 
     | 
    
         
             
            from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
         
     | 
| 
       57 
54 
     | 
    
         
             
            from sglang.srt.layers.moe.topk import TopK
         
     | 
| 
       58 
55 
     | 
    
         
             
            from sglang.srt.layers.quantization.base_config import QuantizationConfig
         
     | 
| 
       59 
     | 
    
         
            -
            from sglang.srt.layers.quantization.fp8_kernel import  
     | 
| 
       60 
     | 
    
         
            -
                is_fp8_fnuz,
         
     | 
| 
       61 
     | 
    
         
            -
                per_tensor_quant_mla_fp8,
         
     | 
| 
       62 
     | 
    
         
            -
                per_token_group_quant_mla_deep_gemm_masked_fp8,
         
     | 
| 
       63 
     | 
    
         
            -
            )
         
     | 
| 
      
 56 
     | 
    
         
            +
            from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
         
     | 
| 
       64 
57 
     | 
    
         
             
            from sglang.srt.layers.radix_attention import RadixAttention
         
     | 
| 
       65 
58 
     | 
    
         
             
            from sglang.srt.layers.rotary_embedding import get_rope
         
     | 
| 
       66 
59 
     | 
    
         
             
            from sglang.srt.layers.vocab_parallel_embedding import (
         
     | 
| 
       67 
60 
     | 
    
         
             
                ParallelLMHead,
         
     | 
| 
       68 
61 
     | 
    
         
             
                VocabParallelEmbedding,
         
     | 
| 
       69 
62 
     | 
    
         
             
            )
         
     | 
| 
       70 
     | 
    
         
            -
            from sglang.srt.managers.schedule_batch import global_server_args_dict
         
     | 
| 
       71 
63 
     | 
    
         
             
            from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
         
     | 
| 
       72 
64 
     | 
    
         
             
            from sglang.srt.model_executor.forward_batch_info import ForwardBatch
         
     | 
| 
       73 
65 
     | 
    
         
             
            from sglang.srt.model_loader.weight_utils import default_weight_loader
         
     | 
| 
         @@ -77,21 +69,17 @@ from sglang.srt.models.deepseek_v2 import ( 
     | 
|
| 
       77 
69 
     | 
    
         
             
                DeepseekV2Model,
         
     | 
| 
       78 
70 
     | 
    
         
             
                DeepseekV2MoE,
         
     | 
| 
       79 
71 
     | 
    
         
             
            )
         
     | 
| 
       80 
     | 
    
         
            -
            from sglang.srt. 
     | 
| 
      
 72 
     | 
    
         
            +
            from sglang.srt.server_args import get_global_server_args
         
     | 
| 
       81 
73 
     | 
    
         
             
            from sglang.srt.utils import (
         
     | 
| 
       82 
74 
     | 
    
         
             
                BumpAllocator,
         
     | 
| 
       83 
75 
     | 
    
         
             
                LazyValue,
         
     | 
| 
       84 
76 
     | 
    
         
             
                add_prefix,
         
     | 
| 
       85 
     | 
    
         
            -
                bind_or_assign,
         
     | 
| 
       86 
77 
     | 
    
         
             
                cpu_has_amx_support,
         
     | 
| 
       87 
78 
     | 
    
         
             
                get_bool_env_var,
         
     | 
| 
       88 
79 
     | 
    
         
             
                get_device_sm,
         
     | 
| 
       89 
     | 
    
         
            -
                get_int_env_var,
         
     | 
| 
       90 
80 
     | 
    
         
             
                is_cpu,
         
     | 
| 
       91 
81 
     | 
    
         
             
                is_cuda,
         
     | 
| 
       92 
     | 
    
         
            -
                is_flashinfer_available,
         
     | 
| 
       93 
82 
     | 
    
         
             
                is_hip,
         
     | 
| 
       94 
     | 
    
         
            -
                is_non_idle_and_non_empty,
         
     | 
| 
       95 
83 
     | 
    
         
             
                log_info_on_rank0,
         
     | 
| 
       96 
84 
     | 
    
         
             
                use_intel_amx_backend,
         
     | 
| 
       97 
85 
     | 
    
         
             
            )
         
     | 
| 
         @@ -395,7 +383,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE): 
     | 
|
| 
       395 
383 
     | 
    
         
             
                    self.n_shared_experts = config.n_shared_experts
         
     | 
| 
       396 
384 
     | 
    
         
             
                    self.num_fused_shared_experts = (
         
     | 
| 
       397 
385 
     | 
    
         
             
                        0
         
     | 
| 
       398 
     | 
    
         
            -
                        if  
     | 
| 
      
 386 
     | 
    
         
            +
                        if get_global_server_args().disable_shared_experts_fusion
         
     | 
| 
       399 
387 
     | 
    
         
             
                        else config.n_shared_experts
         
     | 
| 
       400 
388 
     | 
    
         
             
                    )
         
     | 
| 
       401 
389 
     | 
    
         
             
                    self.config = config
         
     | 
| 
         @@ -432,7 +420,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE): 
     | 
|
| 
       432 
420 
     | 
    
         
             
                    self.experts = get_moe_impl_class(quant_config)(
         
     | 
| 
       433 
421 
     | 
    
         
             
                        num_experts=config.n_routed_experts
         
     | 
| 
       434 
422 
     | 
    
         
             
                        + self.num_fused_shared_experts
         
     | 
| 
       435 
     | 
    
         
            -
                        +  
     | 
| 
      
 423 
     | 
    
         
            +
                        + get_global_server_args().ep_num_redundant_experts,
         
     | 
| 
       436 
424 
     | 
    
         
             
                        num_fused_shared_experts=self.num_fused_shared_experts,
         
     | 
| 
       437 
425 
     | 
    
         
             
                        top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
         
     | 
| 
       438 
426 
     | 
    
         
             
                        hidden_size=config.hidden_size,
         
     | 
| 
         @@ -471,12 +459,12 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE): 
     | 
|
| 
       471 
459 
     | 
    
         | 
| 
       472 
460 
     | 
    
         
             
                    self.top_k = config.num_experts_per_tok
         
     | 
| 
       473 
461 
     | 
    
         | 
| 
       474 
     | 
    
         
            -
                    if get_moe_a2a_backend().is_deepep():
         
     | 
| 
      
 462 
     | 
    
         
            +
                    if get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mooncake():
         
     | 
| 
       475 
463 
     | 
    
         
             
                        # TODO: we will support tp < ep in the future
         
     | 
| 
       476 
464 
     | 
    
         
             
                        self.ep_size = get_moe_expert_parallel_world_size()
         
     | 
| 
       477 
465 
     | 
    
         
             
                        self.num_experts = (
         
     | 
| 
       478 
466 
     | 
    
         
             
                            config.n_routed_experts
         
     | 
| 
       479 
     | 
    
         
            -
                            +  
     | 
| 
      
 467 
     | 
    
         
            +
                            + get_global_server_args().ep_num_redundant_experts
         
     | 
| 
       480 
468 
     | 
    
         
             
                        )
         
     | 
| 
       481 
469 
     | 
    
         
             
                        self.renormalize = config.norm_topk_prob
         
     | 
| 
       482 
470 
     | 
    
         
             
                        self.topk_group = config.topk_group
         
     | 
| 
         @@ -487,20 +475,9 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE): 
     | 
|
| 
       487 
475 
     | 
    
         
             
                            else None
         
     | 
| 
       488 
476 
     | 
    
         
             
                        )
         
     | 
| 
       489 
477 
     | 
    
         | 
| 
       490 
     | 
    
         
            -
             
     | 
| 
       491 
     | 
    
         
            -
             
     | 
| 
       492 
     | 
    
         
            -
             
     | 
| 
       493 
     | 
    
         
            -
                            permute_fusion=True,
         
     | 
| 
       494 
     | 
    
         
            -
                            num_experts=self.num_experts,
         
     | 
| 
       495 
     | 
    
         
            -
                            num_local_experts=config.n_routed_experts // self.tp_size,
         
     | 
| 
       496 
     | 
    
         
            -
                            hidden_size=config.hidden_size,
         
     | 
| 
       497 
     | 
    
         
            -
                            params_dtype=config.torch_dtype,
         
     | 
| 
       498 
     | 
    
         
            -
                            deepep_mode=get_deepep_mode(),
         
     | 
| 
       499 
     | 
    
         
            -
                            async_finish=True,
         
     | 
| 
       500 
     | 
    
         
            -
                            return_recv_hook=True,
         
     | 
| 
       501 
     | 
    
         
            -
                        )
         
     | 
| 
       502 
     | 
    
         
            -
             
     | 
| 
       503 
     | 
    
         
            -
                    self._enable_deepep_moe = get_moe_a2a_backend().is_deepep()
         
     | 
| 
      
 478 
     | 
    
         
            +
                    self._enable_a2a_moe = (
         
     | 
| 
      
 479 
     | 
    
         
            +
                        get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mooncake()
         
     | 
| 
      
 480 
     | 
    
         
            +
                    )
         
     | 
| 
       504 
481 
     | 
    
         | 
| 
       505 
482 
     | 
    
         
             
                def forward_normal_dual_stream(
         
     | 
| 
       506 
483 
     | 
    
         
             
                    self,
         
     | 
| 
         @@ -664,7 +641,7 @@ class Glm4MoeDecoderLayer(DeepseekV2DecoderLayer): 
     | 
|
| 
       664 
641 
     | 
    
         
             
                        layer_scatter_modes=self.layer_scatter_modes,
         
     | 
| 
       665 
642 
     | 
    
         
             
                        input_layernorm=self.input_layernorm,
         
     | 
| 
       666 
643 
     | 
    
         
             
                        post_attention_layernorm=self.post_attention_layernorm,
         
     | 
| 
       667 
     | 
    
         
            -
                        allow_reduce_scatter= 
     | 
| 
      
 644 
     | 
    
         
            +
                        allow_reduce_scatter=False,
         
     | 
| 
       668 
645 
     | 
    
         
             
                    )
         
     | 
| 
       669 
646 
     | 
    
         | 
| 
       670 
647 
     | 
    
         
             
                def forward(
         
     | 
| 
         @@ -758,7 +735,7 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM): 
     | 
|
| 
       758 
735 
     | 
    
         
             
                        config.hidden_size,
         
     | 
| 
       759 
736 
     | 
    
         
             
                        quant_config=quant_config,
         
     | 
| 
       760 
737 
     | 
    
         
             
                        prefix=add_prefix("lm_head", prefix),
         
     | 
| 
       761 
     | 
    
         
            -
                        use_attn_tp_group= 
     | 
| 
      
 738 
     | 
    
         
            +
                        use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
         
     | 
| 
       762 
739 
     | 
    
         
             
                    )
         
     | 
| 
       763 
740 
     | 
    
         
             
                    self.logits_processor = LogitsProcessor(config)
         
     | 
| 
       764 
741 
     | 
    
         | 
| 
         @@ -774,7 +751,7 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM): 
     | 
|
| 
       774 
751 
     | 
    
         
             
                    self, architecture: str = "Glm4MoeForCausalLM"
         
     | 
| 
       775 
752 
     | 
    
         
             
                ):
         
     | 
| 
       776 
753 
     | 
    
         
             
                    self.num_fused_shared_experts = 0
         
     | 
| 
       777 
     | 
    
         
            -
                    if  
     | 
| 
      
 754 
     | 
    
         
            +
                    if get_global_server_args().disable_shared_experts_fusion:
         
     | 
| 
       778 
755 
     | 
    
         
             
                        return
         
     | 
| 
       779 
756 
     | 
    
         | 
| 
       780 
757 
     | 
    
         
             
                    # Only Deepseek V3/R1 can use shared experts fusion optimization now.
         
     | 
| 
         @@ -785,12 +762,12 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM): 
     | 
|
| 
       785 
762 
     | 
    
         
             
                        or self.config.architectures[0] != architecture
         
     | 
| 
       786 
763 
     | 
    
         
             
                        or self.config.n_shared_experts != 1
         
     | 
| 
       787 
764 
     | 
    
         
             
                    ):
         
     | 
| 
       788 
     | 
    
         
            -
                        disable_reason = "Only GLM-4.5 on NV-platform with capability >= 80 can use shared experts fusion optimization."
         
     | 
| 
      
 765 
     | 
    
         
            +
                        disable_reason = "Only GLM-4.5 or GLM-4.6 on NV-platform with capability >= 80 can use shared experts fusion optimization."
         
     | 
| 
       789 
766 
     | 
    
         
             
                    elif get_moe_expert_parallel_world_size() > 1:
         
     | 
| 
       790 
     | 
    
         
            -
                        disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization under expert parallelism."
         
     | 
| 
      
 767 
     | 
    
         
            +
                        disable_reason = "Deepseek and GLM-4.5 or GLM-4.6 can not use shared experts fusion optimization under expert parallelism."
         
     | 
| 
       791 
768 
     | 
    
         | 
| 
       792 
769 
     | 
    
         
             
                    if disable_reason is not None:
         
     | 
| 
       793 
     | 
    
         
            -
                         
     | 
| 
      
 770 
     | 
    
         
            +
                        get_global_server_args().disable_shared_experts_fusion = True
         
     | 
| 
       794 
771 
     | 
    
         
             
                        self.num_fused_shared_experts = 0
         
     | 
| 
       795 
772 
     | 
    
         
             
                        log_info_on_rank0(
         
     | 
| 
       796 
773 
     | 
    
         
             
                            logger,
         
     | 
| 
         @@ -12,7 +12,7 @@ 
     | 
|
| 
       12 
12 
     | 
    
         
             
            # limitations under the License.
         
     | 
| 
       13 
13 
     | 
    
         
             
            # ==============================================================================
         
     | 
| 
       14 
14 
     | 
    
         | 
| 
       15 
     | 
    
         
            -
            """Inference-only GLM-4.5 NextN Speculative Decoding."""
         
     | 
| 
      
 15 
     | 
    
         
            +
            """Inference-only GLM-4.5, GLM-4.6 NextN Speculative Decoding."""
         
     | 
| 
       16 
16 
     | 
    
         
             
            import logging
         
     | 
| 
       17 
17 
     | 
    
         
             
            from typing import Iterable, Optional, Tuple
         
     | 
| 
       18 
18 
     | 
    
         | 
| 
         @@ -30,9 +30,9 @@ from sglang.srt.layers.vocab_parallel_embedding import ( 
     | 
|
| 
       30 
30 
     | 
    
         
             
                ParallelLMHead,
         
     | 
| 
       31 
31 
     | 
    
         
             
                VocabParallelEmbedding,
         
     | 
| 
       32 
32 
     | 
    
         
             
            )
         
     | 
| 
       33 
     | 
    
         
            -
            from sglang.srt.managers.schedule_batch import global_server_args_dict
         
     | 
| 
       34 
33 
     | 
    
         
             
            from sglang.srt.model_executor.forward_batch_info import ForwardBatch
         
     | 
| 
       35 
34 
     | 
    
         
             
            from sglang.srt.models.glm4_moe import Glm4MoeDecoderLayer, Glm4MoeForCausalLM
         
     | 
| 
      
 35 
     | 
    
         
            +
            from sglang.srt.server_args import get_global_server_args
         
     | 
| 
       36 
36 
     | 
    
         
             
            from sglang.srt.utils import BumpAllocator, add_prefix
         
     | 
| 
       37 
37 
     | 
    
         | 
| 
       38 
38 
     | 
    
         
             
            logger = logging.getLogger(__name__)
         
     | 
| 
         @@ -48,7 +48,7 @@ class Glm4MoeModelNextN(nn.Module): 
     | 
|
| 
       48 
48 
     | 
    
         
             
                    super().__init__()
         
     | 
| 
       49 
49 
     | 
    
         
             
                    if quant_config is not None and quant_config.get_name() == "modelopt_fp4":
         
     | 
| 
       50 
50 
     | 
    
         
             
                        logger.warning(
         
     | 
| 
       51 
     | 
    
         
            -
                            "Overriding Glm4MoeForCausalLMNextN quant config for modelopt_fp4 GLM-4.5 model."
         
     | 
| 
      
 51 
     | 
    
         
            +
                            "Overriding Glm4MoeForCausalLMNextN quant config for modelopt_fp4 GLM-4.5 / GLM-4.6 model."
         
     | 
| 
       52 
52 
     | 
    
         
             
                        )
         
     | 
| 
       53 
53 
     | 
    
         
             
                        quant_config = None
         
     | 
| 
       54 
54 
     | 
    
         | 
| 
         @@ -145,7 +145,7 @@ class Glm4MoeForCausalLMNextN(Glm4MoeForCausalLM): 
     | 
|
| 
       145 
145 
     | 
    
         
             
                        config.hidden_size,
         
     | 
| 
       146 
146 
     | 
    
         
             
                        quant_config=quant_config,
         
     | 
| 
       147 
147 
     | 
    
         
             
                        prefix=add_prefix("model.shared_head.head", prefix),
         
     | 
| 
       148 
     | 
    
         
            -
                        use_attn_tp_group= 
     | 
| 
      
 148 
     | 
    
         
            +
                        use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
         
     | 
| 
       149 
149 
     | 
    
         
             
                    )
         
     | 
| 
       150 
150 
     | 
    
         
             
                    self.logits_processor = LogitsProcessor(config)
         
     | 
| 
       151 
151 
     | 
    
         | 
    
        sglang/srt/models/glm4v.py
    CHANGED
    
    | 
         @@ -7,9 +7,9 @@ import torch.nn as nn 
     | 
|
| 
       7 
7 
     | 
    
         
             
            import torch.nn.functional as F
         
     | 
| 
       8 
8 
     | 
    
         
             
            from transformers.models.glm4v.configuration_glm4v import Glm4vConfig, Glm4vVisionConfig
         
     | 
| 
       9 
9 
     | 
    
         | 
| 
       10 
     | 
    
         
            -
            from sglang.srt.hf_transformers_utils import get_processor
         
     | 
| 
       11 
10 
     | 
    
         
             
            from sglang.srt.layers.activation import SiluAndMul
         
     | 
| 
       12 
11 
     | 
    
         
             
            from sglang.srt.layers.attention import vision_utils
         
     | 
| 
      
 12 
     | 
    
         
            +
            from sglang.srt.layers.dp_attention import get_attention_tp_size
         
     | 
| 
       13 
13 
     | 
    
         
             
            from sglang.srt.layers.layernorm import RMSNorm
         
     | 
| 
       14 
14 
     | 
    
         
             
            from sglang.srt.layers.linear import (
         
     | 
| 
       15 
15 
     | 
    
         
             
                ColumnParallelLinear,
         
     | 
| 
         @@ -28,6 +28,7 @@ from sglang.srt.models.qwen2_5_vl import ( 
     | 
|
| 
       28 
28 
     | 
    
         
             
                Qwen2_5_VLForConditionalGeneration,
         
     | 
| 
       29 
29 
     | 
    
         
             
            )
         
     | 
| 
       30 
30 
     | 
    
         
             
            from sglang.srt.utils import add_prefix
         
     | 
| 
      
 31 
     | 
    
         
            +
            from sglang.srt.utils.hf_transformers_utils import get_processor
         
     | 
| 
       31 
32 
     | 
    
         | 
| 
       32 
33 
     | 
    
         
             
            logger = logging.getLogger(__name__)
         
     | 
| 
       33 
34 
     | 
    
         | 
| 
         @@ -434,7 +435,7 @@ class Glm4vVisionModel(nn.Module): 
     | 
|
| 
       434 
435 
     | 
    
         
             
                    cu_seqlens = torch.repeat_interleave(
         
     | 
| 
       435 
436 
     | 
    
         
             
                        grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
         
     | 
| 
       436 
437 
     | 
    
         
             
                    ).cumsum(dim=0, dtype=torch.int32)
         
     | 
| 
       437 
     | 
    
         
            -
                    cu_seqlens =  
     | 
| 
      
 438 
     | 
    
         
            +
                    cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
         
     | 
| 
       438 
439 
     | 
    
         | 
| 
       439 
440 
     | 
    
         
             
                    seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
         
     | 
| 
       440 
441 
     | 
    
         
             
                    x = self.embeddings(
         
     | 
    
        sglang/srt/models/glm4v_moe.py
    CHANGED
    
    | 
         @@ -10,18 +10,18 @@ from sglang.srt.distributed import ( 
     | 
|
| 
       10 
10 
     | 
    
         
             
                get_moe_expert_parallel_world_size,
         
     | 
| 
       11 
11 
     | 
    
         
             
                get_tensor_model_parallel_world_size,
         
     | 
| 
       12 
12 
     | 
    
         
             
            )
         
     | 
| 
       13 
     | 
    
         
            -
            from sglang.srt.hf_transformers_utils import get_processor
         
     | 
| 
       14 
13 
     | 
    
         
             
            from sglang.srt.layers.attention import vision_utils
         
     | 
| 
       15 
14 
     | 
    
         
             
            from sglang.srt.layers.logits_processor import LogitsProcessor
         
     | 
| 
       16 
15 
     | 
    
         
             
            from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
         
     | 
| 
       17 
16 
     | 
    
         
             
            from sglang.srt.layers.pooler import Pooler, PoolingType
         
     | 
| 
       18 
17 
     | 
    
         
             
            from sglang.srt.layers.quantization.base_config import QuantizationConfig
         
     | 
| 
       19 
18 
     | 
    
         
             
            from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
         
     | 
| 
       20 
     | 
    
         
            -
            from sglang.srt.managers.schedule_batch import global_server_args_dict
         
     | 
| 
       21 
19 
     | 
    
         
             
            from sglang.srt.model_loader.weight_utils import default_weight_loader
         
     | 
| 
       22 
20 
     | 
    
         
             
            from sglang.srt.models.glm4_moe import Glm4MoeModel
         
     | 
| 
       23 
21 
     | 
    
         
             
            from sglang.srt.models.glm4v import Glm4vForConditionalGeneration, Glm4vVisionModel
         
     | 
| 
      
 22 
     | 
    
         
            +
            from sglang.srt.server_args import get_global_server_args
         
     | 
| 
       24 
23 
     | 
    
         
             
            from sglang.srt.utils import add_prefix, is_cuda, log_info_on_rank0
         
     | 
| 
      
 24 
     | 
    
         
            +
            from sglang.srt.utils.hf_transformers_utils import get_processor
         
     | 
| 
       25 
25 
     | 
    
         | 
| 
       26 
26 
     | 
    
         
             
            _is_cuda = is_cuda()
         
     | 
| 
       27 
27 
     | 
    
         | 
| 
         @@ -47,7 +47,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration): 
     | 
|
| 
       47 
47 
     | 
    
         
             
                    self.determine_num_fused_shared_experts("Glm4MoeForCausalLM")
         
     | 
| 
       48 
48 
     | 
    
         
             
                    self.num_fused_shared_experts = (
         
     | 
| 
       49 
49 
     | 
    
         
             
                        0
         
     | 
| 
       50 
     | 
    
         
            -
                        if  
     | 
| 
      
 50 
     | 
    
         
            +
                        if get_global_server_args().disable_shared_experts_fusion
         
     | 
| 
       51 
51 
     | 
    
         
             
                        else config.n_shared_experts
         
     | 
| 
       52 
52 
     | 
    
         
             
                    )
         
     | 
| 
       53 
53 
     | 
    
         | 
| 
         @@ -68,7 +68,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration): 
     | 
|
| 
       68 
68 
     | 
    
         
             
                        config.hidden_size,
         
     | 
| 
       69 
69 
     | 
    
         
             
                        quant_config=quant_config,
         
     | 
| 
       70 
70 
     | 
    
         
             
                        prefix=add_prefix("lm_head", prefix),
         
     | 
| 
       71 
     | 
    
         
            -
                        use_attn_tp_group= 
     | 
| 
      
 71 
     | 
    
         
            +
                        use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
         
     | 
| 
       72 
72 
     | 
    
         
             
                    )
         
     | 
| 
       73 
73 
     | 
    
         
             
                    self.logits_processor = LogitsProcessor(config)
         
     | 
| 
       74 
74 
     | 
    
         
             
                    self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
         
     | 
| 
         @@ -81,7 +81,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration): 
     | 
|
| 
       81 
81 
     | 
    
         
             
                    self, architecture: str = "Glm4MoeForCausalLM"
         
     | 
| 
       82 
82 
     | 
    
         
             
                ):
         
     | 
| 
       83 
83 
     | 
    
         
             
                    self.num_fused_shared_experts = 0
         
     | 
| 
       84 
     | 
    
         
            -
                    if  
     | 
| 
      
 84 
     | 
    
         
            +
                    if get_global_server_args().disable_shared_experts_fusion:
         
     | 
| 
       85 
85 
     | 
    
         
             
                        return
         
     | 
| 
       86 
86 
     | 
    
         | 
| 
       87 
87 
     | 
    
         
             
                    # Only Deepseek V3/R1 can use shared experts fusion optimization now.
         
     | 
| 
         @@ -97,7 +97,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration): 
     | 
|
| 
       97 
97 
     | 
    
         
             
                        disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization under expert parallelism."
         
     | 
| 
       98 
98 
     | 
    
         | 
| 
       99 
99 
     | 
    
         
             
                    if disable_reason is not None:
         
     | 
| 
       100 
     | 
    
         
            -
                         
     | 
| 
      
 100 
     | 
    
         
            +
                        get_global_server_args().disable_shared_experts_fusion = True
         
     | 
| 
       101 
101 
     | 
    
         
             
                        self.num_fused_shared_experts = 0
         
     | 
| 
       102 
102 
     | 
    
         
             
                        log_info_on_rank0(
         
     | 
| 
       103 
103 
     | 
    
         
             
                            logger,
         
     | 
    
        sglang/srt/models/gpt_oss.py
    CHANGED
    
    | 
         @@ -63,9 +63,13 @@ from sglang.srt.layers.vocab_parallel_embedding import ( 
     | 
|
| 
       63 
63 
     | 
    
         
             
                ParallelLMHead,
         
     | 
| 
       64 
64 
     | 
    
         
             
                VocabParallelEmbedding,
         
     | 
| 
       65 
65 
     | 
    
         
             
            )
         
     | 
| 
       66 
     | 
    
         
            -
            from sglang.srt.managers.schedule_batch import global_server_args_dict
         
     | 
| 
       67 
66 
     | 
    
         
             
            from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
         
     | 
| 
       68 
67 
     | 
    
         
             
            from sglang.srt.model_loader.weight_utils import default_weight_loader
         
     | 
| 
      
 68 
     | 
    
         
            +
            from sglang.srt.models.utils import (
         
     | 
| 
      
 69 
     | 
    
         
            +
                create_fused_set_kv_buffer_arg,
         
     | 
| 
      
 70 
     | 
    
         
            +
                enable_fused_set_kv_buffer,
         
     | 
| 
      
 71 
     | 
    
         
            +
            )
         
     | 
| 
      
 72 
     | 
    
         
            +
            from sglang.srt.server_args import get_global_server_args
         
     | 
| 
       69 
73 
     | 
    
         
             
            from sglang.srt.utils import (
         
     | 
| 
       70 
74 
     | 
    
         
             
                LazyValue,
         
     | 
| 
       71 
75 
     | 
    
         
             
                add_prefix,
         
     | 
| 
         @@ -81,7 +85,7 @@ _is_sm100_supported = is_cuda() and is_sm100_supported() 
     | 
|
| 
       81 
85 
     | 
    
         | 
| 
       82 
86 
     | 
    
         | 
| 
       83 
87 
     | 
    
         
             
            if _is_cuda:
         
     | 
| 
       84 
     | 
    
         
            -
                from sgl_kernel import FusedSetKVBufferArg
         
     | 
| 
      
 88 
     | 
    
         
            +
                from sgl_kernel import FusedSetKVBufferArg  # noqa: F401
         
     | 
| 
       85 
89 
     | 
    
         | 
| 
       86 
90 
     | 
    
         | 
| 
       87 
91 
     | 
    
         
             
            class GptOssConfig(PretrainedConfig):
         
     | 
| 
         @@ -134,7 +138,7 @@ class GptOssSparseMoeBlock(nn.Module): 
     | 
|
| 
       134 
138 
     | 
    
         
             
                        }
         
     | 
| 
       135 
139 
     | 
    
         
             
                    self.experts = experts_type(
         
     | 
| 
       136 
140 
     | 
    
         
             
                        num_experts=config.num_local_experts
         
     | 
| 
       137 
     | 
    
         
            -
                        +  
     | 
| 
      
 141 
     | 
    
         
            +
                        + get_global_server_args().ep_num_redundant_experts,
         
     | 
| 
       138 
142 
     | 
    
         
             
                        top_k=config.num_experts_per_tok,
         
     | 
| 
       139 
143 
     | 
    
         
             
                        layer_id=layer_id,
         
     | 
| 
       140 
144 
     | 
    
         
             
                        hidden_size=config.hidden_size,
         
     | 
| 
         @@ -193,33 +197,6 @@ class GptOssSparseMoeBlock(nn.Module): 
     | 
|
| 
       193 
197 
     | 
    
         
             
                    return ans
         
     | 
| 
       194 
198 
     | 
    
         | 
| 
       195 
199 
     | 
    
         | 
| 
       196 
     | 
    
         
            -
            def _enable_fused_set_kv_buffer(forward_batch: ForwardBatch):
         
     | 
| 
       197 
     | 
    
         
            -
                """Enable fused set_kv_buffer only on CUDA with bfloat16 KV cache."""
         
     | 
| 
       198 
     | 
    
         
            -
                return _is_cuda and forward_batch.token_to_kv_pool.dtype == torch.bfloat16
         
     | 
| 
       199 
     | 
    
         
            -
             
     | 
| 
       200 
     | 
    
         
            -
             
     | 
| 
       201 
     | 
    
         
            -
            # TODO maybe move to a model-common utils
         
     | 
| 
       202 
     | 
    
         
            -
            def _create_fused_set_kv_buffer_arg(
         
     | 
| 
       203 
     | 
    
         
            -
                value: torch.Tensor,
         
     | 
| 
       204 
     | 
    
         
            -
                layer: RadixAttention,
         
     | 
| 
       205 
     | 
    
         
            -
                forward_batch: ForwardBatch,
         
     | 
| 
       206 
     | 
    
         
            -
            ):
         
     | 
| 
       207 
     | 
    
         
            -
                layer_id = layer.layer_id
         
     | 
| 
       208 
     | 
    
         
            -
                token_to_kv_pool = forward_batch.token_to_kv_pool
         
     | 
| 
       209 
     | 
    
         
            -
             
     | 
| 
       210 
     | 
    
         
            -
                k_buffer = token_to_kv_pool.get_key_buffer(layer_id)
         
     | 
| 
       211 
     | 
    
         
            -
                v_buffer = token_to_kv_pool.get_value_buffer(layer_id)
         
     | 
| 
       212 
     | 
    
         
            -
             
     | 
| 
       213 
     | 
    
         
            -
                return FusedSetKVBufferArg(
         
     | 
| 
       214 
     | 
    
         
            -
                    value=value,
         
     | 
| 
       215 
     | 
    
         
            -
                    k_buffer=k_buffer.view(k_buffer.shape[0], -1),
         
     | 
| 
       216 
     | 
    
         
            -
                    v_buffer=v_buffer.view(v_buffer.shape[0], -1),
         
     | 
| 
       217 
     | 
    
         
            -
                    k_scale=layer.k_scale,
         
     | 
| 
       218 
     | 
    
         
            -
                    v_scale=layer.v_scale,
         
     | 
| 
       219 
     | 
    
         
            -
                    cache_loc=forward_batch.out_cache_loc,
         
     | 
| 
       220 
     | 
    
         
            -
                )
         
     | 
| 
       221 
     | 
    
         
            -
             
     | 
| 
       222 
     | 
    
         
            -
             
     | 
| 
       223 
200 
     | 
    
         
             
            class GptOssAttention(nn.Module):
         
     | 
| 
       224 
201 
     | 
    
         
             
                def __init__(
         
     | 
| 
       225 
202 
     | 
    
         
             
                    self,
         
     | 
| 
         @@ -282,7 +259,7 @@ class GptOssAttention(nn.Module): 
     | 
|
| 
       282 
259 
     | 
    
         | 
| 
       283 
260 
     | 
    
         
             
                    # Choose dtype of sinks based on attention backend: trtllm_mha requires float32,
         
     | 
| 
       284 
261 
     | 
    
         
             
                    # others can use bfloat16
         
     | 
| 
       285 
     | 
    
         
            -
                    attn_backend =  
     | 
| 
      
 262 
     | 
    
         
            +
                    attn_backend = get_global_server_args().attention_backend
         
     | 
| 
       286 
263 
     | 
    
         
             
                    sinks_dtype = torch.float32 if attn_backend == "trtllm_mha" else torch.bfloat16
         
     | 
| 
       287 
264 
     | 
    
         
             
                    self.sinks = nn.Parameter(
         
     | 
| 
       288 
265 
     | 
    
         
             
                        torch.empty(self.num_heads, dtype=sinks_dtype), requires_grad=False
         
     | 
| 
         @@ -337,12 +314,12 @@ class GptOssAttention(nn.Module): 
     | 
|
| 
       337 
314 
     | 
    
         
             
                        q,
         
     | 
| 
       338 
315 
     | 
    
         
             
                        k,
         
     | 
| 
       339 
316 
     | 
    
         
             
                        fused_set_kv_buffer_arg=(
         
     | 
| 
       340 
     | 
    
         
            -
                             
     | 
| 
      
 317 
     | 
    
         
            +
                            create_fused_set_kv_buffer_arg(
         
     | 
| 
       341 
318 
     | 
    
         
             
                                value=v,
         
     | 
| 
       342 
319 
     | 
    
         
             
                                layer=self.attn,
         
     | 
| 
       343 
320 
     | 
    
         
             
                                forward_batch=forward_batch,
         
     | 
| 
       344 
321 
     | 
    
         
             
                            )
         
     | 
| 
       345 
     | 
    
         
            -
                            if  
     | 
| 
      
 322 
     | 
    
         
            +
                            if enable_fused_set_kv_buffer(forward_batch)
         
     | 
| 
       346 
323 
     | 
    
         
             
                            else None
         
     | 
| 
       347 
324 
     | 
    
         
             
                        ),
         
     | 
| 
       348 
325 
     | 
    
         
             
                    )
         
     | 
| 
         @@ -356,7 +333,7 @@ class GptOssAttention(nn.Module): 
     | 
|
| 
       356 
333 
     | 
    
         
             
                    attn_output = self.attn(
         
     | 
| 
       357 
334 
     | 
    
         
             
                        *inner_state,
         
     | 
| 
       358 
335 
     | 
    
         
             
                        sinks=self.sinks,
         
     | 
| 
       359 
     | 
    
         
            -
                        save_kv_cache=not  
     | 
| 
      
 336 
     | 
    
         
            +
                        save_kv_cache=not enable_fused_set_kv_buffer(forward_batch),
         
     | 
| 
       360 
337 
     | 
    
         
             
                    )
         
     | 
| 
       361 
338 
     | 
    
         
             
                    output, _ = self.o_proj(attn_output)
         
     | 
| 
       362 
339 
     | 
    
         
             
                    return output
         
     | 
| 
         @@ -614,7 +591,7 @@ class GptOssForCausalLM(nn.Module): 
     | 
|
| 
       614 
591 
     | 
    
         
             
                        config.hidden_size,
         
     | 
| 
       615 
592 
     | 
    
         
             
                        # quant_config=quant_config,
         
     | 
| 
       616 
593 
     | 
    
         
             
                        prefix=add_prefix("lm_head", prefix),
         
     | 
| 
       617 
     | 
    
         
            -
                        use_attn_tp_group= 
     | 
| 
      
 594 
     | 
    
         
            +
                        use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
         
     | 
| 
       618 
595 
     | 
    
         
             
                    )
         
     | 
| 
       619 
596 
     | 
    
         
             
                    self.logits_processor = LogitsProcessor(config)
         
     | 
| 
       620 
597 
     | 
    
         
             
                    self.capture_aux_hidden_states = False
         
     | 
    
        sglang/srt/models/grok.py
    CHANGED
    
    | 
         @@ -28,7 +28,6 @@ from torch import nn 
     | 
|
| 
       28 
28 
     | 
    
         
             
            from transformers import PretrainedConfig
         
     | 
| 
       29 
29 
     | 
    
         | 
| 
       30 
30 
     | 
    
         
             
            from sglang.srt.distributed import (
         
     | 
| 
       31 
     | 
    
         
            -
                get_moe_expert_parallel_world_size,
         
     | 
| 
       32 
31 
     | 
    
         
             
                get_tensor_model_parallel_rank,
         
     | 
| 
       33 
32 
     | 
    
         
             
                get_tensor_model_parallel_world_size,
         
     | 
| 
       34 
33 
     | 
    
         
             
                tensor_model_parallel_all_gather,
         
     | 
| 
         @@ -36,7 +35,6 @@ from sglang.srt.distributed import ( 
     | 
|
| 
       36 
35 
     | 
    
         
             
            )
         
     | 
| 
       37 
36 
     | 
    
         
             
            from sglang.srt.layers.activation import GeluAndMul
         
     | 
| 
       38 
37 
     | 
    
         
             
            from sglang.srt.layers.elementwise import (
         
     | 
| 
       39 
     | 
    
         
            -
                experts_combine_triton,
         
     | 
| 
       40 
38 
     | 
    
         
             
                fused_dual_residual_rmsnorm,
         
     | 
| 
       41 
39 
     | 
    
         
             
                fused_rmsnorm,
         
     | 
| 
       42 
40 
     | 
    
         
             
                gelu_and_mul_triton,
         
     | 
| 
         @@ -49,7 +47,6 @@ from sglang.srt.layers.linear import ( 
     | 
|
| 
       49 
47 
     | 
    
         
             
                RowParallelLinear,
         
     | 
| 
       50 
48 
     | 
    
         
             
            )
         
     | 
| 
       51 
49 
     | 
    
         
             
            from sglang.srt.layers.logits_processor import LogitsProcessor
         
     | 
| 
       52 
     | 
    
         
            -
            from sglang.srt.layers.moe.ep_moe.layer import EPMoE
         
     | 
| 
       53 
50 
     | 
    
         
             
            from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
         
     | 
| 
       54 
51 
     | 
    
         
             
            from sglang.srt.layers.moe.router import fused_moe_router_shim
         
     | 
| 
       55 
52 
     | 
    
         
             
            from sglang.srt.layers.moe.topk import TopK
         
     | 
| 
         @@ -65,10 +62,10 @@ from sglang.srt.layers.vocab_parallel_embedding import ( 
     | 
|
| 
       65 
62 
     | 
    
         
             
                ParallelLMHead,
         
     | 
| 
       66 
63 
     | 
    
         
             
                VocabParallelEmbedding,
         
     | 
| 
       67 
64 
     | 
    
         
             
            )
         
     | 
| 
       68 
     | 
    
         
            -
            from sglang.srt.managers.schedule_batch import global_server_args_dict
         
     | 
| 
       69 
65 
     | 
    
         
             
            from sglang.srt.model_executor.forward_batch_info import ForwardBatch
         
     | 
| 
       70 
66 
     | 
    
         
             
            from sglang.srt.model_loader.loader import DefaultModelLoader
         
     | 
| 
       71 
67 
     | 
    
         
             
            from sglang.srt.model_loader.weight_utils import default_weight_loader
         
     | 
| 
      
 68 
     | 
    
         
            +
            from sglang.srt.server_args import get_global_server_args
         
     | 
| 
       72 
69 
     | 
    
         
             
            from sglang.srt.utils import add_prefix, dispose_tensor, dump_to_file
         
     | 
| 
       73 
70 
     | 
    
         | 
| 
       74 
71 
     | 
    
         
             
            logger = logging.getLogger(__name__)
         
     | 
| 
         @@ -76,9 +73,6 @@ logger = logging.getLogger(__name__) 
     | 
|
| 
       76 
73 
     | 
    
         | 
| 
       77 
74 
     | 
    
         
             
            # Dump tensors for debugging
         
     | 
| 
       78 
75 
     | 
    
         
             
            debug_tensor_dump_output_folder = None
         
     | 
| 
       79 
     | 
    
         
            -
            debug_tensor_dump_prefill_only = False
         
     | 
| 
       80 
     | 
    
         
            -
            # Skip all the other tensor dumps, only dump the target logits
         
     | 
| 
       81 
     | 
    
         
            -
            debug_tensor_dump_only_target_logprobs = False
         
     | 
| 
       82 
76 
     | 
    
         
             
            debug_tensor_dump_inject = False
         
     | 
| 
       83 
77 
     | 
    
         
             
            debug_tensor_dump_layers = None
         
     | 
| 
       84 
78 
     | 
    
         
             
            debug_tensor_dump_test = False
         
     | 
| 
         @@ -176,17 +170,7 @@ class Grok1MoE(nn.Module): 
     | 
|
| 
       176 
170 
     | 
    
         
             
                        custom_routing_function=custom_routing_function,
         
     | 
| 
       177 
171 
     | 
    
         
             
                    )
         
     | 
| 
       178 
172 
     | 
    
         | 
| 
       179 
     | 
    
         
            -
                     
     | 
| 
       180 
     | 
    
         
            -
                    if get_moe_expert_parallel_world_size() > 1:
         
     | 
| 
       181 
     | 
    
         
            -
                        MoEImpl = EPMoE
         
     | 
| 
       182 
     | 
    
         
            -
                    else:
         
     | 
| 
       183 
     | 
    
         
            -
                        MoEImpl = FusedMoE
         
     | 
| 
       184 
     | 
    
         
            -
                        kwargs["reduce_results"] = reduce_results
         
     | 
| 
       185 
     | 
    
         
            -
                        kwargs["use_presharded_weights"] = use_presharded_weights
         
     | 
| 
       186 
     | 
    
         
            -
                        kwargs["inplace"] = inplace
         
     | 
| 
       187 
     | 
    
         
            -
                        kwargs["no_combine"] = no_combine
         
     | 
| 
       188 
     | 
    
         
            -
             
     | 
| 
       189 
     | 
    
         
            -
                    self.experts = MoEImpl(
         
     | 
| 
      
 173 
     | 
    
         
            +
                    self.experts = FusedMoE(
         
     | 
| 
       190 
174 
     | 
    
         
             
                        num_experts=num_experts,
         
     | 
| 
       191 
175 
     | 
    
         
             
                        top_k=top_k,
         
     | 
| 
       192 
176 
     | 
    
         
             
                        layer_id=layer_id,
         
     | 
| 
         @@ -195,7 +179,10 @@ class Grok1MoE(nn.Module): 
     | 
|
| 
       195 
179 
     | 
    
         
             
                        params_dtype=params_dtype,
         
     | 
| 
       196 
180 
     | 
    
         
             
                        quant_config=quant_config,
         
     | 
| 
       197 
181 
     | 
    
         
             
                        activation="gelu",
         
     | 
| 
       198 
     | 
    
         
            -
                         
     | 
| 
      
 182 
     | 
    
         
            +
                        reduce_results=reduce_results,
         
     | 
| 
      
 183 
     | 
    
         
            +
                        use_presharded_weights=use_presharded_weights,
         
     | 
| 
      
 184 
     | 
    
         
            +
                        inplace=inplace,
         
     | 
| 
      
 185 
     | 
    
         
            +
                        no_combine=no_combine,
         
     | 
| 
       199 
186 
     | 
    
         
             
                    )
         
     | 
| 
       200 
187 
     | 
    
         | 
| 
       201 
188 
     | 
    
         
             
                def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         
     | 
| 
         @@ -877,10 +864,10 @@ class Grok1ForCausalLM(nn.Module): 
     | 
|
| 
       877 
864 
     | 
    
         | 
| 
       878 
865 
     | 
    
         
             
                    # Dump tensors for debugging
         
     | 
| 
       879 
866 
     | 
    
         
             
                    global debug_tensor_dump_output_folder, debug_tensor_dump_inject
         
     | 
| 
       880 
     | 
    
         
            -
                    debug_tensor_dump_output_folder =  
     | 
| 
       881 
     | 
    
         
            -
                         
     | 
| 
       882 
     | 
    
         
            -
                     
     | 
| 
       883 
     | 
    
         
            -
                    debug_tensor_dump_inject =  
     | 
| 
      
 867 
     | 
    
         
            +
                    debug_tensor_dump_output_folder = (
         
     | 
| 
      
 868 
     | 
    
         
            +
                        get_global_server_args().debug_tensor_dump_output_folder
         
     | 
| 
      
 869 
     | 
    
         
            +
                    )
         
     | 
| 
      
 870 
     | 
    
         
            +
                    debug_tensor_dump_inject = get_global_server_args().debug_tensor_dump_inject
         
     | 
| 
       884 
871 
     | 
    
         
             
                    warnings.filterwarnings("ignore", category=FutureWarning)
         
     | 
| 
       885 
872 
     | 
    
         | 
| 
       886 
873 
     | 
    
         
             
                    if get_tensor_model_parallel_rank() == 0:
         
     | 
    
        sglang/srt/models/hunyuan.py
    CHANGED
    
    | 
         @@ -12,18 +12,14 @@ 
     | 
|
| 
       12 
12 
     | 
    
         
             
            # See the License for the specific language governing permissions and
         
     | 
| 
       13 
13 
     | 
    
         
             
            # limitations under the License.
         
     | 
| 
       14 
14 
     | 
    
         
             
            """Inference-only HunYuan model compatible with HuggingFace weights."""
         
     | 
| 
       15 
     | 
    
         
            -
            import logging
         
     | 
| 
       16 
15 
     | 
    
         
             
            import re
         
     | 
| 
       17 
     | 
    
         
            -
            from  
     | 
| 
       18 
     | 
    
         
            -
            from enum import Enum, auto
         
     | 
| 
       19 
     | 
    
         
            -
            from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
         
     | 
| 
      
 16 
     | 
    
         
            +
            from typing import Any, Dict, Iterable, Optional, Tuple
         
     | 
| 
       20 
17 
     | 
    
         | 
| 
       21 
18 
     | 
    
         
             
            import torch
         
     | 
| 
       22 
19 
     | 
    
         
             
            from torch import nn
         
     | 
| 
       23 
20 
     | 
    
         
             
            from transformers import PretrainedConfig
         
     | 
| 
       24 
21 
     | 
    
         | 
| 
       25 
22 
     | 
    
         
             
            from sglang.srt.distributed import (
         
     | 
| 
       26 
     | 
    
         
            -
                get_pp_group,
         
     | 
| 
       27 
23 
     | 
    
         
             
                get_tensor_model_parallel_rank,
         
     | 
| 
       28 
24 
     | 
    
         
             
                get_tensor_model_parallel_world_size,
         
     | 
| 
       29 
25 
     | 
    
         
             
                tensor_model_parallel_all_reduce,
         
     | 
| 
         @@ -46,7 +42,6 @@ from sglang.srt.layers.radix_attention import RadixAttention 
     | 
|
| 
       46 
42 
     | 
    
         
             
            from sglang.srt.layers.rotary_embedding import get_rope
         
     | 
| 
       47 
43 
     | 
    
         
             
            from sglang.srt.layers.sampler import Sampler
         
     | 
| 
       48 
44 
     | 
    
         
             
            from sglang.srt.layers.vocab_parallel_embedding import (
         
     | 
| 
       49 
     | 
    
         
            -
                DEFAULT_VOCAB_PADDING_SIZE,
         
     | 
| 
       50 
45 
     | 
    
         
             
                ParallelLMHead,
         
     | 
| 
       51 
46 
     | 
    
         
             
                VocabParallelEmbedding,
         
     | 
| 
       52 
47 
     | 
    
         
             
            )
         
     | 
| 
         @@ -56,7 +51,7 @@ from sglang.srt.model_loader.weight_utils import ( 
     | 
|
| 
       56 
51 
     | 
    
         
             
                kv_cache_scales_loader,
         
     | 
| 
       57 
52 
     | 
    
         
             
                maybe_remap_kv_scale_name,
         
     | 
| 
       58 
53 
     | 
    
         
             
            )
         
     | 
| 
       59 
     | 
    
         
            -
            from sglang.srt.utils import  
     | 
| 
      
 54 
     | 
    
         
            +
            from sglang.srt.utils import is_hip
         
     | 
| 
       60 
55 
     | 
    
         | 
| 
       61 
56 
     | 
    
         
             
            expert_distribution_recorder = ExpertDistributionRecorder()
         
     | 
| 
       62 
57 
     | 
    
         | 
    
        sglang/srt/models/interns1.py
    CHANGED
    
    | 
         @@ -5,7 +5,6 @@ from torch import nn 
     | 
|
| 
       5 
5 
     | 
    
         
             
            from transformers import PretrainedConfig
         
     | 
| 
       6 
6 
     | 
    
         | 
| 
       7 
7 
     | 
    
         
             
            from sglang.srt.layers.attention import vision_utils
         
     | 
| 
       8 
     | 
    
         
            -
            from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
         
     | 
| 
       9 
8 
     | 
    
         
             
            from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
         
     | 
| 
       10 
9 
     | 
    
         
             
            from sglang.srt.layers.quantization.base_config import QuantizationConfig
         
     | 
| 
       11 
10 
     | 
    
         
             
            from sglang.srt.managers.mm_utils import (
         
     | 
    
        sglang/srt/models/kimi_vl.py
    CHANGED
    
    | 
         @@ -43,10 +43,8 @@ 
     | 
|
| 
       43 
43 
     | 
    
         | 
| 
       44 
44 
     | 
    
         
             
            import copy
         
     | 
| 
       45 
45 
     | 
    
         
             
            import logging
         
     | 
| 
       46 
     | 
    
         
            -
            import math
         
     | 
| 
       47 
     | 
    
         
            -
            from collections.abc import Mapping
         
     | 
| 
       48 
46 
     | 
    
         
             
            from dataclasses import dataclass
         
     | 
| 
       49 
     | 
    
         
            -
            from typing import  
     | 
| 
      
 47 
     | 
    
         
            +
            from typing import Iterable, List, Optional, Tuple
         
     | 
| 
       50 
48 
     | 
    
         | 
| 
       51 
49 
     | 
    
         
             
            import torch
         
     | 
| 
       52 
50 
     | 
    
         
             
            from torch import nn
         
     | 
| 
         @@ -56,10 +54,6 @@ from sglang.srt.configs import KimiVLConfig 
     | 
|
| 
       56 
54 
     | 
    
         
             
            from sglang.srt.configs.deepseekvl2 import DeepseekV2Config
         
     | 
| 
       57 
55 
     | 
    
         
             
            from sglang.srt.configs.kimi_vl import KimiVLConfig
         
     | 
| 
       58 
56 
     | 
    
         
             
            from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
         
     | 
| 
       59 
     | 
    
         
            -
            from sglang.srt.distributed import (
         
     | 
| 
       60 
     | 
    
         
            -
                get_tensor_model_parallel_rank,
         
     | 
| 
       61 
     | 
    
         
            -
                get_tensor_model_parallel_world_size,
         
     | 
| 
       62 
     | 
    
         
            -
            )
         
     | 
| 
       63 
57 
     | 
    
         
             
            from sglang.srt.layers.activation import QuickGELU
         
     | 
| 
       64 
58 
     | 
    
         
             
            from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
         
     | 
| 
       65 
59 
     | 
    
         
             
            from sglang.srt.layers.quantization.base_config import QuantizationConfig
         
     | 
| 
         @@ -49,7 +49,7 @@ from typing import List, Optional, Sequence, Tuple, Union 
     | 
|
| 
       49 
49 
     | 
    
         
             
            import torch
         
     | 
| 
       50 
50 
     | 
    
         
             
            import torch.nn as nn
         
     | 
| 
       51 
51 
     | 
    
         
             
            import torch.nn.functional as F
         
     | 
| 
       52 
     | 
    
         
            -
            from transformers.activations import ACT2FN 
     | 
| 
      
 52 
     | 
    
         
            +
            from transformers.activations import ACT2FN
         
     | 
| 
       53 
53 
     | 
    
         
             
            from transformers.modeling_utils import PreTrainedModel
         
     | 
| 
       54 
54 
     | 
    
         | 
| 
       55 
55 
     | 
    
         
             
            try:
         
     | 
| 
         @@ -596,6 +596,8 @@ class MoonVitPretrainedModel(PreTrainedModel): 
     | 
|
| 
       596 
596 
     | 
    
         
             
                _supports_sdpa = True
         
     | 
| 
       597 
597 
     | 
    
         | 
| 
       598 
598 
     | 
    
         
             
                def __init__(self, config: MoonViTConfig, *inputs, **kwargs):
         
     | 
| 
      
 599 
     | 
    
         
            +
                    from transformers.activations import GELUTanh
         
     | 
| 
      
 600 
     | 
    
         
            +
             
     | 
| 
       599 
601 
     | 
    
         
             
                    super().__init__(config, *inputs, **kwargs)
         
     | 
| 
       600 
602 
     | 
    
         
             
                    config = deepcopy(config)
         
     | 
| 
       601 
603 
     | 
    
         
             
                    self.merge_kernel_size = config.merge_kernel_size
         
     | 
| 
         @@ -614,7 +616,7 @@ class MoonVitPretrainedModel(PreTrainedModel): 
     | 
|
| 
       614 
616 
     | 
    
         
             
                            "num_heads": config.num_attention_heads,
         
     | 
| 
       615 
617 
     | 
    
         
             
                            "hidden_dim": config.hidden_size,
         
     | 
| 
       616 
618 
     | 
    
         
             
                            "mlp_dim": config.intermediate_size,
         
     | 
| 
       617 
     | 
    
         
            -
                            "activation":  
     | 
| 
      
 619 
     | 
    
         
            +
                            "activation": GELUTanh(),
         
     | 
| 
       618 
620 
     | 
    
         
             
                            "attn_bias": True,
         
     | 
| 
       619 
621 
     | 
    
         
             
                            "attn_implementation": config._attn_implementation,
         
     | 
| 
       620 
622 
     | 
    
         
             
                        },
         
     | 
    
        sglang/srt/models/llama.py
    CHANGED
    
    | 
         @@ -45,13 +45,13 @@ from sglang.srt.layers.vocab_parallel_embedding import ( 
     | 
|
| 
       45 
45 
     | 
    
         
             
                ParallelLMHead,
         
     | 
| 
       46 
46 
     | 
    
         
             
                VocabParallelEmbedding,
         
     | 
| 
       47 
47 
     | 
    
         
             
            )
         
     | 
| 
       48 
     | 
    
         
            -
            from sglang.srt.managers.schedule_batch import global_server_args_dict
         
     | 
| 
       49 
48 
     | 
    
         
             
            from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
         
     | 
| 
       50 
49 
     | 
    
         
             
            from sglang.srt.model_loader.weight_utils import (
         
     | 
| 
       51 
50 
     | 
    
         
             
                default_weight_loader,
         
     | 
| 
       52 
51 
     | 
    
         
             
                kv_cache_scales_loader,
         
     | 
| 
       53 
52 
     | 
    
         
             
                maybe_remap_kv_scale_name,
         
     | 
| 
       54 
53 
     | 
    
         
             
            )
         
     | 
| 
      
 54 
     | 
    
         
            +
            from sglang.srt.server_args import get_global_server_args
         
     | 
| 
       55 
55 
     | 
    
         
             
            from sglang.srt.utils import add_prefix, make_layers
         
     | 
| 
       56 
56 
     | 
    
         
             
            from sglang.utils import get_exception_traceback
         
     | 
| 
       57 
57 
     | 
    
         | 
| 
         @@ -385,6 +385,10 @@ class LlamaModel(nn.Module): 
     | 
|
| 
       385 
385 
     | 
    
         
             
                                "Self attention has no KV cache scaling " "factor attribute!"
         
     | 
| 
       386 
386 
     | 
    
         
             
                            )
         
     | 
| 
       387 
387 
     | 
    
         | 
| 
      
 388 
     | 
    
         
            +
                def get_input_embeddings(self) -> nn.Embedding:
         
     | 
| 
      
 389 
     | 
    
         
            +
                    """Get input embeddings from the model."""
         
     | 
| 
      
 390 
     | 
    
         
            +
                    return self.embed_tokens
         
     | 
| 
      
 391 
     | 
    
         
            +
             
     | 
| 
       388 
392 
     | 
    
         | 
| 
       389 
393 
     | 
    
         
             
            class LlamaForCausalLM(nn.Module):
         
     | 
| 
       390 
394 
     | 
    
         
             
                # BitandBytes specific attributes
         
     | 
| 
         @@ -429,7 +433,7 @@ class LlamaForCausalLM(nn.Module): 
     | 
|
| 
       429 
433 
     | 
    
         
             
                            config.hidden_size,
         
     | 
| 
       430 
434 
     | 
    
         
             
                            quant_config=quant_config,
         
     | 
| 
       431 
435 
     | 
    
         
             
                            prefix=add_prefix("lm_head", prefix),
         
     | 
| 
       432 
     | 
    
         
            -
                            use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
     | 
| 
      
 436 
     | 
    
         
            +
                            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
         
     | 
| 
       433 
437 
     | 
    
         
             
                        )
         
     | 
| 
       434 
438 
     | 
    
         
             
                    self.logits_processor = LogitsProcessor(config)
         
     | 
| 
       435 
439 
     | 
    
         
             
                    self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
         
     | 
| 
         @@ -27,7 +27,7 @@ from transformers import LlamaConfig 
     | 
|
| 
       27 
27 
     | 
    
         | 
| 
       28 
28 
     | 
    
         
             
            from sglang.srt.distributed import get_pp_group
         
     | 
| 
       29 
29 
     | 
    
         
             
            from sglang.srt.layers.layernorm import RMSNorm
         
     | 
| 
       30 
     | 
    
         
            -
            from sglang.srt.layers.linear import QKVParallelLinear 
     | 
| 
      
 30 
     | 
    
         
            +
            from sglang.srt.layers.linear import QKVParallelLinear
         
     | 
| 
       31 
31 
     | 
    
         
             
            from sglang.srt.layers.logits_processor import LogitsProcessor
         
     | 
| 
       32 
32 
     | 
    
         
             
            from sglang.srt.layers.quantization.base_config import QuantizationConfig
         
     | 
| 
       33 
33 
     | 
    
         
             
            from sglang.srt.layers.vocab_parallel_embedding import (
         
     |