sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (419)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py

@@ -4,18 +4,40 @@ from __future__ import annotations
 
 import enum
 import logging
+import re
 from enum import Enum
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING
+
+try:
+    from sgl_kernel import fused_marlin_moe
+
+    FUSED_MARLIN_MOE_AVAILABLE = True
+except ImportError:
+    FUSED_MARLIN_MOE_AVAILABLE = False
+
+try:
+    from kt_kernel import AMXMoEWrapper
+
+    KTRANSFORMERS_AVAILABLE = True
+except ImportError:
+    KTRANSFORMERS_AVAILABLE = False
 
 import torch
 from compressed_tensors import CompressionFormat
 from compressed_tensors.quantization import QuantizationStrategy
 
+from sglang.srt.distributed import get_tensor_model_parallel_rank
+from sglang.srt.environ import envs
 from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
 from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
 from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
+from sglang.srt.layers.quantization.compressed_tensors.schemes import (
+    WNA16_SUPPORTED_BITS,
+)
 from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz, scaled_fp8_quant
 from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz
+from sglang.srt.layers.quantization.gptq import gptq_marlin_moe_repack
+from sglang.srt.layers.quantization.marlin_utils import marlin_moe_permute_scales
 from sglang.srt.layers.quantization.utils import (
     all_close_1d,
     per_tensor_dequantize,
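Note (not part of the diff): the two new try/except blocks above replace the old vllm probe. Optional kernels (sgl_kernel's fused_marlin_moe and kt_kernel's AMXMoEWrapper) are probed once at import time and recorded in module-level availability flags; failures are then raised only on the code path that actually needs the kernel. A minimal sketch of the pattern, where require_ktransformers() is a hypothetical helper standing in for the checks done in the new MoE method constructors:

try:
    from kt_kernel import AMXMoEWrapper  # optional CPU (AMX) MoE backend

    KTRANSFORMERS_AVAILABLE = True
except ImportError:
    KTRANSFORMERS_AVAILABLE = False


def require_ktransformers():
    # Called from the AMX MoE method's __init__ rather than at import time,
    # so installations without kt_kernel still import this module cleanly.
    if not KTRANSFORMERS_AVAILABLE:
        raise ImportError("kt_kernel is not installed, please install kt_kernel.")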
@@ -23,10 +45,9 @@ from sglang.srt.layers.quantization.utils import (
 )
 from sglang.srt.utils import (
     get_bool_env_var,
-    is_cpu,
+    get_compiler_backend,
     is_cuda,
     is_hip,
-    is_npu,
     set_weight_attrs,
 )
 
@@ -41,6 +62,8 @@ if TYPE_CHECKING:
     )
 
 _is_hip = is_hip()
+_is_cuda = is_cuda()
+
 _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
 if _use_aiter:
@@ -48,16 +71,25 @@ if _use_aiter:
 
     from sglang.srt.layers.moe.rocm_moe_utils import rocm_fused_experts_tkw1
 
-try:
-    import vllm
 
-    VLLM_AVAILABLE = True
-except ImportError:
-    VLLM_AVAILABLE = False
+if _is_cuda:
+    from sgl_kernel import fused_marlin_moe
 
 logger = logging.getLogger(__name__)
 
 
+def _mask_topk_ids_cpu_experts(topk_ids: torch.Tensor, num_gpu_experts: int):
+    """Mask topk_ids >= num_gpu_experts by setting them to -1."""
+    topk_ids[topk_ids >= num_gpu_experts] = -1
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend())
+def mask_cpu_expert_ids(topk_ids: torch.Tensor, num_gpu_experts: int):
+    """mask CPU expert IDs."""
+    _mask_topk_ids_cpu_experts(topk_ids, num_gpu_experts)
+    return topk_ids
+
+
 class GPTQMarlinState(Enum):
     REPACK = enum.auto()
     READY = enum.auto()
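Note (not part of the diff): the new mask_cpu_expert_ids helper, compiled with torch.compile, is how the hybrid path further down keeps CPU-resident experts out of the GPU Marlin kernel. A minimal illustration of the masking semantics on a toy routing tensor (the values are made up):

import torch

topk_ids = torch.tensor([[0, 5, 12], [3, 9, 14]])
num_gpu_experts = 10  # experts 10+ live on the CPU in this example

# Same in-place update the helper performs; -1 slots are skipped by the GPU kernel.
topk_ids[topk_ids >= num_gpu_experts] = -1
print(topk_ids)  # tensor([[ 0,  5, -1], [ 3,  9, -1]])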
@@ -67,6 +99,7 @@ __all__ = [
     "CompressedTensorsMoEMethod",
     "CompressedTensorsW8A8Fp8MoEMethod",
     "CompressedTensorsWNA16MoEMethod",
+    "CompressedTensorsWNA16AMXEPMoEMethod",  # for Ktransformers
 ]
 
 
@@ -79,17 +112,27 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
     @staticmethod
     def get_moe_method(
         quant_config: CompressedTensorsConfig,
+        layer: torch.nn.Module,
+        prefix: str,
     ) -> "CompressedTensorsMoEMethod":
         # TODO: @dsikka: refactor this to use schemes as other kernels
         # are supported + check if the layer is being ignored.
+
+        if envs.SGLANG_KT_MOE_AMX_WEIGHT_PATH.is_set():
+            match = re.search(r"(\d+)\.mlp", prefix)
+            if not match:
+                raise ValueError(
+                    f"Unable to extract layer number from prefix '{prefix}'. "
+                    f"Expected format: '<layer_number>.mlp'"
+                )
+            layer_number = int(match.group(1))
+            return CompressedTensorsWNA16AMXEPMoEMethod(quant_config, layer_number)
+
         weight_quant = quant_config.target_scheme_map["Linear"].get("weights")
         input_quant = quant_config.target_scheme_map["Linear"].get("input_activations")
-
         if quant_config._is_wNa16_group_channel(weight_quant, input_quant):
-            if not VLLM_AVAILABLE:
-                raise ImportError(
-                    "vllm is not installed, to use CompressedTensorsWNA16MoEMethod, please install vllm."
-                )
+
+            logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod")
             return CompressedTensorsWNA16MoEMethod(quant_config)
         elif quant_config._is_fp8_w8a8(weight_quant, input_quant):
             return CompressedTensorsW8A8Fp8MoEMethod(quant_config)
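Note (not part of the diff): get_moe_method() now routes to the KTransformers AMX+EP method whenever SGLANG_KT_MOE_AMX_WEIGHT_PATH is set, deriving the layer index from the module prefix. A small illustration of that regex; the prefix string below is hypothetical:

import re

prefix = "model.layers.3.mlp.experts"
match = re.search(r"(\d+)\.mlp", prefix)
layer_number = int(match.group(1))  # 3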
@@ -208,7 +251,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         layer.w13_input_scale = None
         layer.w2_input_scale = None
 
-    def process_weights_after_loading(self, layer: FusedMoE) -> None:
+    def process_weights_after_loading(self, layer: torch.nn.Module | FusedMoE) -> None:
         # Fp8 moe kernels require a single activation scale.
         # We take the max of all the scales in case they differ.
         if self.static_input_scales:
@@ -356,7 +399,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
 
 
 class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
-    def __init__(self, quant_config: CompressedTensorsConfig):
+    def __init__(self, quant_config: CompressedTensorsConfig, num_gpu_experts=-1):
         self.quant_config = quant_config
         # TODO: @dsikka: refactor this to use schemes as other kernels
         # are supported + check if the layer is being ignored.
@@ -378,6 +421,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
                 "is supported for the following bits: ",
                 f"{WNA16_SUPPORTED_BITS}",
             )
+        self.num_gpu_experts = num_gpu_experts
 
     def create_weights(
         self,
@@ -388,10 +432,8 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
         params_dtype: torch.dtype,
         **extra_weight_attrs,
     ):
-
-        assert (
-            params_dtype == torch.float16
-        ), "float16 is required for MoE compressed models. Set dtype=torch.float16"  # noqa: E501
+        if self.num_gpu_experts != -1:
+            num_experts = self.num_gpu_experts
 
         # Will transpose the loaded weight along the
         # intermediate and hidden dim sizes. Will
@@ -530,44 +572,6 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             getattr(layer, name).copy_(new_t)
             del new_t
 
-        def get_scale_perms(num_bits: int):
-            scale_perm: List[int] = []
-            for i in range(8):
-                scale_perm.extend([i + 8 * j for j in range(8)])
-            scale_perm_single: List[int] = []
-            for i in range(4):
-                scale_perm_single.extend(
-                    [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]
-                )
-            return scale_perm, scale_perm_single
-
-        def marlin_permute_scales(
-            s: torch.Tensor, size_k: int, size_n: int, group_size: int, num_bits: int
-        ):
-            scale_perm, scale_perm_single = get_scale_perms(num_bits)
-            if group_size < size_k and group_size != -1:
-                s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
-            else:
-                s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
-            s = s.reshape((-1, size_n)).contiguous()
-            return s
-
-        def marlin_moe_permute_scales(
-            s: torch.Tensor, size_k: int, size_n: int, group_size: int, num_bits: int
-        ):
-            num_experts = s.shape[0]
-            output = torch.empty(
-                (num_experts, s.shape[1], s.shape[2]), device=s.device, dtype=s.dtype
-            )
-            for e in range(num_experts):
-                output[e] = marlin_permute_scales(
-                    s[e], size_k, size_n, group_size, num_bits
-                )
-            return output
-
-        size_k2 = layer.w2_weight_packed.shape[2]
-        size_k13 = layer.w13_weight_packed.shape[2]
-
         num_experts = layer.w13_weight_g_idx.shape[0]
         device = layer.w13_weight_g_idx.device
 
@@ -614,42 +618,39 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             requires_grad=False,
         )
 
-        from vllm import _custom_ops as vllm_ops
-
-        marlin_w13_qweight = vllm_ops.gptq_marlin_moe_repack(
+        marlin_w13_qweight = gptq_marlin_moe_repack(
             layer.w13_weight_packed,
             layer.w13_g_idx_sort_indices,
             layer.w13_weight_packed.shape[1] * self.packed_factor,
             layer.w13_weight_packed.shape[2],
             self.num_bits,
         )
-        replace_tensor("w13_weight_packed", marlin_w13_qweight)
-        marlin_w2_qweight = vllm_ops.gptq_marlin_moe_repack(
+        replace_parameter(layer, "w13_weight_packed", marlin_w13_qweight)
+        marlin_w2_qweight = gptq_marlin_moe_repack(
             layer.w2_weight_packed,
             layer.w2_g_idx_sort_indices,
             layer.w2_weight_packed.shape[1] * self.packed_factor,
             layer.w2_weight_packed.shape[2],
             self.num_bits,
         )
-        replace_tensor("w2_weight_packed", marlin_w2_qweight)
+        replace_parameter(layer, "w2_weight_packed", marlin_w2_qweight)
         # Repack scales
         marlin_w13_scales = marlin_moe_permute_scales(
             layer.w13_weight_scale,
-            size_k13,
+            layer.w13_weight_packed.shape[2],
             layer.w13_weight_scale.shape[2],
             self.group_size,
-            self.num_bits,
         )
-        replace_tensor("w13_weight_scale", marlin_w13_scales)
+        replace_parameter(layer, "w13_weight_scale", marlin_w13_scales)
+
         marlin_w2_scales = marlin_moe_permute_scales(
             layer.w2_weight_scale,
             layer.w2_weight_scale.shape[1]
             * (self.group_size if self.group_size != -1 else self.packed_factor),
-            size_k2,
+            layer.w2_weight_scale.shape[2],
             self.group_size,
-            self.num_bits,
         )
-        replace_tensor("w2_weight_scale", marlin_w2_scales)
+        replace_parameter(layer, "w2_weight_scale", marlin_w2_scales)
 
     def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
@@ -673,7 +674,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
 
         topk_weights, topk_ids, router_logits = topk_output
 
-        output = torch.ops.vllm.fused_marlin_moe(
+        output = fused_marlin_moe(
             x,
             layer.w13_weight_packed,
             layer.w2_weight_packed,
@@ -690,3 +691,353 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             is_k_full=self.is_k_full,
         )
         return StandardCombineInput(hidden_states=output)
+
+
+class CompressedTensorsWNA16AMXMoEMethod(CompressedTensorsMoEMethod):
+    """AMX MoE method using AMXMoEWrapper for CPU inference."""
+
+    def __init__(
+        self,
+        quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
+        layer_idx,
+        num_gpu_experts,
+        cpuinfer,
+        threadpool_count,
+        amx_weight_path,
+        chunked_prefill_size,
+    ):
+
+        if not KTRANSFORMERS_AVAILABLE:
+            raise ImportError(
+                "kt_kernel is not installed, to use CompressedTensorsWNA16AMXEPMoEMethod, please install kt_kernel."
+            )
+
+        if not FUSED_MARLIN_MOE_AVAILABLE:
+            raise ImportError("fused_marlin_moe is not available")
+
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.layer_idx = layer_idx
+        self.num_gpu_experts = num_gpu_experts
+        self.amx_weight_path = amx_weight_path
+        self.chunked_prefill_size = chunked_prefill_size
+        self.cpuinfer = cpuinfer
+        self.threadpool_count = threadpool_count
+        self.amx_wrapper = None
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        self.experts_num = num_experts
+        self.num_experts_per_tok = extra_weight_attrs.pop("top_k")
+        self.hidden_size = hidden_size
+        self.moe_intermediate_size = extra_weight_attrs.pop("intermediate_size_full")
+
+        if self.tp_rank != 0:
+            return
+        self.amx_wrapper = AMXMoEWrapper(
+            layer_idx=self.layer_idx,
+            num_experts=num_experts,
+            num_experts_per_tok=self.num_experts_per_tok,
+            hidden_size=hidden_size,
+            moe_intermediate_size=self.moe_intermediate_size,
+            num_gpu_experts=self.num_gpu_experts,
+            cpuinfer_threads=self.cpuinfer,
+            threadpool_count=self.threadpool_count,
+            amx_weight_path=self.amx_weight_path,
+            chunked_prefill_size=self.chunked_prefill_size,
+        )
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if self.tp_rank != 0:
+            return
+
+        if self.amx_wrapper is None:
+            raise RuntimeError(
+                "AMXMoEWrapper not initialized. Call create_weights first."
+            )
+
+        torch.cuda.synchronize()
+        # Load weights using wrapper
+        from sglang.srt.eplb.expert_location_dispatch import (
+            get_global_expert_location_metadata,
+        )
+
+        physical_to_logical_map_cpu = (
+            get_global_expert_location_metadata()
+            .physical_to_logical_map_cpu[self.layer_idx]
+            .contiguous()
+        )
+        self.amx_wrapper.load_weights(physical_to_logical_map_cpu)
+
+    def submit(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> None:
+        """Submit AMX inference task asynchronously."""
+        assert (
+            self.moe_runner_config.activation == "silu"
+        ), "Only SiLU activation is supported."
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+        topk_weights, topk_ids, _ = topk_output
+
+        if self.tp_rank != 0 or self.amx_wrapper is None:
+            return None
+
+        # Submit forward task using wrapper
+        self.amx_wrapper.submit_forward(
+            x, topk_ids, topk_weights, torch.cuda.current_stream(x.device).cuda_stream
+        )
+        return None
+
+    def sync(self, x):
+        """Synchronize and retrieve AMX inference results."""
+        if self.tp_rank != 0 or self.amx_wrapper is None:
+            return torch.zeros_like(x)
+
+        # Sync forward task using wrapper
+        return self.amx_wrapper.sync_forward(
+            x, torch.cuda.current_stream(x.device).cuda_stream
+        )
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        """Execute AMX MoE forward pass synchronously."""
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        assert (
+            self.moe_runner_config.activation == "silu"
+        ), "Only SiLU activation is supported."
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+        topk_weights, topk_ids, _ = topk_output
+
+        if self.tp_rank != 0 or self.amx_wrapper is None:
+            return StandardCombineInput(hidden_states=torch.zeros_like(x))
+
+        # Execute forward using wrapper (submit + sync)
+        output = self.amx_wrapper.forward(
+            x, topk_ids, topk_weights, torch.cuda.current_stream(x.device).cuda_stream
+        )
+        return StandardCombineInput(hidden_states=output)
+
+
+def override_config(
+    cls,
+    num_gpu_experts,
+    cpuinfer,
+    threadpool_count,
+    amx_weight_path,
+    amx_method,
+    chunked_prefill_size,
+):
+    """Override MOE configuration via environment variables."""
+    # Set environment variables using envs utility class
+    if num_gpu_experts is not None:
+        envs.SGLANG_KT_MOE_NUM_GPU_EXPERTS.set(num_gpu_experts)
+    if cpuinfer is not None:
+        envs.SGLANG_KT_MOE_CPUINFER.set(cpuinfer)
+    if threadpool_count is not None:
+        envs.SGLANG_KT_THREADPOOL_COUNT.set(threadpool_count)
+    if amx_weight_path is not None:
+        envs.SGLANG_KT_MOE_AMX_WEIGHT_PATH.set(amx_weight_path)
+    if amx_method is not None:
+        envs.SGLANG_KT_AMX_METHOD.set(amx_method)
+    if chunked_prefill_size is not None:
+        envs.SGLANG_KT_MOE_CHUNKED_PREFILL_SIZE.set(chunked_prefill_size)
+
+
+class CompressedTensorsWNA16AMXEPMoEMethod(CompressedTensorsMoEMethod):
+
+    def __init__(
+        self,
+        quant_config: "CompressedTensorsConfig",  # type: ignore # noqa E501
+        layer_idx,
+    ):
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        if (
+            not envs.SGLANG_KT_MOE_NUM_GPU_EXPERTS.is_set()
+            or not envs.SGLANG_KT_MOE_CPUINFER.is_set()
+            or not envs.SGLANG_KT_MOE_AMX_WEIGHT_PATH.is_set()
+        ):
+            raise RuntimeError(
+                "the following arguments are required: --kt-amx-weight-path, --kt-cpuinfer, --kt-num-gpu-experts"
+            )
+        self.num_gpu_experts = envs.SGLANG_KT_MOE_NUM_GPU_EXPERTS.value
+        cpuinfer = envs.SGLANG_KT_MOE_CPUINFER.value
+        threadpool_count = envs.SGLANG_KT_THREADPOOL_COUNT.value
+        amx_weight_path = envs.SGLANG_KT_MOE_AMX_WEIGHT_PATH.value
+        chunked_prefill_size = envs.SGLANG_KT_MOE_CHUNKED_PREFILL_SIZE.value
+
+        self.AMX_method = CompressedTensorsWNA16AMXMoEMethod(
+            quant_config,
+            layer_idx,
+            self.num_gpu_experts,
+            cpuinfer,
+            threadpool_count,
+            amx_weight_path,
+            chunked_prefill_size,
+        )
+        self.marlin_method = CompressedTensorsWNA16MoEMethod(
+            quant_config, self.num_gpu_experts
+        )
+        self.layer_id = layer_idx
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        self.global_num_experts = num_experts
+        self.AMX_method.create_weights(
+            layer,
+            num_experts,
+            hidden_size,
+            intermediate_size_per_partition,
+            params_dtype,
+            **extra_weight_attrs,
+        )
+        self.marlin_method.create_weights(
+            layer,
+            num_experts,
+            hidden_size,
+            intermediate_size_per_partition,
+            params_dtype,
+            **extra_weight_attrs,
+        )
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        self.AMX_method.process_weights_after_loading(layer)
+        self.marlin_method.process_weights_after_loading(layer)
+
+    def submit(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        """Submit hybrid GPU+CPU MoE task (AMX submission + GPU execution)."""
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        assert (
+            self.moe_runner_config.activation == "silu"
+        ), "Only SiLU activation is supported."
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        topk_weights, topk_ids, router_logits = topk_output
+
+        # Submit AMX task if on rank 0
+        if self.tp_rank == 0:
+            self.AMX_method.submit(layer, dispatch_output)
+
+        # Mask CPU expert IDs (>= num_gpu_experts) as -1 so they won't be computed on GPU
+        topk_ids = mask_cpu_expert_ids(topk_ids, self.num_gpu_experts)
+
+        # Execute GPU (Marlin) experts
+        output = fused_marlin_moe(
+            x,
+            layer.w13_weight_packed,
+            layer.w2_weight_packed,
+            layer.w13_weight_scale,
+            layer.w2_weight_scale,
+            router_logits,
+            topk_weights,
+            topk_ids,
+            g_idx1=layer.w13_weight_g_idx,
+            g_idx2=layer.w2_weight_g_idx,
+            sort_indices1=layer.w13_g_idx_sort_indices,
+            sort_indices2=layer.w2_g_idx_sort_indices,
+            num_bits=self.marlin_method.num_bits,
+            is_k_full=self.marlin_method.is_k_full,
+            global_num_experts=self.global_num_experts,
+            expert_map=torch.empty(1, device=x.device),
+        )
+        return StandardCombineInput(hidden_states=output)
+
+    def sync(self, x):
+        """Synchronize and retrieve AMX results."""
+        if self.tp_rank != 0:
+            return torch.zeros_like(x)
+        return self.AMX_method.sync(x)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        """Execute hybrid GPU+CPU MoE forward pass with parallelism."""
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        assert (
+            self.moe_runner_config.activation == "silu"
+        ), "Only SiLU activation is supported."
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+        topk_weights, topk_ids, router_logits = topk_output
+
+        # Step 1: Submit AMX task (non-blocking) if on rank 0
+        # This starts CPU computation in parallel
+        if self.tp_rank == 0:
+            self.AMX_method.submit(layer, dispatch_output)
+
+        # Step 2: Execute GPU (Marlin) experts in parallel with CPU
+
+        # Mask CPU expert IDs (>= num_gpu_experts) as -1 so they won't be computed on GPU
+        topk_ids = mask_cpu_expert_ids(topk_ids, self.num_gpu_experts)
+
+        # While GPU computes, CPU is also computing
+        output = fused_marlin_moe(
+            x,
+            layer.w13_weight_packed,
+            layer.w2_weight_packed,
+            layer.w13_weight_scale,
+            layer.w2_weight_scale,
+            router_logits,
+            topk_weights,
+            topk_ids,
+            g_idx1=layer.w13_weight_g_idx,
+            g_idx2=layer.w2_weight_g_idx,
+            sort_indices1=layer.w13_g_idx_sort_indices,
+            sort_indices2=layer.w2_g_idx_sort_indices,
+            num_bits=self.marlin_method.num_bits,
+            is_k_full=self.marlin_method.is_k_full,
+            global_num_experts=self.global_num_experts,
+            expert_map=torch.empty(1, device=x.device),
+        )
+
+        # Step 3: Sync AMX results and combine with GPU results
+        if self.tp_rank == 0:
+            amx_output = self.AMX_method.sync(x)
+            output += amx_output
+
+        return StandardCombineInput(hidden_states=output)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        self.AMX_method.create_moe_runner(layer, moe_runner_config)
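Note (not part of the diff): CompressedTensorsWNA16AMXEPMoEMethod.apply() above overlaps CPU (AMX) and GPU (Marlin) expert computation: submit the CPU-side experts, run the GPU-side experts while they execute, then sync and sum the two partial outputs. A self-contained sketch of that submit/compute/sync shape, where cpu_expert_forward and gpu_expert_forward are hypothetical callables standing in for AMXMoEWrapper and fused_marlin_moe, and all inputs are torch tensors:

from concurrent.futures import ThreadPoolExecutor


def hybrid_moe_forward(x, topk_ids, topk_weights, num_gpu_experts,
                       cpu_expert_forward, gpu_expert_forward):
    with ThreadPoolExecutor(max_workers=1) as pool:
        # Step 1: kick off CPU-side experts asynchronously ("submit").
        cpu_future = pool.submit(cpu_expert_forward, x, topk_ids, topk_weights)

        # Step 2: mask CPU expert IDs so the GPU kernel skips them,
        # then run the GPU-side experts while the CPU works.
        gpu_ids = topk_ids.clone()
        gpu_ids[gpu_ids >= num_gpu_experts] = -1
        output = gpu_expert_forward(x, gpu_ids, topk_weights)

        # Step 3: wait for the CPU result ("sync") and combine.
        output += cpu_future.result()
    return output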
sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py

@@ -4,10 +4,13 @@ from .compressed_tensors_scheme import CompressedTensorsScheme
 from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8
 from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8
 from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8
+from .compressed_tensors_wNa16 import WNA16_SUPPORTED_BITS, CompressedTensorsWNA16
 
 __all__ = [
     "CompressedTensorsScheme",
     "CompressedTensorsW8A8Fp8",
     "CompressedTensorsW8A16Fp8",
     "CompressedTensorsW8A8Int8",
+    "CompressedTensorsWNA16",
+    "WNA16_SUPPORTED_BITS",
 ]
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py

@@ -14,25 +14,12 @@ from sglang.srt.layers.parameter import (
 from sglang.srt.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme,
 )
+from sglang.srt.layers.quantization.marlin_utils_fp8 import (
+    apply_fp8_marlin_linear,
+    prepare_fp8_layer_for_marlin,
+)
 from sglang.srt.layers.quantization.utils import convert_to_channelwise
 
-try:
-    from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
-        apply_fp8_marlin_linear,
-        prepare_fp8_layer_for_marlin,
-    )
-
-    MARLIN_FP8_AVAILABLE = True
-except ImportError:
-    MARLIN_FP8_AVAILABLE = False
-
-    def apply_fp8_marlin_linear(*args, **kwargs):
-        raise ImportError("vllm is not installed")
-
-    def prepare_fp8_layer_for_marlin(*args, **kwargs):
-        raise ImportError("vllm is not installed")
-
-
 __all__ = ["CompressedTensorsW8A16Fp8"]
 
 SUPPORTED_STRATEGIES = [QuantizationStrategy.CHANNEL, QuantizationStrategy.TENSOR]
@@ -43,11 +30,6 @@ class CompressedTensorsW8A16Fp8(CompressedTensorsScheme):
         self.strategy = strategy
         self.is_static_input_scheme = is_static_input_scheme
 
-        if not MARLIN_FP8_AVAILABLE:
-            raise ImportError(
-                "vllm is not installed. To use CompressedTensorsW8A16Fp8, please install vllm"
-            )
-
     @classmethod
     def get_min_capability(cls) -> int:
         # ampere and up