sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +54 -37
- sglang/bench_one_batch_server.py +340 -34
- sglang/bench_serving.py +340 -159
- sglang/check_env.py +1 -1
- sglang/compile_deep_gemm.py +6 -2
- sglang/global_config.py +1 -25
- sglang/lang/api.py +6 -0
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +13 -0
- sglang/launch_server.py +9 -2
- sglang/profiler.py +20 -3
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
- sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
- sglang/srt/compilation/backend.py +437 -0
- sglang/srt/compilation/compilation_config.py +20 -0
- sglang/srt/compilation/compilation_counter.py +47 -0
- sglang/srt/compilation/compile.py +210 -0
- sglang/srt/compilation/compiler_interface.py +503 -0
- sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
- sglang/srt/compilation/fix_functionalization.py +134 -0
- sglang/srt/compilation/fx_utils.py +83 -0
- sglang/srt/compilation/inductor_pass.py +140 -0
- sglang/srt/compilation/pass_manager.py +66 -0
- sglang/srt/compilation/piecewise_context_manager.py +40 -0
- sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/deepseek_ocr.py +262 -0
- sglang/srt/configs/deepseekvl2.py +194 -96
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +2 -7
- sglang/srt/configs/falcon_h1.py +309 -0
- sglang/srt/configs/load_config.py +33 -2
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +284 -118
- sglang/srt/configs/modelopt_config.py +30 -0
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/olmo3.py +105 -0
- sglang/srt/configs/points_v15_chat.py +29 -0
- sglang/srt/configs/qwen3_next.py +11 -47
- sglang/srt/configs/qwen3_omni.py +613 -0
- sglang/srt/configs/qwen3_vl.py +576 -0
- sglang/srt/connector/remote_instance.py +1 -1
- sglang/srt/constrained/base_grammar_backend.py +6 -1
- sglang/srt/constrained/llguidance_backend.py +5 -0
- sglang/srt/constrained/outlines_backend.py +1 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
- sglang/srt/constrained/utils.py +12 -0
- sglang/srt/constrained/xgrammar_backend.py +26 -15
- sglang/srt/debug_utils/dumper.py +10 -3
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
- sglang/srt/disaggregation/base/conn.py +17 -4
- sglang/srt/disaggregation/common/conn.py +268 -98
- sglang/srt/disaggregation/decode.py +172 -39
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
- sglang/srt/disaggregation/fake/conn.py +11 -3
- sglang/srt/disaggregation/mooncake/conn.py +203 -555
- sglang/srt/disaggregation/nixl/conn.py +217 -63
- sglang/srt/disaggregation/prefill.py +113 -270
- sglang/srt/disaggregation/utils.py +36 -5
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
- sglang/srt/distributed/device_communicators/pynccl.py +24 -12
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/naive_distributed.py +5 -4
- sglang/srt/distributed/parallel_state.py +203 -97
- sglang/srt/elastic_ep/elastic_ep.py +74 -0
- sglang/srt/entrypoints/context.py +3 -2
- sglang/srt/entrypoints/engine.py +85 -65
- sglang/srt/entrypoints/grpc_server.py +632 -305
- sglang/srt/entrypoints/harmony_utils.py +2 -2
- sglang/srt/entrypoints/http_server.py +169 -17
- sglang/srt/entrypoints/http_server_engine.py +1 -7
- sglang/srt/entrypoints/openai/protocol.py +327 -34
- sglang/srt/entrypoints/openai/serving_base.py +74 -8
- sglang/srt/entrypoints/openai/serving_chat.py +202 -118
- sglang/srt/entrypoints/openai/serving_classify.py +204 -0
- sglang/srt/entrypoints/openai/serving_completions.py +20 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/entrypoints/openai/serving_responses.py +47 -2
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +323 -0
- sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
- sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
- sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
- sglang/srt/eplb/expert_distribution.py +3 -4
- sglang/srt/eplb/expert_location.py +30 -5
- sglang/srt/eplb/expert_location_dispatch.py +2 -2
- sglang/srt/eplb/expert_location_updater.py +2 -2
- sglang/srt/function_call/base_format_detector.py +17 -18
- sglang/srt/function_call/function_call_parser.py +21 -16
- sglang/srt/function_call/glm4_moe_detector.py +4 -8
- sglang/srt/function_call/gpt_oss_detector.py +24 -1
- sglang/srt/function_call/json_array_parser.py +61 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/utils.py +98 -7
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/grpc_request_manager.py +915 -0
- sglang/srt/grpc/health_servicer.py +189 -0
- sglang/srt/grpc/scheduler_launcher.py +181 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
- sglang/srt/layers/activation.py +11 -7
- sglang/srt/layers/attention/aiter_backend.py +17 -18
- sglang/srt/layers/attention/ascend_backend.py +125 -10
- sglang/srt/layers/attention/attention_registry.py +226 -0
- sglang/srt/layers/attention/base_attn_backend.py +32 -4
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +0 -1
- sglang/srt/layers/attention/fla/chunk_o.py +1 -1
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
- sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
- sglang/srt/layers/attention/fla/index.py +0 -2
- sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
- sglang/srt/layers/attention/fla/utils.py +0 -3
- sglang/srt/layers/attention/fla/wy_fast.py +0 -2
- sglang/srt/layers/attention/flashattention_backend.py +52 -15
- sglang/srt/layers/attention/flashinfer_backend.py +357 -212
- sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
- sglang/srt/layers/attention/flashmla_backend.py +9 -7
- sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
- sglang/srt/layers/attention/intel_amx_backend.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
- sglang/srt/layers/attention/mamba/mamba.py +514 -1
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
- sglang/srt/layers/attention/nsa/utils.py +23 -0
- sglang/srt/layers/attention/nsa_backend.py +1201 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/triton_backend.py +249 -42
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
- sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
- sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
- sglang/srt/layers/attention/utils.py +11 -7
- sglang/srt/layers/attention/vision.py +61 -3
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/xpu_backend.py +1028 -0
- sglang/srt/layers/communicator.py +19 -7
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
- sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
- sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
- sglang/srt/layers/dp_attention.py +28 -1
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +47 -15
- sglang/srt/layers/linear.py +30 -5
- sglang/srt/layers/logits_processor.py +161 -18
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_moe.py +0 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
- sglang/srt/layers/moe/ep_moe/layer.py +243 -448
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
- sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/moe_runner/triton.py +3 -1
- sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
- sglang/srt/layers/moe/router.py +51 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
- sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
- sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
- sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
- sglang/srt/layers/moe/topk.py +3 -2
- sglang/srt/layers/moe/utils.py +27 -1
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/__init__.py +2 -53
- sglang/srt/layers/quantization/awq.py +183 -6
- sglang/srt/layers/quantization/awq_triton.py +29 -0
- sglang/srt/layers/quantization/base_config.py +20 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
- sglang/srt/layers/quantization/fp8.py +86 -20
- sglang/srt/layers/quantization/fp8_kernel.py +55 -10
- sglang/srt/layers/quantization/fp8_utils.py +43 -15
- sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
- sglang/srt/layers/quantization/gptq.py +0 -1
- sglang/srt/layers/quantization/int8_kernel.py +18 -2
- sglang/srt/layers/quantization/marlin_utils.py +12 -0
- sglang/srt/layers/quantization/modelopt_quant.py +141 -81
- sglang/srt/layers/quantization/mxfp4.py +17 -34
- sglang/srt/layers/quantization/petit.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
- sglang/srt/layers/quantization/unquant.py +1 -4
- sglang/srt/layers/quantization/utils.py +0 -1
- sglang/srt/layers/quantization/w4afp8.py +51 -24
- sglang/srt/layers/quantization/w8a8_int8.py +45 -27
- sglang/srt/layers/radix_attention.py +59 -9
- sglang/srt/layers/rotary_embedding.py +750 -46
- sglang/srt/layers/sampler.py +84 -16
- sglang/srt/layers/sparse_pooler.py +98 -0
- sglang/srt/layers/utils.py +23 -1
- sglang/srt/layers/vocab_parallel_embedding.py +4 -1
- sglang/srt/lora/backend/base_backend.py +3 -3
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +9 -4
- sglang/srt/lora/eviction_policy.py +139 -0
- sglang/srt/lora/lora.py +7 -5
- sglang/srt/lora/lora_manager.py +33 -7
- sglang/srt/lora/lora_registry.py +1 -1
- sglang/srt/lora/mem_pool.py +41 -17
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
- sglang/srt/lora/utils.py +7 -5
- sglang/srt/managers/cache_controller.py +83 -152
- sglang/srt/managers/data_parallel_controller.py +156 -87
- sglang/srt/managers/detokenizer_manager.py +51 -24
- sglang/srt/managers/io_struct.py +223 -129
- sglang/srt/managers/mm_utils.py +49 -10
- sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +130 -0
- sglang/srt/managers/schedule_batch.py +340 -529
- sglang/srt/managers/schedule_policy.py +158 -18
- sglang/srt/managers/scheduler.py +665 -620
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
- sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
- sglang/srt/managers/scheduler_pp_mixin.py +341 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
- sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
- sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
- sglang/srt/managers/tokenizer_manager.py +462 -226
- sglang/srt/managers/tp_worker.py +217 -156
- sglang/srt/managers/utils.py +79 -47
- sglang/srt/mem_cache/allocator.py +21 -22
- sglang/srt/mem_cache/allocator_ascend.py +42 -28
- sglang/srt/mem_cache/base_prefix_cache.py +3 -3
- sglang/srt/mem_cache/chunk_cache.py +20 -2
- sglang/srt/mem_cache/common.py +480 -0
- sglang/srt/mem_cache/evict_policy.py +38 -0
- sglang/srt/mem_cache/hicache_storage.py +44 -2
- sglang/srt/mem_cache/hiradix_cache.py +134 -34
- sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
- sglang/srt/mem_cache/memory_pool.py +602 -208
- sglang/srt/mem_cache/memory_pool_host.py +134 -183
- sglang/srt/mem_cache/multimodal_cache.py +0 -1
- sglang/srt/mem_cache/radix_cache.py +263 -78
- sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
- sglang/srt/mem_cache/swa_radix_cache.py +115 -58
- sglang/srt/metrics/collector.py +113 -120
- sglang/srt/metrics/func_timer.py +3 -8
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +81 -36
- sglang/srt/model_executor/forward_batch_info.py +40 -50
- sglang/srt/model_executor/model_runner.py +507 -319
- sglang/srt/model_executor/npu_graph_runner.py +11 -5
- sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +438 -37
- sglang/srt/model_loader/utils.py +0 -1
- sglang/srt/model_loader/weight_utils.py +200 -27
- sglang/srt/models/apertus.py +2 -3
- sglang/srt/models/arcee.py +2 -2
- sglang/srt/models/bailing_moe.py +40 -56
- sglang/srt/models/bailing_moe_nextn.py +3 -4
- sglang/srt/models/bert.py +1 -1
- sglang/srt/models/deepseek_nextn.py +25 -4
- sglang/srt/models/deepseek_ocr.py +1516 -0
- sglang/srt/models/deepseek_v2.py +793 -235
- sglang/srt/models/dots_ocr.py +171 -0
- sglang/srt/models/dots_vlm.py +0 -1
- sglang/srt/models/dots_vlm_vit.py +1 -1
- sglang/srt/models/falcon_h1.py +570 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +17 -1
- sglang/srt/models/gemma3n_mm.py +2 -3
- sglang/srt/models/glm4_moe.py +17 -40
- sglang/srt/models/glm4_moe_nextn.py +4 -4
- sglang/srt/models/glm4v.py +3 -2
- sglang/srt/models/glm4v_moe.py +6 -6
- sglang/srt/models/gpt_oss.py +12 -35
- sglang/srt/models/grok.py +10 -23
- sglang/srt/models/hunyuan.py +2 -7
- sglang/srt/models/interns1.py +0 -1
- sglang/srt/models/kimi_vl.py +1 -7
- sglang/srt/models/kimi_vl_moonvit.py +4 -2
- sglang/srt/models/llama.py +6 -2
- sglang/srt/models/llama_eagle3.py +1 -1
- sglang/srt/models/longcat_flash.py +6 -23
- sglang/srt/models/longcat_flash_nextn.py +4 -15
- sglang/srt/models/mimo.py +2 -13
- sglang/srt/models/mimo_mtp.py +1 -2
- sglang/srt/models/minicpmo.py +7 -5
- sglang/srt/models/mixtral.py +1 -4
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/mllama4.py +27 -6
- sglang/srt/models/nemotron_h.py +511 -0
- sglang/srt/models/olmo2.py +31 -4
- sglang/srt/models/opt.py +5 -5
- sglang/srt/models/phi.py +1 -1
- sglang/srt/models/phi4mm.py +1 -1
- sglang/srt/models/phimoe.py +0 -1
- sglang/srt/models/pixtral.py +0 -3
- sglang/srt/models/points_v15_chat.py +186 -0
- sglang/srt/models/qwen.py +0 -1
- sglang/srt/models/qwen2.py +0 -7
- sglang/srt/models/qwen2_5_vl.py +5 -5
- sglang/srt/models/qwen2_audio.py +2 -15
- sglang/srt/models/qwen2_moe.py +70 -4
- sglang/srt/models/qwen2_vl.py +6 -3
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +50 -38
- sglang/srt/models/qwen3_next.py +43 -21
- sglang/srt/models/qwen3_next_mtp.py +3 -4
- sglang/srt/models/qwen3_omni_moe.py +661 -0
- sglang/srt/models/qwen3_vl.py +791 -0
- sglang/srt/models/qwen3_vl_moe.py +343 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/roberta.py +55 -3
- sglang/srt/models/sarashina2_vision.py +268 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +3 -5
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +61 -0
- sglang/srt/multimodal/processors/base_processor.py +21 -9
- sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
- sglang/srt/multimodal/processors/dots_vlm.py +2 -4
- sglang/srt/multimodal/processors/glm4v.py +1 -5
- sglang/srt/multimodal/processors/internvl.py +20 -10
- sglang/srt/multimodal/processors/janus_pro.py +0 -1
- sglang/srt/multimodal/processors/mllama4.py +0 -8
- sglang/srt/multimodal/processors/phi4mm.py +0 -1
- sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
- sglang/srt/multimodal/processors/qwen_vl.py +83 -17
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/multimodal/processors/step3_vl.py +1 -1
- sglang/srt/parser/conversation.py +41 -0
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/parser/reasoning_parser.py +0 -1
- sglang/srt/sampling/custom_logit_processor.py +77 -2
- sglang/srt/sampling/sampling_batch_info.py +36 -23
- sglang/srt/sampling/sampling_params.py +75 -0
- sglang/srt/server_args.py +1300 -338
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +161 -0
- sglang/srt/speculative/base_spec_worker.py +34 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/draft_utils.py +226 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
- sglang/srt/speculative/eagle_info.py +786 -0
- sglang/srt/speculative/eagle_info_v2.py +458 -0
- sglang/srt/speculative/eagle_utils.py +113 -1270
- sglang/srt/speculative/eagle_worker.py +120 -285
- sglang/srt/speculative/eagle_worker_v2.py +702 -0
- sglang/srt/speculative/ngram_info.py +433 -0
- sglang/srt/speculative/ngram_worker.py +246 -0
- sglang/srt/speculative/spec_info.py +49 -0
- sglang/srt/speculative/spec_utils.py +641 -0
- sglang/srt/speculative/standalone_worker.py +4 -14
- sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
- sglang/srt/tracing/trace.py +32 -6
- sglang/srt/two_batch_overlap.py +35 -18
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
- sglang/srt/{utils.py → utils/common.py} +583 -113
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
- sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
- sglang/srt/{offloader.py → utils/offloader.py} +4 -4
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/profile_merger.py +199 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_flashattn_backend.py +1 -1
- sglang/test/attention/test_flashattn_mla_backend.py +0 -1
- sglang/test/attention/test_prefix_chunk_info.py +0 -2
- sglang/test/attention/test_trtllm_mla_backend.py +221 -53
- sglang/test/few_shot_gsm8k_engine.py +2 -4
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/kit_matched_stop.py +157 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +120 -11
- sglang/test/runners.py +3 -1
- sglang/test/send_one.py +42 -7
- sglang/test/simple_eval_common.py +8 -2
- sglang/test/simple_eval_gpqa.py +0 -1
- sglang/test/simple_eval_humaneval.py +0 -3
- sglang/test/simple_eval_longbench_v2.py +344 -0
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +3 -4
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
- sglang/test/test_cutlass_moe.py +1 -2
- sglang/test/test_cutlass_w4a8_moe.py +10 -20
- sglang/test/test_deterministic.py +430 -0
- sglang/test/test_deterministic_utils.py +73 -0
- sglang/test/test_disaggregation_utils.py +93 -1
- sglang/test/test_marlin_moe.py +0 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +432 -16
- sglang/utils.py +10 -1
- sglang/version.py +1 -1
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
- sglang/srt/entrypoints/grpc_request_manager.py +0 -580
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/srt/speculative/build_eagle_tree.py +0 -427
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
- /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
- /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
|
@@ -17,21 +17,22 @@ import logging
|
|
|
17
17
|
import math
|
|
18
18
|
import os
|
|
19
19
|
from enum import Enum, IntEnum, auto
|
|
20
|
-
from typing import List, Optional, Set, Union
|
|
20
|
+
from typing import Any, List, Optional, Set, Union
|
|
21
21
|
|
|
22
22
|
import torch
|
|
23
23
|
from transformers import PretrainedConfig
|
|
24
24
|
|
|
25
|
-
from sglang.srt.
|
|
25
|
+
from sglang.srt.environ import envs
|
|
26
|
+
from sglang.srt.layers.quantization import QUANTIZATION_METHODS
|
|
27
|
+
from sglang.srt.server_args import ServerArgs
|
|
28
|
+
from sglang.srt.utils import is_hip, retry
|
|
29
|
+
from sglang.srt.utils.hf_transformers_utils import (
|
|
26
30
|
get_config,
|
|
27
31
|
get_context_length,
|
|
28
32
|
get_generation_config,
|
|
29
33
|
get_hf_text_config,
|
|
30
34
|
get_sparse_attention_config,
|
|
31
35
|
)
|
|
32
|
-
from sglang.srt.layers.quantization import QUANTIZATION_METHODS
|
|
33
|
-
from sglang.srt.server_args import ServerArgs
|
|
34
|
-
from sglang.srt.utils import get_bool_env_var, is_hip
|
|
35
36
|
from sglang.utils import is_in_ci
|
|
36
37
|
|
|
37
38
|
logger = logging.getLogger(__name__)
|
|
@@ -48,6 +49,34 @@ class ModelImpl(str, Enum):
|
|
|
48
49
|
TRANSFORMERS = "transformers"
|
|
49
50
|
|
|
50
51
|
|
|
52
|
+
def is_deepseek_nsa(config: PretrainedConfig) -> bool:
|
|
53
|
+
return (
|
|
54
|
+
config.architectures is not None
|
|
55
|
+
and config.architectures[0]
|
|
56
|
+
in [
|
|
57
|
+
"DeepseekV3ForCausalLM",
|
|
58
|
+
"DeepseekV32ForCausalLM",
|
|
59
|
+
"DeepseekV3ForCausalLMNextN",
|
|
60
|
+
]
|
|
61
|
+
and getattr(config, "index_topk", None) is not None
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def get_nsa_index_head_dim(config: PretrainedConfig) -> int:
|
|
66
|
+
assert is_deepseek_nsa(config)
|
|
67
|
+
return config.index_head_dim
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def get_nsa_index_topk(config: PretrainedConfig) -> int:
|
|
71
|
+
assert is_deepseek_nsa(config)
|
|
72
|
+
return config.index_topk
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def get_nsa_index_n_heads(config: PretrainedConfig) -> int:
|
|
76
|
+
assert is_deepseek_nsa(config)
|
|
77
|
+
return config.index_n_heads
|
|
78
|
+
|
|
79
|
+
|
|
51
80
|
class ModelConfig:
|
|
52
81
|
def __init__(
|
|
53
82
|
self,
|
|
@@ -62,37 +91,31 @@ class ModelConfig:
|
|
|
62
91
|
quantization: Optional[str] = None,
|
|
63
92
|
override_config_file: Optional[str] = None,
|
|
64
93
|
is_draft_model: bool = False,
|
|
65
|
-
hybrid_kvcache_ratio: Optional[
|
|
94
|
+
hybrid_kvcache_ratio: Optional[
|
|
95
|
+
float
|
|
96
|
+
] = None, # TODO: remove this, it is not a model config
|
|
66
97
|
model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
remote_instance_weight_loader_seed_instance_service_port: Optional[int] = None,
|
|
70
|
-
remote_instance_weight_loader_send_weights_group_ports: Optional[
|
|
71
|
-
List[int]
|
|
72
|
-
] = None,
|
|
98
|
+
sampling_defaults: str = "openai",
|
|
99
|
+
quantize_and_serve: bool = False,
|
|
73
100
|
) -> None:
|
|
74
101
|
# Parse args
|
|
75
102
|
self.model_path = model_path
|
|
76
103
|
self.revision = revision
|
|
77
104
|
self.quantization = quantization
|
|
105
|
+
self.is_draft_model = is_draft_model
|
|
78
106
|
self.model_impl = model_impl
|
|
79
|
-
self.
|
|
80
|
-
self.
|
|
81
|
-
remote_instance_weight_loader_seed_instance_ip
|
|
82
|
-
)
|
|
83
|
-
self.remote_instance_weight_loader_seed_instance_service_port = (
|
|
84
|
-
remote_instance_weight_loader_seed_instance_service_port
|
|
85
|
-
)
|
|
86
|
-
self.remote_instance_weight_loader_send_weights_group_ports = (
|
|
87
|
-
remote_instance_weight_loader_send_weights_group_ports
|
|
88
|
-
)
|
|
107
|
+
self.sampling_defaults = sampling_defaults
|
|
108
|
+
self.quantize_and_serve = quantize_and_serve
|
|
89
109
|
|
|
90
|
-
|
|
110
|
+
# Validate quantize_and_serve configuration
|
|
111
|
+
self._validate_quantize_and_serve_config()
|
|
112
|
+
|
|
113
|
+
# Get hf config
|
|
114
|
+
self._maybe_pull_model_tokenizer_from_remote()
|
|
91
115
|
self.model_override_args = json.loads(model_override_args)
|
|
92
116
|
kwargs = {}
|
|
93
117
|
if override_config_file and override_config_file.strip():
|
|
94
118
|
kwargs["_configuration_file"] = override_config_file.strip()
|
|
95
|
-
|
|
96
119
|
self.hf_config = get_config(
|
|
97
120
|
self.model_path,
|
|
98
121
|
trust_remote_code=trust_remote_code,
|
|
@@ -100,7 +123,7 @@ class ModelConfig:
|
|
|
100
123
|
model_override_args=self.model_override_args,
|
|
101
124
|
**kwargs,
|
|
102
125
|
)
|
|
103
|
-
|
|
126
|
+
self.hf_text_config = get_hf_text_config(self.hf_config)
|
|
104
127
|
self.hf_generation_config = get_generation_config(
|
|
105
128
|
self.model_path,
|
|
106
129
|
trust_remote_code=trust_remote_code,
|
|
@@ -108,7 +131,25 @@ class ModelConfig:
|
|
|
108
131
|
**kwargs,
|
|
109
132
|
)
|
|
110
133
|
|
|
111
|
-
|
|
134
|
+
# Set enable_multimodal
|
|
135
|
+
if enable_multimodal is None:
|
|
136
|
+
mm_disabled_models = [
|
|
137
|
+
"Gemma3ForConditionalGeneration",
|
|
138
|
+
"Llama4ForConditionalGeneration",
|
|
139
|
+
"Step3VLForConditionalGeneration",
|
|
140
|
+
]
|
|
141
|
+
if self.hf_config.architectures[0] in mm_disabled_models:
|
|
142
|
+
enable_multimodal = False
|
|
143
|
+
logger.info(
|
|
144
|
+
f"Multimodal is disabled for {self.hf_config.model_type}. To enable it, set --enable-multimodal."
|
|
145
|
+
)
|
|
146
|
+
else:
|
|
147
|
+
enable_multimodal = True
|
|
148
|
+
|
|
149
|
+
# Config draft model
|
|
150
|
+
self._config_draft_model()
|
|
151
|
+
|
|
152
|
+
# Check model type
|
|
112
153
|
self.attention_chunk_size = getattr(
|
|
113
154
|
self.hf_text_config, "attention_chunk_size", None
|
|
114
155
|
)
|
|
@@ -124,20 +165,72 @@ class ModelConfig:
|
|
|
124
165
|
self.hf_config.architectures, self.hf_text_config.num_hidden_layers
|
|
125
166
|
)
|
|
126
167
|
)
|
|
168
|
+
self.is_generation = is_generation_model(
|
|
169
|
+
self.hf_config.architectures, is_embedding
|
|
170
|
+
)
|
|
171
|
+
self.is_multimodal = enable_multimodal and is_multimodal_model(
|
|
172
|
+
self.hf_config.architectures
|
|
173
|
+
)
|
|
174
|
+
self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
|
|
175
|
+
self.hf_config.architectures
|
|
176
|
+
)
|
|
177
|
+
self.is_image_gen = enable_multimodal and is_image_gen_model(
|
|
178
|
+
self.hf_config.architectures
|
|
179
|
+
)
|
|
180
|
+
self.is_audio_model = enable_multimodal and is_audio_model(
|
|
181
|
+
self.hf_config.architectures
|
|
182
|
+
)
|
|
183
|
+
self.is_multimodal_chunked_prefill_supported = (
|
|
184
|
+
enable_multimodal
|
|
185
|
+
and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
|
|
186
|
+
)
|
|
187
|
+
self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
|
|
188
|
+
self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
|
|
127
189
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
190
|
+
# Derive context length and model shapes
|
|
191
|
+
self._derive_context_length(context_length)
|
|
192
|
+
self._derive_model_shapes()
|
|
193
|
+
|
|
194
|
+
# Verify quantization
|
|
195
|
+
self._verify_quantization()
|
|
196
|
+
|
|
197
|
+
# Verify dual-chunk attention config
|
|
198
|
+
self._verify_dual_chunk_attention_config()
|
|
199
|
+
|
|
200
|
+
# Cache attributes
|
|
201
|
+
self.hf_eos_token_id = self._get_hf_eos_token_id()
|
|
202
|
+
|
|
203
|
+
# multimodal
|
|
204
|
+
self.image_token_id = getattr(
|
|
205
|
+
self.hf_config, "image_token_id", None
|
|
206
|
+
) or getattr(self.hf_config, "image_token_index", None)
|
|
207
|
+
|
|
208
|
+
@staticmethod
|
|
209
|
+
def from_server_args(
|
|
210
|
+
server_args: ServerArgs,
|
|
211
|
+
model_path: str = None,
|
|
212
|
+
model_revision: str = None,
|
|
213
|
+
**kwargs,
|
|
214
|
+
):
|
|
215
|
+
return ModelConfig(
|
|
216
|
+
model_path=model_path or server_args.model_path,
|
|
217
|
+
trust_remote_code=server_args.trust_remote_code,
|
|
218
|
+
revision=model_revision or server_args.revision,
|
|
219
|
+
context_length=server_args.context_length,
|
|
220
|
+
model_override_args=server_args.json_model_override_args,
|
|
221
|
+
is_embedding=server_args.is_embedding,
|
|
222
|
+
enable_multimodal=server_args.enable_multimodal,
|
|
223
|
+
dtype=server_args.dtype,
|
|
224
|
+
quantization=server_args.quantization,
|
|
225
|
+
hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
|
|
226
|
+
model_impl=server_args.model_impl,
|
|
227
|
+
sampling_defaults=server_args.sampling_defaults,
|
|
228
|
+
quantize_and_serve=server_args.quantize_and_serve,
|
|
229
|
+
**kwargs,
|
|
230
|
+
)
|
|
231
|
+
|
|
232
|
+
def _config_draft_model(self):
|
|
233
|
+
is_draft_model = self.is_draft_model
|
|
141
234
|
|
|
142
235
|
if (
|
|
143
236
|
is_draft_model
|
|
@@ -172,31 +265,10 @@ class ModelConfig:
|
|
|
172
265
|
self.hf_config.architectures[0] = "Qwen3NextForCausalLMMTP"
|
|
173
266
|
self.hf_config.num_nextn_predict_layers = 1
|
|
174
267
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
self.hf_config.architectures, is_embedding
|
|
178
|
-
)
|
|
179
|
-
self.is_multimodal = enable_multimodal and is_multimodal_model(
|
|
180
|
-
self.hf_config.architectures
|
|
181
|
-
)
|
|
182
|
-
self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
|
|
183
|
-
self.hf_config.architectures
|
|
184
|
-
)
|
|
185
|
-
self.is_image_gen = enable_multimodal and is_image_gen_model(
|
|
186
|
-
self.hf_config.architectures
|
|
187
|
-
)
|
|
188
|
-
self.is_audio_model = enable_multimodal and is_audio_model(
|
|
189
|
-
self.hf_config.architectures
|
|
190
|
-
)
|
|
191
|
-
self.is_multimodal_chunked_prefill_supported = (
|
|
192
|
-
enable_multimodal
|
|
193
|
-
and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
|
|
194
|
-
)
|
|
195
|
-
self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
|
|
196
|
-
self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
|
|
197
|
-
|
|
198
|
-
# Derive context length
|
|
268
|
+
def _derive_context_length(self, context_length: int):
|
|
269
|
+
is_draft_model = self.is_draft_model
|
|
199
270
|
derived_context_len = get_context_length(self.hf_text_config)
|
|
271
|
+
|
|
200
272
|
if context_length is not None:
|
|
201
273
|
if context_length > derived_context_len:
|
|
202
274
|
reason = "Target model's" if is_draft_model else "User-specified"
|
|
@@ -205,11 +277,16 @@ class ModelConfig:
|
|
|
205
277
|
f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config."
|
|
206
278
|
)
|
|
207
279
|
if (
|
|
208
|
-
|
|
280
|
+
envs.SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN.get()
|
|
209
281
|
or is_in_ci() # FIXME: fix this special case
|
|
210
282
|
):
|
|
211
283
|
logger.warning(msg)
|
|
212
284
|
self.context_len = context_length
|
|
285
|
+
if is_draft_model:
|
|
286
|
+
self.hf_text_config.max_position_embeddings = context_length
|
|
287
|
+
logger.warning(
|
|
288
|
+
f"Overriding the draft model's max_position_embeddings to {context_length}."
|
|
289
|
+
)
|
|
213
290
|
else:
|
|
214
291
|
raise ValueError(
|
|
215
292
|
f"{msg} To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
|
|
@@ -219,6 +296,10 @@ class ModelConfig:
|
|
|
219
296
|
else:
|
|
220
297
|
self.context_len = derived_context_len
|
|
221
298
|
|
|
299
|
+
# Transfer context_len to HuggingFace config so models can access it
|
|
300
|
+
self.hf_config.context_len = self.context_len
|
|
301
|
+
|
|
302
|
+
def _derive_model_shapes(self):
|
|
222
303
|
# Unify the config keys for hf_text_config
|
|
223
304
|
self.head_dim = getattr(
|
|
224
305
|
self.hf_text_config,
|
|
@@ -229,6 +310,7 @@ class ModelConfig:
|
|
|
229
310
|
# FIXME: temporary special judge for MLA architecture
|
|
230
311
|
if (
|
|
231
312
|
"DeepseekV2ForCausalLM" in self.hf_config.architectures
|
|
313
|
+
or "DeepseekV32ForCausalLM" in self.hf_config.architectures
|
|
232
314
|
or "DeepseekV3ForCausalLM" in self.hf_config.architectures
|
|
233
315
|
or "DeepseekV3ForCausalLMNextN" in self.hf_config.architectures
|
|
234
316
|
or "LongcatFlashForCausalLM" in self.hf_config.architectures
|
|
@@ -241,6 +323,11 @@ class ModelConfig:
|
|
|
241
323
|
self.qk_nope_head_dim = self.hf_config.qk_nope_head_dim
|
|
242
324
|
self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
|
|
243
325
|
self.v_head_dim = self.hf_config.v_head_dim
|
|
326
|
+
self.index_head_dim = (
|
|
327
|
+
get_nsa_index_head_dim(self.hf_config)
|
|
328
|
+
if is_deepseek_nsa(self.hf_config)
|
|
329
|
+
else None
|
|
330
|
+
)
|
|
244
331
|
|
|
245
332
|
# Handle rope scaling with yarn
|
|
246
333
|
self.scaling = 1 / math.sqrt(self.qk_nope_head_dim + self.qk_rope_head_dim)
|
|
@@ -313,45 +400,6 @@ class ModelConfig:
|
|
|
313
400
|
)
|
|
314
401
|
self.vocab_size = self.hf_text_config.vocab_size
|
|
315
402
|
|
|
316
|
-
# Verify quantization
|
|
317
|
-
self._verify_quantization()
|
|
318
|
-
|
|
319
|
-
# Verify dual-chunk attention config
|
|
320
|
-
self._verify_dual_chunk_attention_config()
|
|
321
|
-
|
|
322
|
-
# Cache attributes
|
|
323
|
-
self.hf_eos_token_id = self.get_hf_eos_token_id()
|
|
324
|
-
|
|
325
|
-
# multimodal
|
|
326
|
-
self.image_token_id = getattr(
|
|
327
|
-
self.hf_config, "image_token_id", None
|
|
328
|
-
) or getattr(self.hf_config, "image_token_index", None)
|
|
329
|
-
|
|
330
|
-
@staticmethod
|
|
331
|
-
def from_server_args(
|
|
332
|
-
server_args: ServerArgs,
|
|
333
|
-
model_path: str = None,
|
|
334
|
-
model_revision: str = None,
|
|
335
|
-
**kwargs,
|
|
336
|
-
):
|
|
337
|
-
return ModelConfig(
|
|
338
|
-
model_path=model_path or server_args.model_path,
|
|
339
|
-
trust_remote_code=server_args.trust_remote_code,
|
|
340
|
-
revision=model_revision or server_args.revision,
|
|
341
|
-
context_length=server_args.context_length,
|
|
342
|
-
model_override_args=server_args.json_model_override_args,
|
|
343
|
-
is_embedding=server_args.is_embedding,
|
|
344
|
-
enable_multimodal=server_args.enable_multimodal,
|
|
345
|
-
dtype=server_args.dtype,
|
|
346
|
-
quantization=server_args.quantization,
|
|
347
|
-
hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
|
|
348
|
-
model_impl=server_args.model_impl,
|
|
349
|
-
remote_instance_weight_loader_seed_instance_ip=server_args.remote_instance_weight_loader_seed_instance_ip,
|
|
350
|
-
remote_instance_weight_loader_seed_instance_service_port=server_args.remote_instance_weight_loader_seed_instance_service_port,
|
|
351
|
-
remote_instance_weight_loader_send_weights_group_ports=server_args.remote_instance_weight_loader_send_weights_group_ports,
|
|
352
|
-
**kwargs,
|
|
353
|
-
)
|
|
354
|
-
|
|
355
403
|
def get_total_num_attention_heads(self) -> int:
|
|
356
404
|
return self.num_attention_heads
|
|
357
405
|
|
|
@@ -444,36 +492,114 @@ class ModelConfig:
|
|
|
444
492
|
# example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
|
|
445
493
|
# example: https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/tree/main
|
|
446
494
|
is_local = os.path.exists(self.model_path)
|
|
447
|
-
modelopt_quant_config = {"quant_method": "modelopt"}
|
|
448
495
|
if not is_local:
|
|
449
496
|
import huggingface_hub
|
|
450
497
|
|
|
451
498
|
try:
|
|
452
|
-
from huggingface_hub import HfApi
|
|
499
|
+
from huggingface_hub import HfApi, hf_hub_download
|
|
453
500
|
|
|
454
501
|
hf_api = HfApi()
|
|
455
|
-
|
|
456
|
-
|
|
502
|
+
# Retry HF API call up to 3 times
|
|
503
|
+
file_exists = retry(
|
|
504
|
+
lambda: hf_api.file_exists(
|
|
505
|
+
self.model_path, "hf_quant_config.json"
|
|
506
|
+
),
|
|
507
|
+
max_retry=2,
|
|
508
|
+
initial_delay=1.0,
|
|
509
|
+
max_delay=5.0,
|
|
510
|
+
)
|
|
511
|
+
if file_exists:
|
|
512
|
+
# Download and parse the quantization config for remote models
|
|
513
|
+
quant_config_file = hf_hub_download(
|
|
514
|
+
repo_id=self.model_path,
|
|
515
|
+
filename="hf_quant_config.json",
|
|
516
|
+
revision=self.revision,
|
|
517
|
+
)
|
|
518
|
+
with open(quant_config_file) as f:
|
|
519
|
+
quant_config_dict = json.load(f)
|
|
520
|
+
quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
|
|
457
521
|
except huggingface_hub.errors.OfflineModeIsEnabled:
|
|
458
522
|
logger.warning(
|
|
459
523
|
"Offline mode is enabled, skipping hf_quant_config.json check"
|
|
460
524
|
)
|
|
461
|
-
|
|
462
|
-
|
|
525
|
+
except Exception as e:
|
|
526
|
+
logger.warning(
|
|
527
|
+
f"Failed to check hf_quant_config.json: {self.model_path} {e}"
|
|
528
|
+
)
|
|
463
529
|
elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
|
|
464
530
|
quant_config_file = os.path.join(
|
|
465
531
|
self.model_path, "hf_quant_config.json"
|
|
466
532
|
)
|
|
467
533
|
with open(quant_config_file) as f:
|
|
468
534
|
quant_config_dict = json.load(f)
|
|
469
|
-
|
|
470
|
-
quant_algo = json_quant_configs.get("quant_algo", None)
|
|
471
|
-
if quant_algo == "MIXED_PRECISION":
|
|
472
|
-
quant_cfg = {"quant_method": "w4afp8"}
|
|
473
|
-
else:
|
|
474
|
-
quant_cfg = modelopt_quant_config
|
|
535
|
+
quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
|
|
475
536
|
return quant_cfg
|
|
476
537
|
|
|
538
|
+
def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> dict:
|
|
539
|
+
"""Parse ModelOpt quantization config and return the appropriate quant_method."""
|
|
540
|
+
json_quant_configs = quant_config_dict["quantization"]
|
|
541
|
+
quant_algo = json_quant_configs.get("quant_algo", None)
|
|
542
|
+
|
|
543
|
+
if quant_algo == "MIXED_PRECISION":
|
|
544
|
+
return {"quant_method": "w4afp8"}
|
|
545
|
+
elif quant_algo and ("FP4" in quant_algo or "NVFP4" in quant_algo):
|
|
546
|
+
return {"quant_method": "modelopt_fp4"}
|
|
547
|
+
elif quant_algo and "FP8" in quant_algo:
|
|
548
|
+
return {"quant_method": "modelopt_fp8"}
|
|
549
|
+
else:
|
|
550
|
+
# Default to FP8 for backward compatibility
|
|
551
|
+
return {"quant_method": "modelopt_fp8"}
|
|
552
|
+
|
|
553
|
+
def _is_already_quantized(self) -> bool:
|
|
554
|
+
"""Check if the model is already quantized based on config files."""
|
|
555
|
+
# Check for HuggingFace quantization config
|
|
556
|
+
from sglang.srt.utils import has_hf_quant_config
|
|
557
|
+
|
|
558
|
+
return has_hf_quant_config(self.model_path)
|
|
559
|
+
|
|
560
|
+
def _get_modelopt_quant_type(self) -> str:
|
|
561
|
+
"""Extract ModelOpt quantization type from unified quantization flag."""
|
|
562
|
+
if self.quantization == "modelopt_fp8":
|
|
563
|
+
return "fp8"
|
|
564
|
+
elif self.quantization == "modelopt_fp4":
|
|
565
|
+
return "nvfp4"
|
|
566
|
+
elif self.quantization == "modelopt":
|
|
567
|
+
# Auto-detect from model config
|
|
568
|
+
quant_cfg = self._parse_quant_hf_config()
|
|
569
|
+
if quant_cfg:
|
|
570
|
+
quant_method = quant_cfg.get("quant_method", "").lower()
|
|
571
|
+
if "fp4" in quant_method:
|
|
572
|
+
return "fp4"
|
|
573
|
+
elif "fp8" in quant_method:
|
|
574
|
+
return "fp8"
|
|
575
|
+
# Default to fp8 if can't detect
|
|
576
|
+
return "fp8"
|
|
577
|
+
else:
|
|
578
|
+
return "fp8" # Default fallback
|
|
579
|
+
|
|
580
|
+
def _validate_quantize_and_serve_config(self):
|
|
581
|
+
"""Validate quantize_and_serve configuration."""
|
|
582
|
+
if not self.quantize_and_serve:
|
|
583
|
+
return
|
|
584
|
+
|
|
585
|
+
# Check if ModelOpt quantization is specified
|
|
586
|
+
modelopt_quantization_specified = self.quantization in [
|
|
587
|
+
"modelopt",
|
|
588
|
+
"modelopt_fp8",
|
|
589
|
+
"modelopt_fp4",
|
|
590
|
+
]
|
|
591
|
+
|
|
592
|
+
if not modelopt_quantization_specified:
|
|
593
|
+
raise ValueError("quantize_and_serve requires ModelOpt quantization")
|
|
594
|
+
|
|
595
|
+
# quantize_and_serve is disabled due to compatibility issues
|
|
596
|
+
raise NotImplementedError(
|
|
597
|
+
"quantize_and_serve functionality is currently disabled due to compatibility issues. "
|
|
598
|
+
"Please use the separate quantize-then-deploy workflow instead. "
|
|
599
|
+
"Step 1: Quantize and export model. "
|
|
600
|
+
"Step 2: Deploy the exported model."
|
|
601
|
+
)
|
|
602
|
+
|
|
477
603
|
# adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
|
|
478
604
|
def _verify_quantization(self) -> None:
|
|
479
605
|
supported_quantization = [*QUANTIZATION_METHODS]
|
|
@@ -492,7 +618,8 @@ class ModelConfig:
|
|
|
492
618
|
optimized_quantization_methods = [
|
|
493
619
|
"fp8",
|
|
494
620
|
"marlin",
|
|
495
|
-
"
|
|
621
|
+
"modelopt_fp8",
|
|
622
|
+
"modelopt_fp4",
|
|
496
623
|
"gptq_marlin_24",
|
|
497
624
|
"gptq_marlin",
|
|
498
625
|
"awq_marlin",
|
|
@@ -586,7 +713,7 @@ class ModelConfig:
|
|
|
586
713
|
"sparse_attention_enabled"
|
|
587
714
|
] = True
|
|
588
715
|
|
|
589
|
-
def
|
|
716
|
+
def _get_hf_eos_token_id(self) -> Optional[Set[int]]:
|
|
590
717
|
eos_ids = getattr(self.hf_config, "eos_token_id", None)
|
|
591
718
|
if eos_ids is not None:
|
|
592
719
|
# it can be either int or list of int
|
|
@@ -606,7 +733,39 @@ class ModelConfig:
|
|
|
606
733
|
eos_ids = eos_ids | generation_eos_ids
|
|
607
734
|
return eos_ids
|
|
608
735
|
|
|
609
|
-
def
|
|
736
|
+
def get_default_sampling_params(self) -> dict[str, Any]:
|
|
737
|
+
"""
|
|
738
|
+
Get default sampling parameters from the model's generation config.
|
|
739
|
+
|
|
740
|
+
This method returns non-default sampling parameters from the model's
|
|
741
|
+
generation_config.json when sampling_defaults is set to "model".
|
|
742
|
+
|
|
743
|
+
Returns:
|
|
744
|
+
A dictionary containing the non-default sampling parameters.
|
|
745
|
+
"""
|
|
746
|
+
if self.sampling_defaults != "model":
|
|
747
|
+
return {}
|
|
748
|
+
|
|
749
|
+
if self.hf_generation_config is None:
|
|
750
|
+
return {}
|
|
751
|
+
|
|
752
|
+
config = self.hf_generation_config.to_dict()
|
|
753
|
+
|
|
754
|
+
available_params = [
|
|
755
|
+
"repetition_penalty",
|
|
756
|
+
"temperature",
|
|
757
|
+
"top_k",
|
|
758
|
+
"top_p",
|
|
759
|
+
"min_p",
|
|
760
|
+
]
|
|
761
|
+
|
|
762
|
+
default_sampling_params = {
|
|
763
|
+
p: config.get(p) for p in available_params if config.get(p) is not None
|
|
764
|
+
}
|
|
765
|
+
|
|
766
|
+
return default_sampling_params
|
|
767
|
+
|
|
768
|
+
def _maybe_pull_model_tokenizer_from_remote(self) -> None:
|
|
610
769
|
"""
|
|
611
770
|
Pull the model config files to a temporary
|
|
612
771
|
directory in case of remote.
|
|
@@ -749,13 +908,20 @@ multimodal_model_archs = [
|
|
|
749
908
|
"Qwen2AudioForConditionalGeneration",
|
|
750
909
|
"Qwen2VLForConditionalGeneration",
|
|
751
910
|
"Qwen2_5_VLForConditionalGeneration",
|
|
911
|
+
"Qwen3VLForConditionalGeneration",
|
|
912
|
+
"Qwen3VLMoeForConditionalGeneration",
|
|
913
|
+
"Qwen3OmniMoeForConditionalGeneration",
|
|
752
914
|
"KimiVLForConditionalGeneration",
|
|
753
915
|
"InternVLChatModel",
|
|
754
916
|
"InternS1ForConditionalGeneration",
|
|
755
917
|
"Phi4MMForCausalLM",
|
|
756
918
|
"VILAForConditionalGeneration",
|
|
757
919
|
"Step3VLForConditionalGeneration",
|
|
920
|
+
"POINTSV15ChatModel",
|
|
758
921
|
"DotsVLMForCausalLM",
|
|
922
|
+
"DotsOCRForCausalLM",
|
|
923
|
+
"Sarashina2VisionForCausalLM",
|
|
924
|
+
"DeepseekOCRForCausalLM",
|
|
759
925
|
]
|
|
760
926
|
|
|
761
927
|
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Configuration for NVIDIA ModelOpt quantization integration
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
|
|
7
|
+
class ModelOptConfig:
|
|
8
|
+
"""Configuration for NVIDIA ModelOpt quantization operations.
|
|
9
|
+
|
|
10
|
+
This configuration class holds parameters for ModelOpt quantization,
|
|
11
|
+
checkpoint management, and model export operations.
|
|
12
|
+
|
|
13
|
+
Args:
|
|
14
|
+
quant: Quantization method/type (e.g., "fp8", "fp4")
|
|
15
|
+
checkpoint_restore_path: Path to restore ModelOpt checkpoint from
|
|
16
|
+
checkpoint_save_path: Path to save ModelOpt checkpoint to
|
|
17
|
+
export_path: Path to export quantized model in HuggingFace format
|
|
18
|
+
quantize_and_serve: Whether to quantize and serve in one step
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
quant: Optional[str] = None
|
|
22
|
+
checkpoint_restore_path: Optional[str] = None
|
|
23
|
+
checkpoint_save_path: Optional[str] = None
|
|
24
|
+
export_path: Optional[str] = None
|
|
25
|
+
quantize_and_serve: bool = False
|
|
26
|
+
|
|
27
|
+
def __post_init__(self):
|
|
28
|
+
"""Validate configuration after initialization."""
|
|
29
|
+
# Add any validation logic if needed
|
|
30
|
+
pass
|