sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -11
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +474 -142
- sglang/compile_deep_gemm.py +3 -0
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +10 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +314 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +228 -92
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/qwen3_next.py +294 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +78 -37
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +373 -68
- sglang/srt/disaggregation/prefill.py +53 -49
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +842 -0
- sglang/srt/entrypoints/grpc_server.py +950 -0
- sglang/srt/entrypoints/http_server.py +179 -60
- sglang/srt/entrypoints/openai/protocol.py +265 -29
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +213 -122
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +289 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +17 -8
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +215 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +40 -8
- sglang/srt/layers/attention/flashinfer_backend.py +341 -204
- sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
- sglang/srt/layers/attention/mamba/mamba.py +577 -0
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +180 -18
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
- sglang/srt/layers/moe/ep_moe/layer.py +248 -333
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +83 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +29 -7
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +155 -60
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +191 -56
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +28 -33
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +44 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +55 -0
- sglang/srt/managers/schedule_batch.py +343 -212
- sglang/srt/managers/schedule_policy.py +145 -18
- sglang/srt/managers/scheduler.py +653 -273
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +579 -674
- sglang/srt/managers/tp_worker.py +96 -26
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +21 -22
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +9 -2
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +651 -80
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +227 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +93 -48
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +74 -46
- sglang/srt/model_executor/model_runner.py +455 -176
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +10 -4
- sglang/srt/model_loader/loader.py +319 -10
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +161 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +578 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +17 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/grok.py +5 -13
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mixtral.py +1 -3
- sglang/srt/models/mllama4.py +50 -4
- sglang/srt/models/nemotron_h.py +514 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +55 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +49 -26
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +1051 -285
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +98 -29
- sglang/srt/speculative/ngram_info.py +428 -0
- sglang/srt/speculative/ngram_worker.py +246 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +605 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +9 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +451 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +119 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_longbench_v2.py +332 -0
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +9 -19
- sglang/test/test_deterministic.py +313 -0
- sglang/test/test_deterministic_utils.py +81 -0
- sglang/test/test_disaggregation_utils.py +140 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +407 -8
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
@@ -17,21 +17,22 @@ import logging
|
|
17
17
|
import math
|
18
18
|
import os
|
19
19
|
from enum import Enum, IntEnum, auto
|
20
|
-
from typing import List, Optional, Set, Union
|
20
|
+
from typing import Any, Dict, List, Optional, Set, Union
|
21
21
|
|
22
22
|
import torch
|
23
23
|
from transformers import PretrainedConfig
|
24
24
|
|
25
|
-
from sglang.srt.
|
25
|
+
from sglang.srt.environ import envs
|
26
|
+
from sglang.srt.layers.quantization import QUANTIZATION_METHODS
|
27
|
+
from sglang.srt.server_args import ServerArgs
|
28
|
+
from sglang.srt.utils import is_hip, retry
|
29
|
+
from sglang.srt.utils.hf_transformers_utils import (
|
26
30
|
get_config,
|
27
31
|
get_context_length,
|
28
32
|
get_generation_config,
|
29
33
|
get_hf_text_config,
|
30
34
|
get_sparse_attention_config,
|
31
35
|
)
|
32
|
-
from sglang.srt.layers.quantization import QUANTIZATION_METHODS
|
33
|
-
from sglang.srt.server_args import ServerArgs
|
34
|
-
from sglang.srt.utils import get_bool_env_var, is_hip
|
35
36
|
from sglang.utils import is_in_ci
|
36
37
|
|
37
38
|
logger = logging.getLogger(__name__)
|
@@ -48,6 +49,30 @@ class ModelImpl(str, Enum):
|
|
48
49
|
TRANSFORMERS = "transformers"
|
49
50
|
|
50
51
|
|
52
|
+
def is_deepseek_nsa(config: PretrainedConfig) -> bool:
|
53
|
+
return (
|
54
|
+
config.architectures is not None
|
55
|
+
and config.architectures[0]
|
56
|
+
in ["DeepseekV3ForCausalLM", "DeepseekV32ForCausalLM"]
|
57
|
+
and getattr(config, "index_topk", None) is not None
|
58
|
+
)
|
59
|
+
|
60
|
+
|
61
|
+
def get_nsa_index_head_dim(config: PretrainedConfig) -> int:
|
62
|
+
assert is_deepseek_nsa(config)
|
63
|
+
return config.index_head_dim
|
64
|
+
|
65
|
+
|
66
|
+
def get_nsa_index_topk(config: PretrainedConfig) -> int:
|
67
|
+
assert is_deepseek_nsa(config)
|
68
|
+
return config.index_topk
|
69
|
+
|
70
|
+
|
71
|
+
def get_nsa_index_n_heads(config: PretrainedConfig) -> int:
|
72
|
+
assert is_deepseek_nsa(config)
|
73
|
+
return config.index_n_heads
|
74
|
+
|
75
|
+
|
51
76
|
class ModelConfig:
|
52
77
|
def __init__(
|
53
78
|
self,
|
@@ -60,23 +85,28 @@ class ModelConfig:
|
|
60
85
|
enable_multimodal: Optional[bool] = None,
|
61
86
|
dtype: str = "auto",
|
62
87
|
quantization: Optional[str] = None,
|
88
|
+
modelopt_quant: Optional[Union[str, Dict]] = None,
|
63
89
|
override_config_file: Optional[str] = None,
|
64
90
|
is_draft_model: bool = False,
|
65
91
|
hybrid_kvcache_ratio: Optional[float] = None,
|
66
92
|
model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
|
93
|
+
sampling_defaults: str = "openai",
|
67
94
|
) -> None:
|
68
95
|
# Parse args
|
69
96
|
self.model_path = model_path
|
70
97
|
self.revision = revision
|
71
98
|
self.quantization = quantization
|
99
|
+
self.modelopt_quant = modelopt_quant
|
100
|
+
self.is_draft_model = is_draft_model
|
72
101
|
self.model_impl = model_impl
|
102
|
+
self.sampling_defaults = sampling_defaults
|
73
103
|
|
74
|
-
|
104
|
+
# Get hf config
|
105
|
+
self._maybe_pull_model_tokenizer_from_remote()
|
75
106
|
self.model_override_args = json.loads(model_override_args)
|
76
107
|
kwargs = {}
|
77
108
|
if override_config_file and override_config_file.strip():
|
78
109
|
kwargs["_configuration_file"] = override_config_file.strip()
|
79
|
-
|
80
110
|
self.hf_config = get_config(
|
81
111
|
self.model_path,
|
82
112
|
trust_remote_code=trust_remote_code,
|
@@ -84,7 +114,7 @@ class ModelConfig:
|
|
84
114
|
model_override_args=self.model_override_args,
|
85
115
|
**kwargs,
|
86
116
|
)
|
87
|
-
|
117
|
+
self.hf_text_config = get_hf_text_config(self.hf_config)
|
88
118
|
self.hf_generation_config = get_generation_config(
|
89
119
|
self.model_path,
|
90
120
|
trust_remote_code=trust_remote_code,
|
@@ -92,7 +122,25 @@ class ModelConfig:
|
|
92
122
|
**kwargs,
|
93
123
|
)
|
94
124
|
|
95
|
-
|
125
|
+
# Set enable_multimodal
|
126
|
+
if enable_multimodal is None:
|
127
|
+
mm_disabled_models = [
|
128
|
+
"Gemma3ForConditionalGeneration",
|
129
|
+
"Llama4ForConditionalGeneration",
|
130
|
+
"Step3VLForConditionalGeneration",
|
131
|
+
]
|
132
|
+
if self.hf_config.architectures[0] in mm_disabled_models:
|
133
|
+
enable_multimodal = False
|
134
|
+
logger.info(
|
135
|
+
f"Multimodal is disabled for {self.hf_config.model_type}. To enable it, set --enable-multimodal."
|
136
|
+
)
|
137
|
+
else:
|
138
|
+
enable_multimodal = True
|
139
|
+
|
140
|
+
# Config draft model
|
141
|
+
self._config_draft_model()
|
142
|
+
|
143
|
+
# Check model type
|
96
144
|
self.attention_chunk_size = getattr(
|
97
145
|
self.hf_text_config, "attention_chunk_size", None
|
98
146
|
)
|
@@ -108,20 +156,72 @@ class ModelConfig:
|
|
108
156
|
self.hf_config.architectures, self.hf_text_config.num_hidden_layers
|
109
157
|
)
|
110
158
|
)
|
159
|
+
self.is_generation = is_generation_model(
|
160
|
+
self.hf_config.architectures, is_embedding
|
161
|
+
)
|
162
|
+
self.is_multimodal = enable_multimodal and is_multimodal_model(
|
163
|
+
self.hf_config.architectures
|
164
|
+
)
|
165
|
+
self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
|
166
|
+
self.hf_config.architectures
|
167
|
+
)
|
168
|
+
self.is_image_gen = enable_multimodal and is_image_gen_model(
|
169
|
+
self.hf_config.architectures
|
170
|
+
)
|
171
|
+
self.is_audio_model = enable_multimodal and is_audio_model(
|
172
|
+
self.hf_config.architectures
|
173
|
+
)
|
174
|
+
self.is_multimodal_chunked_prefill_supported = (
|
175
|
+
enable_multimodal
|
176
|
+
and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
|
177
|
+
)
|
178
|
+
self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
|
179
|
+
self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
|
111
180
|
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
181
|
+
# Derive context length and model shapes
|
182
|
+
self._derive_context_length(context_length)
|
183
|
+
self._derive_model_shapes()
|
184
|
+
|
185
|
+
# Verify quantization
|
186
|
+
self._verify_quantization()
|
187
|
+
|
188
|
+
# Verify dual-chunk attention config
|
189
|
+
self._verify_dual_chunk_attention_config()
|
190
|
+
|
191
|
+
# Cache attributes
|
192
|
+
self.hf_eos_token_id = self._get_hf_eos_token_id()
|
193
|
+
|
194
|
+
# multimodal
|
195
|
+
self.image_token_id = getattr(
|
196
|
+
self.hf_config, "image_token_id", None
|
197
|
+
) or getattr(self.hf_config, "image_token_index", None)
|
198
|
+
|
199
|
+
@staticmethod
|
200
|
+
def from_server_args(
|
201
|
+
server_args: ServerArgs,
|
202
|
+
model_path: str = None,
|
203
|
+
model_revision: str = None,
|
204
|
+
**kwargs,
|
205
|
+
):
|
206
|
+
return ModelConfig(
|
207
|
+
model_path=model_path or server_args.model_path,
|
208
|
+
trust_remote_code=server_args.trust_remote_code,
|
209
|
+
revision=model_revision or server_args.revision,
|
210
|
+
context_length=server_args.context_length,
|
211
|
+
model_override_args=server_args.json_model_override_args,
|
212
|
+
is_embedding=server_args.is_embedding,
|
213
|
+
enable_multimodal=server_args.enable_multimodal,
|
214
|
+
dtype=server_args.dtype,
|
215
|
+
quantization=server_args.quantization,
|
216
|
+
modelopt_quant=server_args.modelopt_quant,
|
217
|
+
hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
|
218
|
+
model_impl=server_args.model_impl,
|
219
|
+
sampling_defaults=server_args.sampling_defaults,
|
220
|
+
**kwargs,
|
221
|
+
)
|
222
|
+
|
223
|
+
def _config_draft_model(self):
|
224
|
+
is_draft_model = self.is_draft_model
|
125
225
|
|
126
226
|
if (
|
127
227
|
is_draft_model
|
@@ -141,37 +241,25 @@ class ModelConfig:
|
|
141
241
|
|
142
242
|
if is_draft_model and self.hf_config.architectures[0] == "MiMoForCausalLM":
|
143
243
|
self.hf_config.architectures[0] = "MiMoMTP"
|
244
|
+
if is_draft_model and self.hf_config.architectures[0] in [
|
245
|
+
"BailingMoeV2ForCausalLM",
|
246
|
+
"BailingMoeForCausalLM",
|
247
|
+
]:
|
248
|
+
self.hf_config.architectures[0] = "BailingMoeForCausalLMNextN"
|
144
249
|
if (
|
145
250
|
is_draft_model
|
146
251
|
and self.hf_config.architectures[0] == "Ernie4_5_MoeForCausalLM"
|
147
252
|
):
|
148
253
|
self.hf_config.architectures[0] = "Ernie4_5_MoeForCausalLMMTP"
|
149
254
|
|
150
|
-
|
151
|
-
|
152
|
-
self.hf_config.
|
153
|
-
)
|
154
|
-
self.is_multimodal = enable_multimodal and is_multimodal_model(
|
155
|
-
self.hf_config.architectures
|
156
|
-
)
|
157
|
-
self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
|
158
|
-
self.hf_config.architectures
|
159
|
-
)
|
160
|
-
self.is_image_gen = enable_multimodal and is_image_gen_model(
|
161
|
-
self.hf_config.architectures
|
162
|
-
)
|
163
|
-
self.is_audio_model = enable_multimodal and is_audio_model(
|
164
|
-
self.hf_config.architectures
|
165
|
-
)
|
166
|
-
self.is_multimodal_chunked_prefill_supported = (
|
167
|
-
enable_multimodal
|
168
|
-
and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
|
169
|
-
)
|
170
|
-
self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
|
171
|
-
self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
|
255
|
+
if is_draft_model and self.hf_config.architectures[0] == "Qwen3NextForCausalLM":
|
256
|
+
self.hf_config.architectures[0] = "Qwen3NextForCausalLMMTP"
|
257
|
+
self.hf_config.num_nextn_predict_layers = 1
|
172
258
|
|
173
|
-
|
259
|
+
def _derive_context_length(self, context_length: int):
|
260
|
+
is_draft_model = self.is_draft_model
|
174
261
|
derived_context_len = get_context_length(self.hf_text_config)
|
262
|
+
|
175
263
|
if context_length is not None:
|
176
264
|
if context_length > derived_context_len:
|
177
265
|
reason = "Target model's" if is_draft_model else "User-specified"
|
@@ -180,11 +268,16 @@ class ModelConfig:
|
|
180
268
|
f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config."
|
181
269
|
)
|
182
270
|
if (
|
183
|
-
|
271
|
+
envs.SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN.get()
|
184
272
|
or is_in_ci() # FIXME: fix this special case
|
185
273
|
):
|
186
274
|
logger.warning(msg)
|
187
275
|
self.context_len = context_length
|
276
|
+
if is_draft_model:
|
277
|
+
self.hf_text_config.max_position_embeddings = context_length
|
278
|
+
logger.warning(
|
279
|
+
f"Overriding the draft model's max_position_embeddings to {context_length}."
|
280
|
+
)
|
188
281
|
else:
|
189
282
|
raise ValueError(
|
190
283
|
f"{msg} To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
|
@@ -194,6 +287,10 @@ class ModelConfig:
|
|
194
287
|
else:
|
195
288
|
self.context_len = derived_context_len
|
196
289
|
|
290
|
+
# Transfer context_len to HuggingFace config so models can access it
|
291
|
+
self.hf_config.context_len = self.context_len
|
292
|
+
|
293
|
+
def _derive_model_shapes(self):
|
197
294
|
# Unify the config keys for hf_text_config
|
198
295
|
self.head_dim = getattr(
|
199
296
|
self.hf_text_config,
|
@@ -204,10 +301,12 @@ class ModelConfig:
|
|
204
301
|
# FIXME: temporary special judge for MLA architecture
|
205
302
|
if (
|
206
303
|
"DeepseekV2ForCausalLM" in self.hf_config.architectures
|
304
|
+
or "DeepseekV32ForCausalLM" in self.hf_config.architectures
|
207
305
|
or "DeepseekV3ForCausalLM" in self.hf_config.architectures
|
208
306
|
or "DeepseekV3ForCausalLMNextN" in self.hf_config.architectures
|
209
307
|
or "LongcatFlashForCausalLM" in self.hf_config.architectures
|
210
308
|
or "LongcatFlashForCausalLMNextN" in self.hf_config.architectures
|
309
|
+
or "DotsVLMForCausalLM" in self.hf_config.architectures
|
211
310
|
):
|
212
311
|
self.head_dim = 256
|
213
312
|
self.attention_arch = AttentionArch.MLA
|
@@ -215,6 +314,11 @@ class ModelConfig:
|
|
215
314
|
self.qk_nope_head_dim = self.hf_config.qk_nope_head_dim
|
216
315
|
self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
|
217
316
|
self.v_head_dim = self.hf_config.v_head_dim
|
317
|
+
self.index_head_dim = (
|
318
|
+
get_nsa_index_head_dim(self.hf_config)
|
319
|
+
if is_deepseek_nsa(self.hf_config)
|
320
|
+
else None
|
321
|
+
)
|
218
322
|
|
219
323
|
# Handle rope scaling with yarn
|
220
324
|
self.scaling = 1 / math.sqrt(self.qk_nope_head_dim + self.qk_rope_head_dim)
|
@@ -287,37 +391,6 @@ class ModelConfig:
|
|
287
391
|
)
|
288
392
|
self.vocab_size = self.hf_text_config.vocab_size
|
289
393
|
|
290
|
-
# Verify quantization
|
291
|
-
self._verify_quantization()
|
292
|
-
|
293
|
-
# Verify dual-chunk attention config
|
294
|
-
self._verify_dual_chunk_attention_config()
|
295
|
-
|
296
|
-
# Cache attributes
|
297
|
-
self.hf_eos_token_id = self.get_hf_eos_token_id()
|
298
|
-
|
299
|
-
# multimodal
|
300
|
-
self.image_token_id = getattr(
|
301
|
-
self.hf_config, "image_token_id", None
|
302
|
-
) or getattr(self.hf_config, "image_token_index", None)
|
303
|
-
|
304
|
-
@staticmethod
|
305
|
-
def from_server_args(server_args: ServerArgs, model_path: str = None, **kwargs):
|
306
|
-
return ModelConfig(
|
307
|
-
model_path=model_path or server_args.model_path,
|
308
|
-
trust_remote_code=server_args.trust_remote_code,
|
309
|
-
revision=server_args.revision,
|
310
|
-
context_length=server_args.context_length,
|
311
|
-
model_override_args=server_args.json_model_override_args,
|
312
|
-
is_embedding=server_args.is_embedding,
|
313
|
-
enable_multimodal=server_args.enable_multimodal,
|
314
|
-
dtype=server_args.dtype,
|
315
|
-
quantization=server_args.quantization,
|
316
|
-
hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
|
317
|
-
model_impl=server_args.model_impl,
|
318
|
-
**kwargs,
|
319
|
-
)
|
320
|
-
|
321
394
|
def get_total_num_attention_heads(self) -> int:
|
322
395
|
return self.num_attention_heads
|
323
396
|
|
@@ -410,27 +483,52 @@ class ModelConfig:
|
|
410
483
|
# example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
|
411
484
|
# example: https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/tree/main
|
412
485
|
is_local = os.path.exists(self.model_path)
|
413
|
-
modelopt_quant_config = {"quant_method": "modelopt"}
|
414
486
|
if not is_local:
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
487
|
+
import huggingface_hub
|
488
|
+
|
489
|
+
try:
|
490
|
+
from huggingface_hub import HfApi, hf_hub_download
|
491
|
+
|
492
|
+
hf_api = HfApi()
|
493
|
+
if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
|
494
|
+
# Download and parse the quantization config for remote models
|
495
|
+
quant_config_file = hf_hub_download(
|
496
|
+
repo_id=self.model_path,
|
497
|
+
filename="hf_quant_config.json",
|
498
|
+
revision=self.revision,
|
499
|
+
)
|
500
|
+
with open(quant_config_file) as f:
|
501
|
+
quant_config_dict = json.load(f)
|
502
|
+
quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
|
503
|
+
except huggingface_hub.errors.OfflineModeIsEnabled:
|
504
|
+
logger.warning(
|
505
|
+
"Offline mode is enabled, skipping hf_quant_config.json check"
|
506
|
+
)
|
507
|
+
pass
|
420
508
|
elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
|
421
509
|
quant_config_file = os.path.join(
|
422
510
|
self.model_path, "hf_quant_config.json"
|
423
511
|
)
|
424
512
|
with open(quant_config_file) as f:
|
425
513
|
quant_config_dict = json.load(f)
|
426
|
-
|
427
|
-
quant_algo = json_quant_configs.get("quant_algo", None)
|
428
|
-
if quant_algo == "MIXED_PRECISION":
|
429
|
-
quant_cfg = {"quant_method": "w4afp8"}
|
430
|
-
else:
|
431
|
-
quant_cfg = modelopt_quant_config
|
514
|
+
quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
|
432
515
|
return quant_cfg
|
433
516
|
|
517
|
+
def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> dict:
|
518
|
+
"""Parse ModelOpt quantization config and return the appropriate quant_method."""
|
519
|
+
json_quant_configs = quant_config_dict["quantization"]
|
520
|
+
quant_algo = json_quant_configs.get("quant_algo", None)
|
521
|
+
|
522
|
+
if quant_algo == "MIXED_PRECISION":
|
523
|
+
return {"quant_method": "w4afp8"}
|
524
|
+
elif quant_algo and ("FP4" in quant_algo or "NVFP4" in quant_algo):
|
525
|
+
return {"quant_method": "modelopt_fp4"}
|
526
|
+
elif quant_algo and "FP8" in quant_algo:
|
527
|
+
return {"quant_method": "modelopt_fp8"}
|
528
|
+
else:
|
529
|
+
# Default to FP8 for backward compatibility
|
530
|
+
return {"quant_method": "modelopt_fp8"}
|
531
|
+
|
434
532
|
# adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
|
435
533
|
def _verify_quantization(self) -> None:
|
436
534
|
supported_quantization = [*QUANTIZATION_METHODS]
|
@@ -449,7 +547,8 @@ class ModelConfig:
|
|
449
547
|
optimized_quantization_methods = [
|
450
548
|
"fp8",
|
451
549
|
"marlin",
|
452
|
-
"
|
550
|
+
"modelopt_fp8",
|
551
|
+
"modelopt_fp4",
|
453
552
|
"gptq_marlin_24",
|
454
553
|
"gptq_marlin",
|
455
554
|
"awq_marlin",
|
@@ -543,7 +642,7 @@ class ModelConfig:
|
|
543
642
|
"sparse_attention_enabled"
|
544
643
|
] = True
|
545
644
|
|
546
|
-
def
|
645
|
+
def _get_hf_eos_token_id(self) -> Optional[Set[int]]:
|
547
646
|
eos_ids = getattr(self.hf_config, "eos_token_id", None)
|
548
647
|
if eos_ids is not None:
|
549
648
|
# it can be either int or list of int
|
@@ -563,7 +662,39 @@ class ModelConfig:
|
|
563
662
|
eos_ids = eos_ids | generation_eos_ids
|
564
663
|
return eos_ids
|
565
664
|
|
566
|
-
def
|
665
|
+
def get_default_sampling_params(self) -> dict[str, Any]:
|
666
|
+
"""
|
667
|
+
Get default sampling parameters from the model's generation config.
|
668
|
+
|
669
|
+
This method returns non-default sampling parameters from the model's
|
670
|
+
generation_config.json when sampling_defaults is set to "model".
|
671
|
+
|
672
|
+
Returns:
|
673
|
+
A dictionary containing the non-default sampling parameters.
|
674
|
+
"""
|
675
|
+
if self.sampling_defaults != "model":
|
676
|
+
return {}
|
677
|
+
|
678
|
+
if self.hf_generation_config is None:
|
679
|
+
return {}
|
680
|
+
|
681
|
+
config = self.hf_generation_config.to_dict()
|
682
|
+
|
683
|
+
available_params = [
|
684
|
+
"repetition_penalty",
|
685
|
+
"temperature",
|
686
|
+
"top_k",
|
687
|
+
"top_p",
|
688
|
+
"min_p",
|
689
|
+
]
|
690
|
+
|
691
|
+
default_sampling_params = {
|
692
|
+
p: config.get(p) for p in available_params if config.get(p) is not None
|
693
|
+
}
|
694
|
+
|
695
|
+
return default_sampling_params
|
696
|
+
|
697
|
+
def _maybe_pull_model_tokenizer_from_remote(self) -> None:
|
567
698
|
"""
|
568
699
|
Pull the model config files to a temporary
|
569
700
|
directory in case of remote.
|
@@ -706,12 +837,17 @@ multimodal_model_archs = [
|
|
706
837
|
"Qwen2AudioForConditionalGeneration",
|
707
838
|
"Qwen2VLForConditionalGeneration",
|
708
839
|
"Qwen2_5_VLForConditionalGeneration",
|
840
|
+
"Qwen3VLForConditionalGeneration",
|
841
|
+
"Qwen3VLMoeForConditionalGeneration",
|
709
842
|
"KimiVLForConditionalGeneration",
|
710
843
|
"InternVLChatModel",
|
711
844
|
"InternS1ForConditionalGeneration",
|
712
845
|
"Phi4MMForCausalLM",
|
713
846
|
"VILAForConditionalGeneration",
|
714
847
|
"Step3VLForConditionalGeneration",
|
848
|
+
"DotsVLMForCausalLM",
|
849
|
+
"DotsOCRForCausalLM",
|
850
|
+
"Sarashina2VisionForCausalLM",
|
715
851
|
]
|
716
852
|
|
717
853
|
|