sglang 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +321 -31
- sglang/bench_serving.py +10 -3
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +8 -0
- sglang/srt/configs/model_config.py +160 -105
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/constrained/base_grammar_backend.py +1 -0
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +6 -4
- sglang/srt/debug_utils/dumper.py +10 -3
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/common/conn.py +266 -98
- sglang/srt/disaggregation/decode.py +50 -9
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
- sglang/srt/disaggregation/mooncake/conn.py +51 -541
- sglang/srt/disaggregation/nixl/conn.py +148 -39
- sglang/srt/disaggregation/prefill.py +31 -14
- sglang/srt/disaggregation/utils.py +36 -5
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +135 -80
- sglang/srt/entrypoints/engine.py +23 -3
- sglang/srt/entrypoints/grpc_request_manager.py +330 -55
- sglang/srt/entrypoints/grpc_server.py +232 -102
- sglang/srt/entrypoints/http_server.py +49 -9
- sglang/srt/entrypoints/openai/protocol.py +110 -5
- sglang/srt/entrypoints/openai/serving_base.py +25 -6
- sglang/srt/entrypoints/openai/serving_chat.py +178 -49
- sglang/srt/entrypoints/openai/serving_completions.py +5 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/entrypoints/openai/serving_responses.py +42 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/expert_location.py +30 -5
- sglang/srt/function_call/function_call_parser.py +3 -2
- sglang/srt/function_call/glm4_moe_detector.py +3 -3
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +73 -68
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +60 -53
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +3 -0
- sglang/srt/layers/activation.py +7 -6
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +108 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
- sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +112 -194
- sglang/srt/layers/attention/flashinfer_mla_backend.py +11 -15
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +11 -3
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +72 -72
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +15 -98
- sglang/srt/layers/attention/mamba/mamba.py +566 -1
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/triton_backend.py +42 -9
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +178 -34
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +11 -1
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +2 -0
- sglang/srt/layers/linear.py +21 -4
- sglang/srt/layers/logits_processor.py +15 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +147 -74
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +6 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +11 -12
- sglang/srt/layers/moe/token_dispatcher/deepep.py +77 -19
- sglang/srt/layers/moe/utils.py +10 -0
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/fp8.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +1 -1
- sglang/srt/layers/quantization/modelopt_quant.py +44 -9
- sglang/srt/layers/quantization/mxfp4.py +12 -4
- sglang/srt/layers/quantization/quark/quark_moe.py +16 -3
- sglang/srt/layers/quantization/w4afp8.py +0 -4
- sglang/srt/layers/quantization/w8a8_int8.py +15 -3
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +52 -4
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +3 -3
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +10 -4
- sglang/srt/lora/lora.py +7 -5
- sglang/srt/lora/lora_manager.py +17 -6
- sglang/srt/lora/mem_pool.py +1 -1
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +7 -5
- sglang/srt/managers/cache_controller.py +42 -142
- sglang/srt/managers/data_parallel_controller.py +11 -46
- sglang/srt/managers/detokenizer_manager.py +11 -11
- sglang/srt/managers/io_struct.py +162 -118
- sglang/srt/managers/mm_utils.py +43 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +17 -17
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +167 -86
- sglang/srt/managers/schedule_policy.py +143 -16
- sglang/srt/managers/scheduler.py +359 -214
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +98 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +21 -12
- sglang/srt/managers/scheduler_profiler_mixin.py +5 -5
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +111 -5
- sglang/srt/managers/tokenizer_manager.py +84 -136
- sglang/srt/managers/tp_worker.py +39 -29
- sglang/srt/managers/tp_worker_overlap_thread.py +33 -41
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +40 -1
- sglang/srt/mem_cache/hiradix_cache.py +119 -32
- sglang/srt/mem_cache/memory_pool.py +188 -10
- sglang/srt/mem_cache/memory_pool_host.py +134 -182
- sglang/srt/mem_cache/radix_cache.py +222 -71
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +173 -58
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +10 -6
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +117 -10
- sglang/srt/mem_cache/swa_radix_cache.py +25 -34
- sglang/srt/metrics/collector.py +82 -120
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +39 -32
- sglang/srt/model_executor/forward_batch_info.py +23 -38
- sglang/srt/model_executor/model_runner.py +131 -183
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/loader.py +14 -10
- sglang/srt/model_loader/weight_utils.py +156 -2
- sglang/srt/models/bailing_moe.py +27 -4
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +536 -153
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +3 -3
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +1 -1
- sglang/srt/models/glm4v_moe.py +1 -1
- sglang/srt/models/gpt_oss.py +7 -30
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/longcat_flash.py +1 -1
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +15 -4
- sglang/srt/models/qwen2.py +0 -7
- sglang/srt/models/qwen2_5_vl.py +2 -2
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +64 -1
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +31 -3
- sglang/srt/models/qwen3_next.py +36 -9
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +2 -3
- sglang/srt/multimodal/processors/internvl.py +20 -8
- sglang/srt/multimodal/processors/qwen_vl.py +8 -1
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +20 -2
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +753 -295
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +2 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +3 -1
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -755
- sglang/srt/speculative/eagle_worker.py +57 -25
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +47 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +32 -6
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +399 -74
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +49 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +1 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +12 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +355 -4
- sglang/utils.py +10 -1
- sglang/version.py +1 -1
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +34 -25
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +281 -210
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/srt/configs/model_config.py

@@ -22,16 +22,17 @@ from typing import List, Optional, Set, Union
 import torch
 from transformers import PretrainedConfig

-from sglang.srt.hf_transformers_utils import (
+from sglang.srt.environ import envs
+from sglang.srt.layers.quantization import QUANTIZATION_METHODS
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import is_hip, retry
+from sglang.srt.utils.hf_transformers_utils import (
     get_config,
     get_context_length,
     get_generation_config,
     get_hf_text_config,
     get_sparse_attention_config,
 )
-from sglang.srt.layers.quantization import QUANTIZATION_METHODS
-from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import get_bool_env_var, is_hip
 from sglang.utils import is_in_ci

 logger = logging.getLogger(__name__)
@@ -48,6 +49,30 @@ class ModelImpl(str, Enum):
     TRANSFORMERS = "transformers"


+def is_deepseek_nsa(config: PretrainedConfig) -> bool:
+    return (
+        config.architectures is not None
+        and config.architectures[0]
+        in ["DeepseekV3ForCausalLM", "DeepseekV32ForCausalLM"]
+        and getattr(config, "index_topk", None) is not None
+    )
+
+
+def get_nsa_index_head_dim(config: PretrainedConfig) -> int:
+    assert is_deepseek_nsa(config)
+    return config.index_head_dim
+
+
+def get_nsa_index_topk(config: PretrainedConfig) -> int:
+    assert is_deepseek_nsa(config)
+    return config.index_topk
+
+
+def get_nsa_index_n_heads(config: PretrainedConfig) -> int:
+    assert is_deepseek_nsa(config)
+    return config.index_n_heads
+
+
 class ModelConfig:
     def __init__(
         self,
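The NSA helpers above key off two checkpoint properties: the architecture name and the presence of `index_topk`. A minimal sketch of how they behave, using a `SimpleNamespace` stand-in for `PretrainedConfig` (the attribute values are illustrative, not taken from any real checkpoint):

```python
from types import SimpleNamespace

from sglang.srt.configs.model_config import (
    get_nsa_index_head_dim,
    get_nsa_index_topk,
    is_deepseek_nsa,
)

# Illustrative stand-in for a transformers PretrainedConfig; only the
# attributes the helpers actually read are populated.
nsa_cfg = SimpleNamespace(
    architectures=["DeepseekV32ForCausalLM"],
    index_topk=2048,  # hypothetical value
    index_head_dim=128,  # hypothetical value
    index_n_heads=64,  # hypothetical value
)
dense_cfg = SimpleNamespace(architectures=["LlamaForCausalLM"])

assert is_deepseek_nsa(nsa_cfg)  # NSA architecture and index_topk present
assert not is_deepseek_nsa(dense_cfg)  # wrong arch -> dense attention path

# The getters assert is_deepseek_nsa() first, so they are only safe to
# call once the check above has passed.
print(get_nsa_index_head_dim(nsa_cfg))  # 128
print(get_nsa_index_topk(nsa_cfg))  # 2048
```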
@@ -64,35 +89,20 @@ class ModelConfig:
         is_draft_model: bool = False,
         hybrid_kvcache_ratio: Optional[float] = None,
         model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
-        tp_rank: Optional[int] = None,
-        remote_instance_weight_loader_seed_instance_ip: Optional[str] = None,
-        remote_instance_weight_loader_seed_instance_service_port: Optional[int] = None,
-        remote_instance_weight_loader_send_weights_group_ports: Optional[
-            List[int]
-        ] = None,
     ) -> None:
         # Parse args
         self.model_path = model_path
         self.revision = revision
         self.quantization = quantization
+        self.is_draft_model = is_draft_model
         self.model_impl = model_impl
-        self.tp_rank = tp_rank
-        self.remote_instance_weight_loader_seed_instance_ip = (
-            remote_instance_weight_loader_seed_instance_ip
-        )
-        self.remote_instance_weight_loader_seed_instance_service_port = (
-            remote_instance_weight_loader_seed_instance_service_port
-        )
-        self.remote_instance_weight_loader_send_weights_group_ports = (
-            remote_instance_weight_loader_send_weights_group_ports
-        )

-
+        # Get hf config
+        self._maybe_pull_model_tokenizer_from_remote()
         self.model_override_args = json.loads(model_override_args)
         kwargs = {}
         if override_config_file and override_config_file.strip():
             kwargs["_configuration_file"] = override_config_file.strip()
-
         self.hf_config = get_config(
             self.model_path,
             trust_remote_code=trust_remote_code,
@@ -100,7 +110,7 @@ class ModelConfig:
             model_override_args=self.model_override_args,
             **kwargs,
         )
-
+        self.hf_text_config = get_hf_text_config(self.hf_config)
         self.hf_generation_config = get_generation_config(
             self.model_path,
             trust_remote_code=trust_remote_code,
@@ -108,7 +118,25 @@ class ModelConfig:
             **kwargs,
         )

-
+        # Set enable_multimodal
+        if enable_multimodal is None:
+            mm_disabled_models = [
+                "Gemma3ForConditionalGeneration",
+                "Llama4ForConditionalGeneration",
+                "Step3VLForConditionalGeneration",
+            ]
+            if self.hf_config.architectures[0] in mm_disabled_models:
+                enable_multimodal = False
+                logger.info(
+                    f"Multimodal is disabled for {self.hf_config.model_type}. To enable it, set --enable-multimodal."
+                )
+            else:
+                enable_multimodal = True
+
+        # Config draft model
+        self._config_draft_model()
+
+        # Check model type
         self.attention_chunk_size = getattr(
             self.hf_text_config, "attention_chunk_size", None
         )
@@ -124,20 +152,70 @@ class ModelConfig:
                 self.hf_config.architectures, self.hf_text_config.num_hidden_layers
             )
         )
+        self.is_generation = is_generation_model(
+            self.hf_config.architectures, is_embedding
+        )
+        self.is_multimodal = enable_multimodal and is_multimodal_model(
+            self.hf_config.architectures
+        )
+        self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
+            self.hf_config.architectures
+        )
+        self.is_image_gen = enable_multimodal and is_image_gen_model(
+            self.hf_config.architectures
+        )
+        self.is_audio_model = enable_multimodal and is_audio_model(
+            self.hf_config.architectures
+        )
+        self.is_multimodal_chunked_prefill_supported = (
+            enable_multimodal
+            and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
+        )
+        self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
+        self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)

-        if enable_multimodal is None:
-            mm_disabled_models = [
-                "Gemma3ForConditionalGeneration",
-                "Llama4ForConditionalGeneration",
-                "Step3VLForConditionalGeneration",
-            ]
-            if self.hf_config.architectures[0] in mm_disabled_models:
-                enable_multimodal = False
-                logger.info(
-                    f"Multimodal is disabled for {self.hf_config.model_type}. To enable it, set --enable-multimodal."
-                )
-            else:
-                enable_multimodal = True
+        # Derive context length and model shapes
+        self._derive_context_length(context_length)
+        self._derive_model_shapes()
+
+        # Verify quantization
+        self._verify_quantization()
+
+        # Verify dual-chunk attention config
+        self._verify_dual_chunk_attention_config()
+
+        # Cache attributes
+        self.hf_eos_token_id = self._get_hf_eos_token_id()
+
+        # multimodal
+        self.image_token_id = getattr(
+            self.hf_config, "image_token_id", None
+        ) or getattr(self.hf_config, "image_token_index", None)
+
+    @staticmethod
+    def from_server_args(
+        server_args: ServerArgs,
+        model_path: str = None,
+        model_revision: str = None,
+        **kwargs,
+    ):
+        return ModelConfig(
+            model_path=model_path or server_args.model_path,
+            trust_remote_code=server_args.trust_remote_code,
+            revision=model_revision or server_args.revision,
+            context_length=server_args.context_length,
+            model_override_args=server_args.json_model_override_args,
+            is_embedding=server_args.is_embedding,
+            enable_multimodal=server_args.enable_multimodal,
+            dtype=server_args.dtype,
+            quantization=server_args.quantization,
+            hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
+            model_impl=server_args.model_impl,
+            **kwargs,
+        )
+
+    def _config_draft_model(self):
+        is_draft_model = self.is_draft_model

         if (
             is_draft_model
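The new `from_server_args` constructor (added above and removed from its old position further down) centralizes the `ServerArgs` → `ModelConfig` mapping. A usage sketch; the model paths are placeholders:

```python
from sglang.srt.configs.model_config import ModelConfig
from sglang.srt.server_args import ServerArgs

# Placeholder arguments; a real deployment would set many more fields.
args = ServerArgs(model_path="meta-llama/Llama-3.1-8B-Instruct")

# Main model config, built directly from the parsed server args.
model_config = ModelConfig.from_server_args(args)

# Extra keyword arguments are forwarded to ModelConfig.__init__, so a
# draft model can reuse the same server args with a different path.
draft_config = ModelConfig.from_server_args(
    args,
    model_path="example-org/draft-model",  # hypothetical checkpoint
    is_draft_model=True,
)
```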
@@ -172,31 +250,10 @@ class ModelConfig:
             self.hf_config.architectures[0] = "Qwen3NextForCausalLMMTP"
             self.hf_config.num_nextn_predict_layers = 1

-
-        self.is_generation = is_generation_model(
-            self.hf_config.architectures, is_embedding
-        )
-        self.is_multimodal = enable_multimodal and is_multimodal_model(
-            self.hf_config.architectures
-        )
-        self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
-            self.hf_config.architectures
-        )
-        self.is_image_gen = enable_multimodal and is_image_gen_model(
-            self.hf_config.architectures
-        )
-        self.is_audio_model = enable_multimodal and is_audio_model(
-            self.hf_config.architectures
-        )
-        self.is_multimodal_chunked_prefill_supported = (
-            enable_multimodal
-            and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
-        )
-        self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
-        self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
-
-        # Derive context length
+    def _derive_context_length(self, context_length: int):
+        is_draft_model = self.is_draft_model
         derived_context_len = get_context_length(self.hf_text_config)
+
         if context_length is not None:
             if context_length > derived_context_len:
                 reason = "Target model's" if is_draft_model else "User-specified"
@@ -205,11 +262,16 @@ class ModelConfig:
                     f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config."
                 )
                 if (
-                    get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN")
+                    envs.SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN.get()
                     or is_in_ci()  # FIXME: fix this special case
                 ):
                     logger.warning(msg)
                     self.context_len = context_length
+                    if is_draft_model:
+                        self.hf_text_config.max_position_embeddings = context_length
+                        logger.warning(
+                            f"Overriding the draft model's max_position_embeddings to {context_length}."
+                        )
                 else:
                     raise ValueError(
                         f"{msg} To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
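The raw `get_bool_env_var(...)` call gives way to a typed accessor from the new `sglang/srt/environ.py` module (see the file list above, +285 lines). The sketch below shows one plausible shape for such an accessor; it is an assumption for illustration, not the actual `environ.py` implementation:

```python
import os


class EnvBool:
    """Plausible shape of a typed boolean env var, illustrating calls like
    envs.SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN.get(); the real class in
    sglang/srt/environ.py may differ."""

    def __init__(self, name: str, default: bool = False):
        self.name = name
        self.default = default

    def get(self) -> bool:
        raw = os.environ.get(self.name)
        if raw is None:
            return self.default
        return raw.strip().lower() in ("1", "true", "yes", "on")


SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN = EnvBool(
    "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"
)

os.environ["SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"] = "1"
assert SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN.get()
```

Compared with ad-hoc boolean parsing at each call site, a module of declared accessors gives one greppable registry of every environment variable the runtime reads.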
@@ -219,6 +281,10 @@ class ModelConfig:
         else:
             self.context_len = derived_context_len

+        # Transfer context_len to HuggingFace config so models can access it
+        self.hf_config.context_len = self.context_len
+
+    def _derive_model_shapes(self):
         # Unify the config keys for hf_text_config
         self.head_dim = getattr(
             self.hf_text_config,
@@ -229,6 +295,7 @@ class ModelConfig:
         # FIXME: temporary special judge for MLA architecture
         if (
             "DeepseekV2ForCausalLM" in self.hf_config.architectures
+            or "DeepseekV32ForCausalLM" in self.hf_config.architectures
             or "DeepseekV3ForCausalLM" in self.hf_config.architectures
             or "DeepseekV3ForCausalLMNextN" in self.hf_config.architectures
             or "LongcatFlashForCausalLM" in self.hf_config.architectures
@@ -241,6 +308,11 @@ class ModelConfig:
             self.qk_nope_head_dim = self.hf_config.qk_nope_head_dim
             self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
             self.v_head_dim = self.hf_config.v_head_dim
+            self.index_head_dim = (
+                get_nsa_index_head_dim(self.hf_config)
+                if is_deepseek_nsa(self.hf_config)
+                else None
+            )

             # Handle rope scaling with yarn
             self.scaling = 1 / math.sqrt(self.qk_nope_head_dim + self.qk_rope_head_dim)
@@ -313,45 +385,6 @@ class ModelConfig:
         )
         self.vocab_size = self.hf_text_config.vocab_size

-        # Verify quantization
-        self._verify_quantization()
-
-        # Verify dual-chunk attention config
-        self._verify_dual_chunk_attention_config()
-
-        # Cache attributes
-        self.hf_eos_token_id = self.get_hf_eos_token_id()
-
-        # multimodal
-        self.image_token_id = getattr(
-            self.hf_config, "image_token_id", None
-        ) or getattr(self.hf_config, "image_token_index", None)
-
-    @staticmethod
-    def from_server_args(
-        server_args: ServerArgs,
-        model_path: str = None,
-        model_revision: str = None,
-        **kwargs,
-    ):
-        return ModelConfig(
-            model_path=model_path or server_args.model_path,
-            trust_remote_code=server_args.trust_remote_code,
-            revision=model_revision or server_args.revision,
-            context_length=server_args.context_length,
-            model_override_args=server_args.json_model_override_args,
-            is_embedding=server_args.is_embedding,
-            enable_multimodal=server_args.enable_multimodal,
-            dtype=server_args.dtype,
-            quantization=server_args.quantization,
-            hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
-            model_impl=server_args.model_impl,
-            remote_instance_weight_loader_seed_instance_ip=server_args.remote_instance_weight_loader_seed_instance_ip,
-            remote_instance_weight_loader_seed_instance_service_port=server_args.remote_instance_weight_loader_seed_instance_service_port,
-            remote_instance_weight_loader_send_weights_group_ports=server_args.remote_instance_weight_loader_send_weights_group_ports,
-            **kwargs,
-        )
-
     def get_total_num_attention_heads(self) -> int:
         return self.num_attention_heads

@@ -452,13 +485,31 @@ class ModelConfig:
                 from huggingface_hub import HfApi

                 hf_api = HfApi()
-                if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
+
+                def check_hf_quant_config():
+                    return hf_api.file_exists(
+                        self.model_path, "hf_quant_config.json"
+                    )
+
+                # Retry HF API call up to 3 times
+                file_exists = retry(
+                    check_hf_quant_config,
+                    max_retry=2,
+                    initial_delay=1.0,
+                    max_delay=5.0,
+                )
+
+                if file_exists:
                     quant_cfg = modelopt_quant_config
+
             except huggingface_hub.errors.OfflineModeIsEnabled:
                 logger.warning(
                     "Offline mode is enabled, skipping hf_quant_config.json check"
                 )
-
+            except Exception as e:
+                logger.warning(
+                    f"Failed to check hf_quant_config.json: {self.model_path} {e}"
+                )

         elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
             quant_config_file = os.path.join(
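The HF Hub probe is now routed through `retry(...)` from `sglang.srt.utils`. Judging only from the call site (`max_retry=2`, `initial_delay=1.0`, `max_delay=5.0`, and the "up to 3 times" comment), the semantics are presumably capped exponential backoff. A minimal sketch under that assumption, not the actual `sglang.srt.utils.retry`:

```python
import random
import time


def retry(fn, max_retry: int, initial_delay: float, max_delay: float):
    """One initial attempt plus max_retry retries, sleeping with capped
    exponential backoff and jitter between failures."""
    for attempt in range(max_retry + 1):
        try:
            return fn()
        except Exception:
            if attempt == max_retry:
                raise  # out of retries; surface the last error
            delay = min(initial_delay * (2**attempt), max_delay)
            time.sleep(delay * random.uniform(0.5, 1.5))


# Usage mirroring the call site above: three attempts in total.
outcomes = iter([RuntimeError("transient"), RuntimeError("transient"), True])


def flaky_check():
    result = next(outcomes)
    if isinstance(result, Exception):
        raise result
    return result


assert retry(flaky_check, max_retry=2, initial_delay=0.01, max_delay=0.05) is True
```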
@@ -586,7 +637,7 @@ class ModelConfig:
                     "sparse_attention_enabled"
                 ] = True

-    def get_hf_eos_token_id(self) -> Optional[Set[int]]:
+    def _get_hf_eos_token_id(self) -> Optional[Set[int]]:
         eos_ids = getattr(self.hf_config, "eos_token_id", None)
         if eos_ids is not None:
             # it can be either int or list of int
@@ -606,7 +657,7 @@ class ModelConfig:
                 eos_ids = eos_ids | generation_eos_ids
             return eos_ids

-    def maybe_pull_model_tokenizer_from_remote(self) -> None:
+    def _maybe_pull_model_tokenizer_from_remote(self) -> None:
         """
         Pull the model config files to a temporary
         directory in case of remote.
@@ -749,6 +800,8 @@ multimodal_model_archs = [
     "Qwen2AudioForConditionalGeneration",
     "Qwen2VLForConditionalGeneration",
     "Qwen2_5_VLForConditionalGeneration",
+    "Qwen3VLForConditionalGeneration",
+    "Qwen3VLMoeForConditionalGeneration",
     "KimiVLForConditionalGeneration",
     "InternVLChatModel",
     "InternS1ForConditionalGeneration",
@@ -756,6 +809,8 @@ multimodal_model_archs = [
     "VILAForConditionalGeneration",
     "Step3VLForConditionalGeneration",
     "DotsVLMForCausalLM",
+    "DotsOCRForCausalLM",
+    "Sarashina2VisionForCausalLM",
 ]


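These two list extensions register the new Qwen3-VL, DotsOCR, and Sarashina2 vision architectures for multimodal handling. `multimodal_model_archs` is a module-level list consulted by architecture checks on `hf_config.architectures`; a sketch of that membership test (the helper name here is illustrative, the real `is_multimodal_model` check may differ in detail):

```python
from sglang.srt.configs.model_config import multimodal_model_archs


def is_multimodal_arch(architectures: list[str]) -> bool:
    # A model is treated as multimodal if any declared architecture
    # appears in the registry list extended by this release.
    return any(arch in multimodal_model_archs for arch in architectures)


assert is_multimodal_arch(["Qwen3VLForConditionalGeneration"])
assert not is_multimodal_arch(["Qwen3ForCausalLM"])  # text-only model
```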