sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +10 -8
- sglang/bench_one_batch.py +7 -6
- sglang/bench_one_batch_server.py +157 -21
- sglang/bench_serving.py +137 -59
- sglang/compile_deep_gemm.py +5 -5
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +78 -78
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +2 -2
- sglang/srt/configs/model_config.py +40 -28
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -43
- sglang/srt/conversation.py +49 -44
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +129 -135
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +3 -13
- sglang/srt/disaggregation/kv_events.py +357 -0
- sglang/srt/disaggregation/mini_lb.py +57 -24
- sglang/srt/disaggregation/mooncake/conn.py +238 -122
- sglang/srt/disaggregation/mooncake/transfer_engine.py +2 -1
- sglang/srt/disaggregation/nixl/conn.py +10 -19
- sglang/srt/disaggregation/prefill.py +132 -47
- sglang/srt/disaggregation/utils.py +123 -6
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +5 -0
- sglang/srt/entrypoints/engine.py +44 -9
- sglang/srt/entrypoints/http_server.py +23 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +250 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +157 -0
- sglang/srt/function_call/ebnf_composer.py +234 -0
- sglang/srt/function_call/function_call_parser.py +175 -0
- sglang/srt/function_call/llama32_detector.py +74 -0
- sglang/srt/function_call/mistral_detector.py +84 -0
- sglang/srt/function_call/pythonic_detector.py +163 -0
- sglang/srt/function_call/qwen25_detector.py +67 -0
- sglang/srt/function_call/utils.py +35 -0
- sglang/srt/hf_transformers_utils.py +46 -7
- sglang/srt/layers/attention/aiter_backend.py +513 -0
- sglang/srt/layers/attention/flashattention_backend.py +64 -18
- sglang/srt/layers/attention/flashinfer_mla_backend.py +8 -4
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/triton_backend.py +3 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/utils.py +6 -4
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +451 -0
- sglang/srt/layers/dp_attention.py +61 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/cutlass_moe.py +207 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +34 -12
- sglang/srt/layers/moe/ep_moe/layer.py +105 -51
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +82 -7
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -0
- sglang/srt/layers/moe/topk.py +67 -10
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +8 -3
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/deep_gemm.py +77 -74
- sglang/srt/layers/quantization/fp8.py +92 -2
- sglang/srt/layers/quantization/fp8_kernel.py +3 -3
- sglang/srt/layers/quantization/fp8_utils.py +6 -0
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +20 -7
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +2 -4
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/deepseek_eplb.py +278 -0
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/eplb_manager.py +55 -0
- sglang/srt/managers/expert_distribution.py +704 -56
- sglang/srt/managers/expert_location.py +394 -0
- sglang/srt/managers/expert_location_dispatch.py +91 -0
- sglang/srt/managers/io_struct.py +19 -4
- sglang/srt/managers/mm_utils.py +294 -140
- sglang/srt/managers/multimodal_processors/base_processor.py +127 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +31 -6
- sglang/srt/managers/multimodal_processors/internvl.py +14 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +7 -6
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/minicpm.py +25 -31
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +58 -16
- sglang/srt/managers/schedule_batch.py +122 -42
- sglang/srt/managers/schedule_policy.py +1 -5
- sglang/srt/managers/scheduler.py +205 -138
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +232 -58
- sglang/srt/managers/tp_worker.py +12 -9
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +76 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +314 -39
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +29 -19
- sglang/srt/model_executor/expert_location_updater.py +422 -0
- sglang/srt/model_executor/forward_batch_info.py +5 -1
- sglang/srt/model_executor/model_runner.py +163 -68
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_janus_pro.py +2 -2
- sglang/srt/models/deepseek_v2.py +308 -351
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_mm.py +70 -33
- sglang/srt/models/llama.py +2 -0
- sglang/srt/models/llama4.py +15 -8
- sglang/srt/models/llava.py +258 -7
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +5 -12
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/qwen2.py +95 -26
- sglang/srt/models/qwen2_5_vl.py +8 -0
- sglang/srt/models/qwen2_moe.py +330 -60
- sglang/srt/models/qwen2_vl.py +6 -0
- sglang/srt/models/qwen3.py +52 -10
- sglang/srt/models/qwen3_moe.py +411 -48
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/openai_api/adapter.py +58 -20
- sglang/srt/openai_api/protocol.py +6 -8
- sglang/srt/operations.py +154 -0
- sglang/srt/operations_strategy.py +31 -0
- sglang/srt/reasoning_parser.py +3 -3
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +4 -56
- sglang/srt/sampling/sampling_params.py +2 -2
- sglang/srt/server_args.py +162 -22
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +138 -7
- sglang/srt/speculative/eagle_worker.py +69 -21
- sglang/srt/utils.py +74 -17
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +55 -14
- sglang/utils.py +3 -3
- sglang/version.py +1 -1
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/METADATA +23 -13
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/RECORD +178 -149
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/top_level.txt +0 -0
sglang/lang/chat_template.py
CHANGED
@@ -1,3 +1,4 @@
+import re
 from dataclasses import dataclass
 from enum import Enum, auto
 from typing import Callable, Dict, List, Tuple
@@ -71,9 +72,9 @@ def get_chat_template(name):
 
 def get_chat_template_by_model_path(model_path):
     for matching_func in matching_function_registry:
-
-        if
-        return
+        template_name = matching_func(model_path)
+        if template_name is not None:
+            return get_chat_template(template_name)
     return get_chat_template("default")
 
 
@@ -193,6 +194,21 @@ register_chat_template(
     )
 )
 
+# Reference: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/blob/main/chat_template.json
+register_chat_template(
+    ChatTemplate(
+        name="mistral",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("[SYSTEM_PROMPT] ", " [/SYSTEM_PROMPT]"),
+            "user": ("[INST] ", " [/INST]"),
+            "assistant": ("", " </s><s>"),
+        },
+        stop_str=("</s>",),
+        image_token="[IMG]",
+    )
+)
+
 register_chat_template(
     ChatTemplate(
         name="llama-3-instruct",
@@ -479,134 +495,118 @@ register_chat_template(
 
 @register_chat_template_matching_function
 def match_deepseek(model_path: str):
-    if (
-        "
-    )
-        return
+    if re.search(r"deepseek-(v3|r1)", model_path, re.IGNORECASE) and not re.search(
+        r"base", model_path, re.IGNORECASE
+    ):
+        return "deepseek-v3"
 
 
 @register_chat_template_matching_function
 def match_deepseek_janus_pro(model_path: str):
-    if "janus"
-        return
+    if re.search(r"janus", model_path, re.IGNORECASE):
+        return "janus-pro"
 
 
 @register_chat_template_matching_function
 def match_dbrx(model_path: str):
-    if "dbrx"
-
+    if re.search(r"dbrx", model_path, re.IGNORECASE) and re.search(
+        r"instruct", model_path, re.IGNORECASE
+    ):
+        return "dbrx-instruct"
 
 
 @register_chat_template_matching_function
 def match_vicuna(model_path: str):
-    if "vicuna"
-        return
-    if "llava-v1.5" in model_path.lower():
-        return get_chat_template("vicuna_v1.1")
-    if "llava-next-video-7b" in model_path.lower():
-        return get_chat_template("vicuna_v1.1")
+    if re.search(r"vicuna|llava-v1\.5|llava-next-video-7b", model_path, re.IGNORECASE):
+        return "vicuna_v1.1"
 
 
 @register_chat_template_matching_function
 def match_llama2_chat(model_path: str):
-
-
-
-
-
-
-
-
-
+    if re.search(
+        r"llama-2.*chat|codellama.*instruct",
+        model_path,
+        re.IGNORECASE,
+    ):
+        return "llama-2-chat"
+
+
+@register_chat_template_matching_function
+def match_mistral(model_path: str):
+    if re.search(r"pixtral|(mistral|mixtral).*instruct", model_path, re.IGNORECASE):
+        return "mistral"
 
 
 @register_chat_template_matching_function
 def match_llama3_instruct(model_path: str):
-
-
-    return get_chat_template("llama-3-instruct")
+    if re.search(r"llama-3.*instruct", model_path, re.IGNORECASE):
+        return "llama-3-instruct"
 
 
 @register_chat_template_matching_function
 def match_chat_ml(model_path: str):
-
-
-    if "
-        return
-
-
-    return get_chat_template("qwen2-vl")
-    if "qwen" in model_path:
-        if "vl" in model_path:
-            return get_chat_template("qwen2-vl")
-        if ("chat" in model_path or "instruct" in model_path) and (
-            "llava" not in model_path
-        ):
-            return get_chat_template("qwen")
-    if (
-        "llava-v1.6-34b" in model_path
-        or "llava-v1.6-yi-34b" in model_path
-        or "llava-next-video-34b" in model_path
-        or "llava-onevision-qwen2" in model_path
+    if re.search(r"tinyllama", model_path, re.IGNORECASE):
+        return "chatml"
+    if re.search(r"qwen.*vl", model_path, re.IGNORECASE):
+        return "qwen2-vl"
+    if re.search(r"qwen.*(chat|instruct)", model_path, re.IGNORECASE) and not re.search(
+        r"llava", model_path, re.IGNORECASE
     ):
-        return
+        return "qwen"
+    if re.search(
+        r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2",
+        model_path,
+        re.IGNORECASE,
+    ):
+        return "chatml-llava"
 
 
 @register_chat_template_matching_function
 def match_chat_yi(model_path: str):
-
-
-
-
-
+    if re.search(r"yi-vl", model_path, re.IGNORECASE) and not re.search(
+        r"llava", model_path, re.IGNORECASE
+    ):
+        return "yi-vl"
+    elif re.search(r"yi-1\.5.*chat", model_path, re.IGNORECASE):
+        return "yi-1.5"
 
 
 @register_chat_template_matching_function
 def match_gemma_it(model_path: str):
-
-
-    return get_chat_template("gemma-it")
+    if re.search(r"gemma.*it", model_path, re.IGNORECASE):
+        return "gemma-it"
 
 
 @register_chat_template_matching_function
 def match_openbmb_minicpm(model_path: str):
-
-
-
-
-    return get_chat_template("minicpmo")
+    if re.search(r"minicpm-v", model_path, re.IGNORECASE):
+        return "minicpmv"
+    elif re.search(r"minicpm-o", model_path, re.IGNORECASE):
+        return "minicpmo"
 
 
 @register_chat_template_matching_function
 def match_c4ai_command_r(model_path: str):
-
-
-    return get_chat_template("c4ai-command-r")
+    if re.search(r"c4ai-command-r", model_path, re.IGNORECASE):
+        return "c4ai-command-r"
 
 
 @register_chat_template_matching_function
 def match_granite_instruct(model_path: str):
-
-
-    # need to be updated. For now, assume that the Granite 3.0
-    # template works across the board.
-    if "granite" in model_path and "instruct" in model_path:
-        return get_chat_template("granite-3-instruct")
+    if re.search(r"granite.*instruct", model_path, re.IGNORECASE):
+        return "granite-3-instruct"
 
 
 @register_chat_template_matching_function
 def match_gemma3_instruct(model_path: str):
-
-
-    # gemma-3-1b-it is completion model
-    return get_chat_template("gemma-it")
+    if re.search(r"gemma-3", model_path, re.IGNORECASE):
+        return "gemma-it"
 
 
 @register_chat_template_matching_function
 def match_internvl_chat(model_path: str):
-
-
-    return get_chat_template("internvl-2-5")
+    if re.search(r"internvl2_5", model_path, re.IGNORECASE):
+        return "internvl-2-5"
 
 
 if __name__ == "__main__":
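Net effect of the refactor above: matching functions now return the name of a registered template (or None) and get_chat_template_by_model_path resolves it once via get_chat_template, with all path matching done through case-insensitive regexes instead of chains of lowercased substring checks. A minimal sketch of a custom matcher under the new convention (the matcher name and model path are hypothetical; "chatml" is assumed to remain a registered template name):

    import re

    from sglang.lang.chat_template import (
        get_chat_template_by_model_path,
        register_chat_template_matching_function,
    )

    @register_chat_template_matching_function
    def match_my_model(model_path: str):
        # Return a registered template name (or None); resolution to a
        # ChatTemplate object now happens inside get_chat_template_by_model_path.
        if re.search(r"my-model.*instruct", model_path, re.IGNORECASE):
            return "chatml"

    print(get_chat_template_by_model_path("my-org/my-model-7b-instruct").name)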
sglang/lang/tracer.py
CHANGED
@@ -38,7 +38,7 @@ def extract_prefix_by_tracing(program, backend):
         with TracingScope(tracer):
             tracer.ret_value = program.func(tracer, **arguments)
     except (StopTracing, TypeError, AttributeError):
-        # Some exceptions may not be
+        # Some exceptions may not be caught
         pass
 
     # Run and cache prefix
sglang/srt/configs/deepseekvl2.py
CHANGED
@@ -416,9 +416,9 @@ class DeepseekVLV2Processor(ProcessorMixin):
         h = w = math.ceil(
             (self.image_size // self.patch_size) / self.downsample_ratio
         )
-        # global views tokens h * (w + 1), 1 is for line
+        # global views tokens h * (w + 1), 1 is for line separator
         tokenized_image = [self.image_token_id] * h * (w + 1)
-        # add a
+        # add a separator between global and local views
         tokenized_image += [self.image_token_id]
         # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
         tokenized_image += (
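The two comment fixes above make the token layout explicit: each global-view row of w image tokens is followed by one line-separator token, a single separator token sits between the global and local views, and the local views repeat the per-row pattern across tiles. A worked count with illustrative values (not DeepSeek-VL2's actual configuration):

    import math

    # Illustrative values only -- not the model's real configuration.
    image_size, patch_size, downsample_ratio = 384, 16, 2
    num_height_tiles, num_width_tiles = 2, 2

    h = w = math.ceil((image_size // patch_size) / downsample_ratio)   # 12

    global_tokens = h * (w + 1)                                        # 12 * 13 = 156
    view_separator = 1                                                 # between global and local views
    local_tokens = (num_height_tiles * h) * (num_width_tiles * w + 1)  # 24 * 25 = 600
    print(global_tokens + view_separator + local_tokens)               # 757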
sglang/srt/configs/model_config.py
CHANGED
@@ -22,7 +22,11 @@ from typing import List, Optional, Set, Union
 import torch
 from transformers import PretrainedConfig
 
-from sglang.srt.hf_transformers_utils import
+from sglang.srt.hf_transformers_utils import (
+    get_config,
+    get_context_length,
+    get_hf_text_config,
+)
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import get_bool_env_var, is_hip
@@ -69,6 +73,7 @@ class ModelConfig:
             model_override_args=self.model_override_args,
             **kwargs,
         )
+
         self.hf_text_config = get_hf_text_config(self.hf_config)
         self.attention_chunk_size = getattr(
             self.hf_text_config, "attention_chunk_size", None
@@ -93,6 +98,8 @@ class ModelConfig:
         ):
             self.hf_config.architectures[0] = "DeepseekV3ForCausalLMNextN"
 
+        if is_draft_model and self.hf_config.architectures[0] == "MiMoForCausalLM":
+            self.hf_config.architectures[0] = "MiMoMTP"
         # Check model type
         self.is_generation = is_generation_model(
             self.hf_config.architectures, is_embedding
@@ -109,6 +116,10 @@ class ModelConfig:
         self.is_audio_model = enable_multimodal and is_audio_model(
             self.hf_config.architectures
         )
+        self.is_multimodal_chunked_prefill_supported = (
+            enable_multimodal
+            and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
+        )
         self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
 
@@ -209,7 +220,13 @@ class ModelConfig:
 
         # Cache attributes
         self.hf_eos_token_id = self.get_hf_eos_token_id()
-
+
+        config = self.hf_config
+
+        # multimodal
+        self.image_token_id = getattr(config, "image_token_id", None) or getattr(
+            config, "image_token_index", None
+        )
 
     @staticmethod
     def from_server_args(server_args: ServerArgs, model_path: str = None, **kwargs):
@@ -332,6 +349,7 @@ class ModelConfig:
             "w8a8_int8",
             "w8a8_fp8",
             "moe_wna16",
+            "qoq",
         ]
         compatible_quantization_methods = {
             "modelopt_fp4": ["modelopt"],
@@ -423,31 +441,6 @@ class ModelConfig:
             self.model_path = client.get_local_dir()
 
 
-def get_hf_text_config(config: PretrainedConfig):
-    """Get the "sub" config relevant to llm for multi modal models.
-    No op for pure text models.
-    """
-    class_name = config.architectures[0]
-    if class_name.startswith("Llava") and class_name.endswith("ForCausalLM"):
-        # We support non-hf version of llava models, so we do not want to
-        # read the wrong values from the unused default text_config.
-        # NOTE(HandH1998): We set `torch_dtype` of config to `torch.float16` for the weights, as
-        # `torch.float16` is default used for image features in `python/sglang/srt/models/llava.py`.
-        setattr(config, "torch_dtype", torch.float16)
-        return config
-
-    if hasattr(config, "text_config"):
-        # The code operates under the assumption that text_config should have
-        # `num_attention_heads` (among others). Assert here to fail early
-        # if transformers config doesn't align with this assumption.
-        assert hasattr(config.text_config, "num_attention_heads")
-        return config.text_config
-    if hasattr(config, "language_config"):
-        return config.language_config
-    else:
-        return config
-
-
 # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
 _STR_DTYPE_TO_TORCH_DTYPE = {
     "half": torch.float16,
@@ -466,6 +459,8 @@ def _get_and_verify_dtype(
     # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
     # because config.torch_dtype can be None.
     config_dtype = getattr(config, "torch_dtype", None)
+    if isinstance(config_dtype, str):
+        config_dtype = _STR_DTYPE_TO_TORCH_DTYPE.get(config_dtype, None)
     if config_dtype is None:
         config_dtype = torch.float32
 
@@ -537,6 +532,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
 
 
 multimodal_model_archs = [
+    "CLIPModel",
     "DeepseekVL2ForCausalLM",
     "Gemma3ForConditionalGeneration",
     "Grok1VForCausalLM",
@@ -545,14 +541,15 @@ multimodal_model_archs = [
     "Llama4ForConditionalGeneration",
     "LlavaMistralForCausalLM",
     "LlavaQwenForCausalLM",
+    "LlavaForConditionalGeneration",
     "LlavaVidForCausalLM",
     "MiniCPMO",
     "MiniCPMV",
+    "Mistral3ForConditionalGeneration",
     "MultiModalityCausalLM",
     "MllamaForConditionalGeneration",
     "Qwen2VLForConditionalGeneration",
     "Qwen2_5_VLForConditionalGeneration",
-    "CLIPModel",
     "KimiVLForConditionalGeneration",
     "InternVLChatModel",
 ]
@@ -584,6 +581,21 @@ def is_encoder_decoder_model(model_architectures: List[str]):
     return "MllamaForConditionalGeneration" in model_architectures
 
 
+def is_multimodal_chunked_prefill_supported(model_architectures: List[str]):
+    """Check if chunked prefill is supported for a MultiModal model."""
+    unsupported = [
+        "Grok1VForCausalLM",
+        "Grok1AForCausalLM",
+        "LlavaLlamaForCausalLM",
+        "MllamaForConditionalGeneration",
+        "CLIPModel",
+    ]
+    if any(multi_model_arch in unsupported for multi_model_arch in model_architectures):
+        return False
+    else:
+        return True
+
+
 def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
     if scale <= 1:
         return 1.0
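Among the changes above, the _get_and_verify_dtype fix covers configs whose torch_dtype arrives as a string rather than a torch.dtype. A minimal sketch of the normalization path; only the "half" entry of _STR_DTYPE_TO_TORCH_DTYPE is visible in the diff context, so the other entries here are assumptions:

    import torch

    # Stand-in for _STR_DTYPE_TO_TORCH_DTYPE; entries other than "half"
    # are assumed for illustration.
    _STR_DTYPE_TO_TORCH_DTYPE = {
        "half": torch.float16,
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
        "float32": torch.float32,
    }

    config_dtype = "bfloat16"  # e.g. as read from a checkpoint's config.json
    if isinstance(config_dtype, str):
        config_dtype = _STR_DTYPE_TO_TORCH_DTYPE.get(config_dtype, None)
    if config_dtype is None:
        config_dtype = torch.float32  # same fallback the existing code applies
    print(config_dtype)  # torch.bfloat16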
sglang/srt/constrained/base_grammar_backend.py
CHANGED
@@ -14,10 +14,9 @@
 """The baseclass of a backend for grammar-guided constrained decoding."""
 
 import logging
-from
-from concurrent.futures import Future, ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
-from threading import Event
+from threading import Event
 from typing import Dict, List, Optional, Tuple
 
 import torch
@@ -27,11 +26,42 @@ from sglang.srt.server_args import ServerArgs
 logger = logging.getLogger(__name__)
 
 
-class BaseGrammarObject
+class BaseGrammarObject:
 
     def __init__(self):
         self._finished = False
 
+    def accept_token(self, token: int) -> None:
+        """
+        Accept a token in the grammar.
+        """
+        raise NotImplementedError()
+
+    def rollback(self, k: int):
+        raise NotImplementedError()
+
+    def is_terminated(self):
+        return False
+
+    def allocate_vocab_mask(
+        self, vocab_size: int, batch_size: int, device
+    ) -> torch.Tensor:
+        raise NotImplementedError()
+
+    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
+        raise NotImplementedError()
+
+    @staticmethod
+    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        raise NotImplementedError()
+
+    @staticmethod
+    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
+        raise NotImplementedError()
+
+    def copy(self) -> "BaseGrammarObject":
+        raise NotImplementedError()
+
     @property
     def finished(self):
         return self._finished
@@ -40,7 +70,6 @@ class BaseGrammarObject(ABC):
     def finished(self, finished):
         self._finished = finished
 
-    @abstractmethod
     def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
         """
         Try to jump forward in the grammar.
@@ -49,9 +78,8 @@ class BaseGrammarObject(ABC):
         A jump forward helper which may be used in `jump_forward_str_state`.
         None if the jump forward is not possible.
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
-    @abstractmethod
     def jump_forward_str_state(self, helper: Tuple[List[int], str]) -> Tuple[str, int]:
         """
         Jump forward for the grammar.
@@ -60,47 +88,15 @@ class BaseGrammarObject(ABC):
         A tuple of the jump forward string and the next state of the grammar
         (which can be used in `jump_and_retokenize` if needed).
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
-    @abstractmethod
     def jump_and_retokenize(
         self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
    ) -> None:
         """
         Jump forward occurs, and update the grammar state if needed.
         """
-        raise NotImplementedError
-
-    @abstractmethod
-    def accept_token(self, token: int) -> None:
-        """
-        Accept a token in the grammar.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def allocate_vocab_mask(
-        self, vocab_size: int, batch_size: int, device
-    ) -> torch.Tensor:
-        raise NotImplementedError
-
-    @abstractmethod
-    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
-        raise NotImplementedError
-
-    @staticmethod
-    @abstractmethod
-    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
-        raise NotImplementedError
-
-    @staticmethod
-    @abstractmethod
-    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
-        raise NotImplementedError
-
-    @abstractmethod
-    def copy(self) -> "BaseGrammarObject":
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 @dataclass
@@ -113,10 +109,9 @@ class BaseGrammarBackend:
     def __init__(self):
         self.executor = ThreadPoolExecutor()
         self.cache: Dict[Tuple[str, str], CacheEntry] = {}
-        self.cache_lock = Lock()
 
     def _not_supported(self, key_type: str, key_string: str) -> None:
-        logger.warning(f"Skip unsupported {key_type}
+        logger.warning(f"Skip unsupported {key_type=}, {key_string=}")
 
     def dispatch_fallback(
         self, key_type: str, key_string: str
@@ -148,40 +143,25 @@ class BaseGrammarBackend:
             return self.dispatch_ebnf(key_string)
         elif key_type == "structural_tag":
             return self.dispatch_structural_tag(key_string)
+        elif key_type == "structural_pattern":
+            return self.dispatch_structural_pattern(key_string)
         else:
             return self.dispatch_fallback(key_type, key_string)
 
-    def
-
-
-
-
-
-
-
-        self.cache[key] = entry
-
-        if cache_hit:
-            entry.event.wait()
-        else:
-            entry.value = self._init_value_dispatch(key)
-            entry.event.set()
-        return entry.value.copy() if entry.value else None
-
-    def get_cached_value(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
-        with self.cache_lock:
-            entry = self.cache.get(key)
-            if not entry or not entry.event.is_set():
-                return None
-            val = self.cache[key].value
-        return val.copy() if val else None
+    def get_cached_or_future_value(
+        self, key: Tuple[str, str]
+    ) -> Optional[BaseGrammarObject]:
+        value = self.cache.get(key)
+        if value:
+            return value.copy(), True
+        value = self.executor.submit(self._init_value_dispatch, key)
+        return value, False
 
-    def
-
+    def set_cache(self, key: Tuple[str, str], value: BaseGrammarObject):
+        self.cache[key] = value
 
     def reset(self):
-
-        self.cache.clear()
+        self.cache.clear()
 
 
 def create_grammar_backend(
@@ -211,9 +191,12 @@ def create_grammar_backend(
        raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")
 
     if server_args.reasoning_parser and hasattr(tokenizer, "think_end_id"):
-        from .reasoner_grammar_backend import
+        from sglang.srt.constrained.reasoner_grammar_backend import (
+            ReasonerGrammarBackend,
+        )
 
         grammar_backend = ReasonerGrammarBackend(
             grammar_backend, tokenizer.think_end_id
         )
+
    return grammar_backend
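The cache rework above drops the Lock/Event bookkeeping: get_cached_or_future_value now returns either a copy of a cached grammar with a True hit flag, or a concurrent.futures.Future scheduled on the backend's executor with a False flag, and set_cache publishes a finished grammar. A sketch of a caller-side flow; the scheduler code that consumes this API is not part of this diff, so the caller shown here is an assumption of intended use:

    from typing import Tuple

    from sglang.srt.constrained.base_grammar_backend import BaseGrammarBackend

    def obtain_grammar(backend: BaseGrammarBackend, key: Tuple[str, str]):
        value, cache_hit = backend.get_cached_or_future_value(key)
        if cache_hit:
            return value                     # already a private copy of the cached object
        grammar = value.result()             # block on the Future from the executor
        if grammar is not None:
            backend.set_cache(key, grammar)  # publish for later requests
            grammar = grammar.copy()         # hand out a copy, keep the cached original
        return grammar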