sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. sglang/bench_offline_throughput.py +10 -8
  2. sglang/bench_one_batch.py +7 -6
  3. sglang/bench_one_batch_server.py +157 -21
  4. sglang/bench_serving.py +137 -59
  5. sglang/compile_deep_gemm.py +5 -5
  6. sglang/eval/loogle_eval.py +157 -0
  7. sglang/lang/chat_template.py +78 -78
  8. sglang/lang/tracer.py +1 -1
  9. sglang/srt/code_completion_parser.py +1 -1
  10. sglang/srt/configs/deepseekvl2.py +2 -2
  11. sglang/srt/configs/model_config.py +40 -28
  12. sglang/srt/constrained/base_grammar_backend.py +55 -72
  13. sglang/srt/constrained/llguidance_backend.py +25 -21
  14. sglang/srt/constrained/outlines_backend.py +27 -26
  15. sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
  16. sglang/srt/constrained/xgrammar_backend.py +69 -43
  17. sglang/srt/conversation.py +49 -44
  18. sglang/srt/disaggregation/base/conn.py +1 -0
  19. sglang/srt/disaggregation/decode.py +129 -135
  20. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  21. sglang/srt/disaggregation/fake/conn.py +3 -13
  22. sglang/srt/disaggregation/kv_events.py +357 -0
  23. sglang/srt/disaggregation/mini_lb.py +57 -24
  24. sglang/srt/disaggregation/mooncake/conn.py +238 -122
  25. sglang/srt/disaggregation/mooncake/transfer_engine.py +2 -1
  26. sglang/srt/disaggregation/nixl/conn.py +10 -19
  27. sglang/srt/disaggregation/prefill.py +132 -47
  28. sglang/srt/disaggregation/utils.py +123 -6
  29. sglang/srt/distributed/utils.py +3 -3
  30. sglang/srt/entrypoints/EngineBase.py +5 -0
  31. sglang/srt/entrypoints/engine.py +44 -9
  32. sglang/srt/entrypoints/http_server.py +23 -6
  33. sglang/srt/entrypoints/http_server_engine.py +5 -2
  34. sglang/srt/function_call/base_format_detector.py +250 -0
  35. sglang/srt/function_call/core_types.py +34 -0
  36. sglang/srt/function_call/deepseekv3_detector.py +157 -0
  37. sglang/srt/function_call/ebnf_composer.py +234 -0
  38. sglang/srt/function_call/function_call_parser.py +175 -0
  39. sglang/srt/function_call/llama32_detector.py +74 -0
  40. sglang/srt/function_call/mistral_detector.py +84 -0
  41. sglang/srt/function_call/pythonic_detector.py +163 -0
  42. sglang/srt/function_call/qwen25_detector.py +67 -0
  43. sglang/srt/function_call/utils.py +35 -0
  44. sglang/srt/hf_transformers_utils.py +46 -7
  45. sglang/srt/layers/attention/aiter_backend.py +513 -0
  46. sglang/srt/layers/attention/flashattention_backend.py +64 -18
  47. sglang/srt/layers/attention/flashinfer_mla_backend.py +8 -4
  48. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  49. sglang/srt/layers/attention/triton_backend.py +3 -0
  50. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
  51. sglang/srt/layers/attention/utils.py +6 -4
  52. sglang/srt/layers/attention/vision.py +1 -1
  53. sglang/srt/layers/communicator.py +451 -0
  54. sglang/srt/layers/dp_attention.py +61 -21
  55. sglang/srt/layers/layernorm.py +1 -1
  56. sglang/srt/layers/logits_processor.py +46 -11
  57. sglang/srt/layers/moe/cutlass_moe.py +207 -0
  58. sglang/srt/layers/moe/ep_moe/kernels.py +34 -12
  59. sglang/srt/layers/moe/ep_moe/layer.py +105 -51
  60. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +82 -7
  61. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -0
  63. sglang/srt/layers/moe/topk.py +67 -10
  64. sglang/srt/layers/multimodal.py +70 -0
  65. sglang/srt/layers/quantization/__init__.py +8 -3
  66. sglang/srt/layers/quantization/blockwise_int8.py +2 -2
  67. sglang/srt/layers/quantization/deep_gemm.py +77 -74
  68. sglang/srt/layers/quantization/fp8.py +92 -2
  69. sglang/srt/layers/quantization/fp8_kernel.py +3 -3
  70. sglang/srt/layers/quantization/fp8_utils.py +6 -0
  71. sglang/srt/layers/quantization/gptq.py +298 -6
  72. sglang/srt/layers/quantization/int8_kernel.py +20 -7
  73. sglang/srt/layers/quantization/qoq.py +244 -0
  74. sglang/srt/layers/sampler.py +0 -4
  75. sglang/srt/layers/vocab_parallel_embedding.py +18 -7
  76. sglang/srt/lora/lora_manager.py +2 -4
  77. sglang/srt/lora/mem_pool.py +4 -4
  78. sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
  79. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  80. sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
  81. sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
  82. sglang/srt/lora/utils.py +1 -1
  83. sglang/srt/managers/data_parallel_controller.py +3 -3
  84. sglang/srt/managers/deepseek_eplb.py +278 -0
  85. sglang/srt/managers/detokenizer_manager.py +21 -8
  86. sglang/srt/managers/eplb_manager.py +55 -0
  87. sglang/srt/managers/expert_distribution.py +704 -56
  88. sglang/srt/managers/expert_location.py +394 -0
  89. sglang/srt/managers/expert_location_dispatch.py +91 -0
  90. sglang/srt/managers/io_struct.py +19 -4
  91. sglang/srt/managers/mm_utils.py +294 -140
  92. sglang/srt/managers/multimodal_processors/base_processor.py +127 -42
  93. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  94. sglang/srt/managers/multimodal_processors/gemma3.py +31 -6
  95. sglang/srt/managers/multimodal_processors/internvl.py +14 -5
  96. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  97. sglang/srt/managers/multimodal_processors/kimi_vl.py +7 -6
  98. sglang/srt/managers/multimodal_processors/llava.py +46 -0
  99. sglang/srt/managers/multimodal_processors/minicpm.py +25 -31
  100. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  101. sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
  102. sglang/srt/managers/multimodal_processors/qwen_vl.py +58 -16
  103. sglang/srt/managers/schedule_batch.py +122 -42
  104. sglang/srt/managers/schedule_policy.py +1 -5
  105. sglang/srt/managers/scheduler.py +205 -138
  106. sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
  107. sglang/srt/managers/session_controller.py +1 -1
  108. sglang/srt/managers/tokenizer_manager.py +232 -58
  109. sglang/srt/managers/tp_worker.py +12 -9
  110. sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
  111. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  112. sglang/srt/mem_cache/chunk_cache.py +3 -1
  113. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  114. sglang/srt/mem_cache/memory_pool.py +76 -52
  115. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  116. sglang/srt/mem_cache/radix_cache.py +58 -5
  117. sglang/srt/metrics/collector.py +314 -39
  118. sglang/srt/mm_utils.py +10 -0
  119. sglang/srt/model_executor/cuda_graph_runner.py +29 -19
  120. sglang/srt/model_executor/expert_location_updater.py +422 -0
  121. sglang/srt/model_executor/forward_batch_info.py +5 -1
  122. sglang/srt/model_executor/model_runner.py +163 -68
  123. sglang/srt/model_loader/loader.py +10 -6
  124. sglang/srt/models/clip.py +5 -1
  125. sglang/srt/models/deepseek_janus_pro.py +2 -2
  126. sglang/srt/models/deepseek_v2.py +308 -351
  127. sglang/srt/models/exaone.py +8 -3
  128. sglang/srt/models/gemma3_mm.py +70 -33
  129. sglang/srt/models/llama.py +2 -0
  130. sglang/srt/models/llama4.py +15 -8
  131. sglang/srt/models/llava.py +258 -7
  132. sglang/srt/models/mimo_mtp.py +220 -0
  133. sglang/srt/models/minicpmo.py +5 -12
  134. sglang/srt/models/mistral.py +71 -1
  135. sglang/srt/models/mixtral.py +98 -34
  136. sglang/srt/models/mllama.py +3 -3
  137. sglang/srt/models/pixtral.py +467 -0
  138. sglang/srt/models/qwen2.py +95 -26
  139. sglang/srt/models/qwen2_5_vl.py +8 -0
  140. sglang/srt/models/qwen2_moe.py +330 -60
  141. sglang/srt/models/qwen2_vl.py +6 -0
  142. sglang/srt/models/qwen3.py +52 -10
  143. sglang/srt/models/qwen3_moe.py +411 -48
  144. sglang/srt/models/roberta.py +1 -1
  145. sglang/srt/models/siglip.py +294 -0
  146. sglang/srt/models/torch_native_llama.py +1 -1
  147. sglang/srt/openai_api/adapter.py +58 -20
  148. sglang/srt/openai_api/protocol.py +6 -8
  149. sglang/srt/operations.py +154 -0
  150. sglang/srt/operations_strategy.py +31 -0
  151. sglang/srt/reasoning_parser.py +3 -3
  152. sglang/srt/sampling/custom_logit_processor.py +18 -3
  153. sglang/srt/sampling/sampling_batch_info.py +4 -56
  154. sglang/srt/sampling/sampling_params.py +2 -2
  155. sglang/srt/server_args.py +162 -22
  156. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
  157. sglang/srt/speculative/eagle_utils.py +138 -7
  158. sglang/srt/speculative/eagle_worker.py +69 -21
  159. sglang/srt/utils.py +74 -17
  160. sglang/test/few_shot_gsm8k.py +2 -2
  161. sglang/test/few_shot_gsm8k_engine.py +2 -2
  162. sglang/test/run_eval.py +2 -2
  163. sglang/test/runners.py +8 -1
  164. sglang/test/send_one.py +13 -3
  165. sglang/test/simple_eval_common.py +1 -1
  166. sglang/test/simple_eval_humaneval.py +1 -1
  167. sglang/test/test_cutlass_moe.py +278 -0
  168. sglang/test/test_programs.py +5 -5
  169. sglang/test/test_utils.py +55 -14
  170. sglang/utils.py +3 -3
  171. sglang/version.py +1 -1
  172. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/METADATA +23 -13
  173. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/RECORD +178 -149
  174. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/WHEEL +1 -1
  175. sglang/srt/function_call_parser.py +0 -858
  176. sglang/srt/platforms/interface.py +0 -371
  177. /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
  178. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  179. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/licenses/LICENSE +0 -0
  180. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/top_level.txt +0 -0
sglang/lang/chat_template.py CHANGED
@@ -1,3 +1,4 @@
+import re
 from dataclasses import dataclass
 from enum import Enum, auto
 from typing import Callable, Dict, List, Tuple
@@ -71,9 +72,9 @@ def get_chat_template(name):
 
 def get_chat_template_by_model_path(model_path):
     for matching_func in matching_function_registry:
-        template = matching_func(model_path)
-        if template is not None:
-            return template
+        template_name = matching_func(model_path)
+        if template_name is not None:
+            return get_chat_template(template_name)
     return get_chat_template("default")
 
 
@@ -193,6 +194,21 @@ register_chat_template(
     )
 )
 
+# Reference: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/blob/main/chat_template.json
+register_chat_template(
+    ChatTemplate(
+        name="mistral",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("[SYSTEM_PROMPT] ", " [/SYSTEM_PROMPT]"),
+            "user": ("[INST] ", " [/INST]"),
+            "assistant": ("", " </s><s>"),
+        },
+        stop_str=("</s>",),
+        image_token="[IMG]",
+    )
+)
+
 register_chat_template(
     ChatTemplate(
         name="llama-3-instruct",
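Aside (not part of the diff): a minimal sketch of how the role prefix/suffix pairs in the new "mistral" template compose into a prompt. The render helper is hypothetical; only the wrapper strings are taken from the registration above.

# Hypothetical helper illustrating role_prefix_and_suffix composition.
role_wrappers = {
    "system": ("[SYSTEM_PROMPT] ", " [/SYSTEM_PROMPT]"),
    "user": ("[INST] ", " [/INST]"),
    "assistant": ("", " </s><s>"),
}

def render(messages):
    # messages: list of (role, content) pairs
    parts = []
    for role, content in messages:
        prefix, suffix = role_wrappers[role]
        parts.append(prefix + content + suffix)
    return "".join(parts)

print(render([("system", "You are helpful."), ("user", "Hi")]))
# [SYSTEM_PROMPT] You are helpful. [/SYSTEM_PROMPT][INST] Hi [/INST]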
@@ -479,134 +495,118 @@ register_chat_template(
 
 @register_chat_template_matching_function
 def match_deepseek(model_path: str):
-    if (
-        "deepseek-v3" in model_path.lower() or "deepseek-r1" in model_path.lower()
-    ) and "base" not in model_path.lower():
-        return get_chat_template("deepseek-v3")
+    if re.search(r"deepseek-(v3|r1)", model_path, re.IGNORECASE) and not re.search(
+        r"base", model_path, re.IGNORECASE
+    ):
+        return "deepseek-v3"
 
 
 @register_chat_template_matching_function
 def match_deepseek_janus_pro(model_path: str):
-    if "janus" in model_path.lower():
-        return get_chat_template("janus-pro")
+    if re.search(r"janus", model_path, re.IGNORECASE):
+        return "janus-pro"
 
 
 @register_chat_template_matching_function
 def match_dbrx(model_path: str):
-    if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
-        return get_chat_template("dbrx-instruct")
+    if re.search(r"dbrx", model_path, re.IGNORECASE) and re.search(
+        r"instruct", model_path, re.IGNORECASE
+    ):
+        return "dbrx-instruct"
 
 
 @register_chat_template_matching_function
 def match_vicuna(model_path: str):
-    if "vicuna" in model_path.lower():
-        return get_chat_template("vicuna_v1.1")
-    if "llava-v1.5" in model_path.lower():
-        return get_chat_template("vicuna_v1.1")
-    if "llava-next-video-7b" in model_path.lower():
-        return get_chat_template("vicuna_v1.1")
+    if re.search(r"vicuna|llava-v1\.5|llava-next-video-7b", model_path, re.IGNORECASE):
+        return "vicuna_v1.1"
 
 
 @register_chat_template_matching_function
 def match_llama2_chat(model_path: str):
-    model_path = model_path.lower()
-    if "llama-2" in model_path and "chat" in model_path:
-        return get_chat_template("llama-2-chat")
-    if (
-        "mistral" in model_path or "mixtral" in model_path
-    ) and "instruct" in model_path:
-        return get_chat_template("llama-2-chat")
-    if "codellama" in model_path and "instruct" in model_path:
-        return get_chat_template("llama-2-chat")
+    if re.search(
+        r"llama-2.*chat|codellama.*instruct",
+        model_path,
+        re.IGNORECASE,
+    ):
+        return "llama-2-chat"
+
+
+@register_chat_template_matching_function
+def match_mistral(model_path: str):
+    if re.search(r"pixtral|(mistral|mixtral).*instruct", model_path, re.IGNORECASE):
+        return "mistral"
 
 
 @register_chat_template_matching_function
 def match_llama3_instruct(model_path: str):
-    model_path = model_path.lower()
-    if "llama-3" in model_path and "instruct" in model_path:
-        return get_chat_template("llama-3-instruct")
+    if re.search(r"llama-3.*instruct", model_path, re.IGNORECASE):
        return "llama-3-instruct"
 
 
 @register_chat_template_matching_function
 def match_chat_ml(model_path: str):
-    # import pdb;pdb.set_trace()
-    model_path = model_path.lower()
-    if "tinyllama" in model_path:
-        return get_chat_template("chatml")
-    # Now the suffix for qwen2 chat model is "instruct"
-    if "qwen" in model_path and "vl" in model_path:
-        return get_chat_template("qwen2-vl")
-    if "qwen" in model_path:
-        if "vl" in model_path:
-            return get_chat_template("qwen2-vl")
-        if ("chat" in model_path or "instruct" in model_path) and (
-            "llava" not in model_path
-        ):
-            return get_chat_template("qwen")
-    if (
-        "llava-v1.6-34b" in model_path
-        or "llava-v1.6-yi-34b" in model_path
-        or "llava-next-video-34b" in model_path
-        or "llava-onevision-qwen2" in model_path
+    if re.search(r"tinyllama", model_path, re.IGNORECASE):
+        return "chatml"
+    if re.search(r"qwen.*vl", model_path, re.IGNORECASE):
+        return "qwen2-vl"
+    if re.search(r"qwen.*(chat|instruct)", model_path, re.IGNORECASE) and not re.search(
+        r"llava", model_path, re.IGNORECASE
     ):
-        return get_chat_template("chatml-llava")
+        return "qwen"
+    if re.search(
+        r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2",
+        model_path,
+        re.IGNORECASE,
+    ):
+        return "chatml-llava"
 
 
 @register_chat_template_matching_function
 def match_chat_yi(model_path: str):
-    model_path = model_path.lower()
-    if "yi-vl" in model_path and "llava" not in model_path:
-        return get_chat_template("yi-vl")
-    elif "yi-1.5" in model_path and "chat" in model_path:
-        return get_chat_template("yi-1.5")
+    if re.search(r"yi-vl", model_path, re.IGNORECASE) and not re.search(
+        r"llava", model_path, re.IGNORECASE
+    ):
+        return "yi-vl"
+    elif re.search(r"yi-1\.5.*chat", model_path, re.IGNORECASE):
+        return "yi-1.5"
 
 
 @register_chat_template_matching_function
 def match_gemma_it(model_path: str):
-    model_path = model_path.lower()
-    if "gemma" in model_path and "it" in model_path:
-        return get_chat_template("gemma-it")
+    if re.search(r"gemma.*it", model_path, re.IGNORECASE):
+        return "gemma-it"
 
 
 @register_chat_template_matching_function
 def match_openbmb_minicpm(model_path: str):
-    model_path = model_path.lower()
-    if "minicpm-v" in model_path:
-        return get_chat_template("minicpmv")
-    elif "minicpm-o" in model_path:
-        return get_chat_template("minicpmo")
+    if re.search(r"minicpm-v", model_path, re.IGNORECASE):
+        return "minicpmv"
+    elif re.search(r"minicpm-o", model_path, re.IGNORECASE):
+        return "minicpmo"
 
 
 @register_chat_template_matching_function
 def match_c4ai_command_r(model_path: str):
-    model_path = model_path.lower()
-    if "c4ai-command-r" in model_path:
-        return get_chat_template("c4ai-command-r")
+    if re.search(r"c4ai-command-r", model_path, re.IGNORECASE):
+        return "c4ai-command-r"
 
 
 @register_chat_template_matching_function
 def match_granite_instruct(model_path: str):
-    model_path = model_path.lower()
-    # When future versions of Granite are released, this code may
-    # need to be updated. For now, assume that the Granite 3.0
-    # template works across the board.
-    if "granite" in model_path and "instruct" in model_path:
-        return get_chat_template("granite-3-instruct")
+    if re.search(r"granite.*instruct", model_path, re.IGNORECASE):
+        return "granite-3-instruct"
 
 
 @register_chat_template_matching_function
 def match_gemma3_instruct(model_path: str):
-    model_path = model_path.lower()
-    if "gemma-3" in model_path and "1b" not in model_path:
-        # gemma-3-1b-it is completion model
-        return get_chat_template("gemma-it")
+    if re.search(r"gemma-3", model_path, re.IGNORECASE):
+        return "gemma-it"
 
 
 @register_chat_template_matching_function
 def match_internvl_chat(model_path: str):
-    model_path = model_path.lower()
-    if "internvl" in model_path:
-        return get_chat_template("internvl-2-5")
+    if re.search(r"internvl2_5", model_path, re.IGNORECASE):
+        return "internvl-2-5"
 
 
 if __name__ == "__main__":
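The net effect of this rewrite: matching functions now return a template *name* instead of a ChatTemplate object, and get_chat_template_by_model_path performs the single name-to-template lookup. A self-contained toy version of that flow (the registry contents and object() templates below are stand-ins, not sglang's real data):

import re

chat_templates = {"deepseek-v3": object(), "default": object()}  # stand-in registry

def get_chat_template(name):
    return chat_templates[name]

def match_deepseek(model_path: str):
    # Same matcher as in the diff: returns a name, or None implicitly.
    if re.search(r"deepseek-(v3|r1)", model_path, re.IGNORECASE) and not re.search(
        r"base", model_path, re.IGNORECASE
    ):
        return "deepseek-v3"

matching_function_registry = [match_deepseek]

def get_chat_template_by_model_path(model_path):
    for matching_func in matching_function_registry:
        template_name = matching_func(model_path)
        if template_name is not None:
            return get_chat_template(template_name)
    return get_chat_template("default")

assert get_chat_template_by_model_path("DeepSeek-V3") is chat_templates["deepseek-v3"]
assert get_chat_template_by_model_path("deepseek-v3-base") is chat_templates["default"]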
sglang/lang/tracer.py CHANGED
@@ -38,7 +38,7 @@ def extract_prefix_by_tracing(program, backend):
         with TracingScope(tracer):
             tracer.ret_value = program.func(tracer, **arguments)
     except (StopTracing, TypeError, AttributeError):
-        # Some exceptions may not be catched
+        # Some exceptions may not be caught
         pass
 
     # Run and cache prefix
sglang/srt/code_completion_parser.py CHANGED
@@ -27,7 +27,7 @@ completion_template_name = None
 
 
 class FimPosition:
-    """Postion of fim middle token."""
+    """Position of fim middle token."""
 
     MIDDLE = auto()
     END = auto()
sglang/srt/configs/deepseekvl2.py CHANGED
@@ -416,9 +416,9 @@ class DeepseekVLV2Processor(ProcessorMixin):
         h = w = math.ceil(
             (self.image_size // self.patch_size) / self.downsample_ratio
         )
-        # global views tokens h * (w + 1), 1 is for line seperator
+        # global views tokens h * (w + 1), 1 is for line separator
         tokenized_image = [self.image_token_id] * h * (w + 1)
-        # add a seperator between global and local views
+        # add a separator between global and local views
         tokenized_image += [self.image_token_id]
         # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
         tokenized_image += (
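Aside (not part of the diff): a quick worked example of the token layout these comments describe. The config values here are illustrative assumptions, not the model's actual settings.

import math

image_size, patch_size, downsample_ratio = 384, 16, 2  # illustrative values
h = w = math.ceil((image_size // patch_size) / downsample_ratio)  # 12
global_view_tokens = h * (w + 1)  # 12 * 13 = 156; the +1 column is the line separator
separator_tokens = 1              # one token between global and local views
print(h, global_view_tokens, separator_tokens)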
sglang/srt/configs/model_config.py CHANGED
@@ -22,7 +22,11 @@ from typing import List, Optional, Set, Union
 import torch
 from transformers import PretrainedConfig
 
-from sglang.srt.hf_transformers_utils import get_config, get_context_length
+from sglang.srt.hf_transformers_utils import (
+    get_config,
+    get_context_length,
+    get_hf_text_config,
+)
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import get_bool_env_var, is_hip
@@ -69,6 +73,7 @@ class ModelConfig:
             model_override_args=self.model_override_args,
             **kwargs,
         )
+
         self.hf_text_config = get_hf_text_config(self.hf_config)
         self.attention_chunk_size = getattr(
             self.hf_text_config, "attention_chunk_size", None
@@ -93,6 +98,8 @@ class ModelConfig:
         ):
             self.hf_config.architectures[0] = "DeepseekV3ForCausalLMNextN"
 
+        if is_draft_model and self.hf_config.architectures[0] == "MiMoForCausalLM":
+            self.hf_config.architectures[0] = "MiMoMTP"
         # Check model type
         self.is_generation = is_generation_model(
             self.hf_config.architectures, is_embedding
@@ -109,6 +116,10 @@ class ModelConfig:
         self.is_audio_model = enable_multimodal and is_audio_model(
             self.hf_config.architectures
         )
+        self.is_multimodal_chunked_prefill_supported = (
+            enable_multimodal
+            and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
+        )
         self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
 
@@ -209,7 +220,13 @@ class ModelConfig:
 
         # Cache attributes
         self.hf_eos_token_id = self.get_hf_eos_token_id()
-        self.image_token_id = getattr(self.hf_config, "image_token_id", None)
+
+        config = self.hf_config
+
+        # multimodal
+        self.image_token_id = getattr(config, "image_token_id", None) or getattr(
+            config, "image_token_index", None
+        )
 
     @staticmethod
     def from_server_args(server_args: ServerArgs, model_path: str = None, **kwargs):
@@ -332,6 +349,7 @@ class ModelConfig:
             "w8a8_int8",
             "w8a8_fp8",
             "moe_wna16",
+            "qoq",
         ]
         compatible_quantization_methods = {
             "modelopt_fp4": ["modelopt"],
@@ -423,31 +441,6 @@ class ModelConfig:
             self.model_path = client.get_local_dir()
 
 
-def get_hf_text_config(config: PretrainedConfig):
-    """Get the "sub" config relevant to llm for multi modal models.
-    No op for pure text models.
-    """
-    class_name = config.architectures[0]
-    if class_name.startswith("Llava") and class_name.endswith("ForCausalLM"):
-        # We support non-hf version of llava models, so we do not want to
-        # read the wrong values from the unused default text_config.
-        # NOTE(HandH1998): We set `torch_dtype` of config to `torch.float16` for the weights, as
-        # `torch.float16` is default used for image features in `python/sglang/srt/models/llava.py`.
-        setattr(config, "torch_dtype", torch.float16)
-        return config
-
-    if hasattr(config, "text_config"):
-        # The code operates under the assumption that text_config should have
-        # `num_attention_heads` (among others). Assert here to fail early
-        # if transformers config doesn't align with this assumption.
-        assert hasattr(config.text_config, "num_attention_heads")
-        return config.text_config
-    if hasattr(config, "language_config"):
-        return config.language_config
-    else:
-        return config
-
-
 # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
 _STR_DTYPE_TO_TORCH_DTYPE = {
     "half": torch.float16,
@@ -466,6 +459,8 @@ def _get_and_verify_dtype(
     # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
     # because config.torch_dtype can be None.
     config_dtype = getattr(config, "torch_dtype", None)
+    if isinstance(config_dtype, str):
+        config_dtype = _STR_DTYPE_TO_TORCH_DTYPE.get(config_dtype, None)
     if config_dtype is None:
         config_dtype = torch.float32
 
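Context for the two added lines: HF configs deserialized from config.json can carry torch_dtype as a plain string (e.g. "bfloat16") rather than a torch.dtype, and a string slips past the None check. A standalone sketch of the coercion; the table here is a subset of the module's _STR_DTYPE_TO_TORCH_DTYPE:

import torch

_STR_DTYPE_TO_TORCH_DTYPE = {
    "half": torch.float16,
    "float16": torch.float16,
    "bfloat16": torch.bfloat16,
    "float32": torch.float32,
}

config_dtype = "bfloat16"  # e.g. as deserialized from config.json
if isinstance(config_dtype, str):
    config_dtype = _STR_DTYPE_TO_TORCH_DTYPE.get(config_dtype, None)
if config_dtype is None:
    config_dtype = torch.float32  # existing fallback path
assert config_dtype is torch.bfloat16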
@@ -537,6 +532,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = False):
 
 
 multimodal_model_archs = [
+    "CLIPModel",
     "DeepseekVL2ForCausalLM",
     "Gemma3ForConditionalGeneration",
     "Grok1VForCausalLM",
@@ -545,14 +541,15 @@ multimodal_model_archs = [
     "Llama4ForConditionalGeneration",
     "LlavaMistralForCausalLM",
     "LlavaQwenForCausalLM",
+    "LlavaForConditionalGeneration",
     "LlavaVidForCausalLM",
     "MiniCPMO",
     "MiniCPMV",
+    "Mistral3ForConditionalGeneration",
     "MultiModalityCausalLM",
     "MllamaForConditionalGeneration",
     "Qwen2VLForConditionalGeneration",
     "Qwen2_5_VLForConditionalGeneration",
-    "CLIPModel",
     "KimiVLForConditionalGeneration",
     "InternVLChatModel",
 ]
@@ -584,6 +581,21 @@ def is_encoder_decoder_model(model_architectures: List[str]):
     return "MllamaForConditionalGeneration" in model_architectures
 
 
+def is_multimodal_chunked_prefill_supported(model_architectures: List[str]):
+    """Check if chunked prefill is supported for a MultiModal model."""
+    unsupported = [
+        "Grok1VForCausalLM",
+        "Grok1AForCausalLM",
+        "LlavaLlamaForCausalLM",
+        "MllamaForConditionalGeneration",
+        "CLIPModel",
+    ]
+    if any(multi_model_arch in unsupported for multi_model_arch in model_architectures):
+        return False
+    else:
+        return True
+
+
 def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
     if scale <= 1:
         return 1.0
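A small sketch of the image_token_id fallback added above, with SimpleNamespace standing in for real HF configs: some configs expose the token as image_token_id, others as image_token_index. Note that the getattr-or chain treats a falsy id (0) the same as a missing one.

from types import SimpleNamespace

def resolve_image_token_id(config):
    # Mirrors the new fallback in ModelConfig: prefer image_token_id,
    # fall back to image_token_index.
    return getattr(config, "image_token_id", None) or getattr(
        config, "image_token_index", None
    )

assert resolve_image_token_id(SimpleNamespace(image_token_id=32000)) == 32000
assert resolve_image_token_id(SimpleNamespace(image_token_index=151655)) == 151655
assert resolve_image_token_id(SimpleNamespace()) is None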
sglang/srt/constrained/base_grammar_backend.py CHANGED
@@ -14,10 +14,9 @@
 """The baseclass of a backend for grammar-guided constrained decoding."""
 
 import logging
-from abc import ABC, abstractmethod
-from concurrent.futures import Future, ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
-from threading import Event, Lock
+from threading import Event
 from typing import Dict, List, Optional, Tuple
 
 import torch
@@ -27,11 +26,42 @@ from sglang.srt.server_args import ServerArgs
 logger = logging.getLogger(__name__)
 
 
-class BaseGrammarObject(ABC):
+class BaseGrammarObject:
 
     def __init__(self):
         self._finished = False
 
+    def accept_token(self, token: int) -> None:
+        """
+        Accept a token in the grammar.
+        """
+        raise NotImplementedError()
+
+    def rollback(self, k: int):
+        raise NotImplementedError()
+
+    def is_terminated(self):
+        return False
+
+    def allocate_vocab_mask(
+        self, vocab_size: int, batch_size: int, device
+    ) -> torch.Tensor:
+        raise NotImplementedError()
+
+    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
+        raise NotImplementedError()
+
+    @staticmethod
+    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        raise NotImplementedError()
+
+    @staticmethod
+    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
+        raise NotImplementedError()
+
+    def copy(self) -> "BaseGrammarObject":
+        raise NotImplementedError()
+
     @property
     def finished(self):
         return self._finished
@@ -40,7 +70,6 @@ class BaseGrammarObject(ABC):
     def finished(self, finished):
         self._finished = finished
 
-    @abstractmethod
     def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
         """
         Try to jump forward in the grammar.
@@ -49,9 +78,8 @@ class BaseGrammarObject(ABC):
             A jump forward helper which may be used in `jump_forward_str_state`.
             None if the jump forward is not possible.
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
-    @abstractmethod
     def jump_forward_str_state(self, helper: Tuple[List[int], str]) -> Tuple[str, int]:
         """
         Jump forward for the grammar.
@@ -60,47 +88,15 @@ class BaseGrammarObject(ABC):
             A tuple of the jump forward string and the next state of the grammar
             (which can be used in `jump_and_retokenize` if needed).
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
-    @abstractmethod
     def jump_and_retokenize(
         self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
     ) -> None:
         """
         Jump forward occurs, and update the grammar state if needed.
         """
-        raise NotImplementedError
-
-    @abstractmethod
-    def accept_token(self, token: int) -> None:
-        """
-        Accept a token in the grammar.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def allocate_vocab_mask(
-        self, vocab_size: int, batch_size: int, device
-    ) -> torch.Tensor:
-        raise NotImplementedError
-
-    @abstractmethod
-    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
-        raise NotImplementedError
-
-    @staticmethod
-    @abstractmethod
-    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
-        raise NotImplementedError
-
-    @staticmethod
-    @abstractmethod
-    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
-        raise NotImplementedError
-
-    @abstractmethod
-    def copy(self) -> "BaseGrammarObject":
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 @dataclass
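With ABC and @abstractmethod removed, BaseGrammarObject can be subclassed piecemeal: overriding only the supported operations, with unsupported ones failing lazily at call time instead of at instantiation. A toy subclass under that assumption (CountingGrammar is not a real sglang backend):

from sglang.srt.constrained.base_grammar_backend import BaseGrammarObject

class CountingGrammar(BaseGrammarObject):
    """Toy subclass: tracks accepted tokens and supports rollback."""

    def __init__(self):
        super().__init__()
        self.tokens = []

    def accept_token(self, token: int) -> None:
        self.tokens.append(token)

    def rollback(self, k: int):
        if k:
            del self.tokens[-k:]

g = CountingGrammar()  # would raise TypeError if the base were still an ABC
g.accept_token(42)
g.rollback(1)
try:
    g.try_jump_forward(tokenizer=None)  # inherited stub
except NotImplementedError:
    pass  # unsupported operations now fail at call time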
@@ -113,10 +109,9 @@ class BaseGrammarBackend:
     def __init__(self):
         self.executor = ThreadPoolExecutor()
         self.cache: Dict[Tuple[str, str], CacheEntry] = {}
-        self.cache_lock = Lock()
 
     def _not_supported(self, key_type: str, key_string: str) -> None:
-        logger.warning(f"Skip unsupported {key_type}: {key_type}={key_string}")
+        logger.warning(f"Skip unsupported {key_type=}, {key_string=}")
 
     def dispatch_fallback(
         self, key_type: str, key_string: str
@@ -148,40 +143,25 @@ class BaseGrammarBackend:
             return self.dispatch_ebnf(key_string)
         elif key_type == "structural_tag":
             return self.dispatch_structural_tag(key_string)
+        elif key_type == "structural_pattern":
+            return self.dispatch_structural_pattern(key_string)
         else:
             return self.dispatch_fallback(key_type, key_string)
 
-    def _init_value(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
-        with self.cache_lock:
-            if key in self.cache:
-                cache_hit = True
-                entry = self.cache[key]
-            else:
-                cache_hit = False
-                entry = CacheEntry(None, Event())
-                self.cache[key] = entry
-
-        if cache_hit:
-            entry.event.wait()
-        else:
-            entry.value = self._init_value_dispatch(key)
-            entry.event.set()
-        return entry.value.copy() if entry.value else None
-
-    def get_cached_value(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
-        with self.cache_lock:
-            entry = self.cache.get(key)
-            if not entry or not entry.event.is_set():
-                return None
-            val = self.cache[key].value
-            return val.copy() if val else None
+    def get_cached_or_future_value(
+        self, key: Tuple[str, str]
+    ) -> Optional[BaseGrammarObject]:
+        value = self.cache.get(key)
+        if value:
+            return value.copy(), True
+        value = self.executor.submit(self._init_value_dispatch, key)
+        return value, False
 
-    def get_future_value(self, key: Tuple[str, str]) -> Future:
-        return self.executor.submit(self._init_value, key)
+    def set_cache(self, key: Tuple[str, str], value: BaseGrammarObject):
+        self.cache[key] = value
 
     def reset(self):
-        with self.cache_lock:
-            self.cache.clear()
+        self.cache.clear()
 
 
 def create_grammar_backend(
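A rough sketch of how a caller could drive the new two-step cache API: get_cached_or_future_value returns either a ready (copied) grammar with True, or a concurrent.futures.Future with False; the caller resolves the future and publishes the result via set_cache. The obtain_grammar helper is hypothetical; sglang's scheduler wires this differently across batches.

def obtain_grammar(backend, key):
    value, cache_hit = backend.get_cached_or_future_value(key)
    if cache_hit:
        return value                 # already a private copy
    grammar = value.result()         # block on the Future -> grammar or None
    backend.set_cache(key, grammar)  # publish for later requests
    return grammar.copy() if grammar else None

# grammar = obtain_grammar(some_backend, ("json", '{"type": "object"}'))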
@@ -211,9 +191,12 @@ def create_grammar_backend(
         raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")
 
     if server_args.reasoning_parser and hasattr(tokenizer, "think_end_id"):
-        from .reasoner_grammar_backend import ReasonerGrammarBackend
+        from sglang.srt.constrained.reasoner_grammar_backend import (
+            ReasonerGrammarBackend,
+        )
 
         grammar_backend = ReasonerGrammarBackend(
             grammar_backend, tokenizer.think_end_id
         )
+
     return grammar_backend