sglang 0.4.6.post1__py3-none-any.whl → 0.4.6.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. sglang/bench_one_batch.py +2 -0
  2. sglang/check_env.py +3 -3
  3. sglang/srt/configs/__init__.py +4 -0
  4. sglang/srt/configs/kimi_vl.py +38 -0
  5. sglang/srt/configs/kimi_vl_moonvit.py +32 -0
  6. sglang/srt/configs/model_config.py +15 -0
  7. sglang/srt/conversation.py +122 -1
  8. sglang/srt/entrypoints/engine.py +44 -22
  9. sglang/srt/function_call_parser.py +97 -0
  10. sglang/srt/hf_transformers_utils.py +2 -0
  11. sglang/srt/layers/attention/cutlass_mla_backend.py +1 -1
  12. sglang/srt/layers/attention/flashinfer_backend.py +107 -82
  13. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -16
  14. sglang/srt/layers/attention/flashmla_backend.py +3 -0
  15. sglang/srt/layers/dp_attention.py +5 -2
  16. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +1 -3
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  18. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  21. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +8 -6
  22. sglang/srt/layers/quantization/__init__.py +2 -2
  23. sglang/srt/layers/quantization/deep_gemm.py +1 -1
  24. sglang/srt/layers/utils.py +35 -0
  25. sglang/srt/lora/layers.py +35 -9
  26. sglang/srt/lora/lora_manager.py +84 -35
  27. sglang/srt/managers/data_parallel_controller.py +52 -34
  28. sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
  29. sglang/srt/managers/schedule_batch.py +25 -15
  30. sglang/srt/managers/scheduler.py +263 -59
  31. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -1
  32. sglang/srt/managers/tp_worker.py +51 -16
  33. sglang/srt/managers/tp_worker_overlap_thread.py +9 -3
  34. sglang/srt/mem_cache/memory_pool.py +70 -36
  35. sglang/srt/model_executor/cuda_graph_runner.py +82 -19
  36. sglang/srt/model_executor/forward_batch_info.py +31 -1
  37. sglang/srt/model_executor/model_runner.py +115 -57
  38. sglang/srt/models/deepseek_nextn.py +1 -257
  39. sglang/srt/models/deepseek_v2.py +78 -18
  40. sglang/srt/models/kimi_vl.py +308 -0
  41. sglang/srt/models/kimi_vl_moonvit.py +639 -0
  42. sglang/srt/models/llama.py +92 -30
  43. sglang/srt/models/llama4.py +2 -1
  44. sglang/srt/models/llama_eagle.py +4 -1
  45. sglang/srt/models/llama_eagle3.py +4 -1
  46. sglang/srt/models/qwen2_moe.py +8 -3
  47. sglang/srt/models/qwen2_vl.py +0 -12
  48. sglang/srt/models/qwen3_moe.py +8 -3
  49. sglang/srt/openai_api/adapter.py +34 -22
  50. sglang/srt/openai_api/protocol.py +11 -1
  51. sglang/srt/server_args.py +67 -22
  52. sglang/srt/speculative/eagle_worker.py +3 -2
  53. sglang/srt/utils.py +88 -9
  54. sglang/test/runners.py +4 -0
  55. sglang/test/test_utils.py +29 -0
  56. sglang/version.py +1 -1
  57. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/METADATA +5 -4
  58. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/RECORD +61 -51
  59. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/WHEEL +1 -1
  60. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/licenses/LICENSE +0 -0
  61. {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py CHANGED
@@ -154,6 +154,8 @@ def load_model(server_args, port_args, tp_rank):
         gpu_id=tp_rank,
         tp_rank=tp_rank,
         tp_size=server_args.tp_size,
+        pp_rank=0,
+        pp_size=1,
         nccl_port=port_args.nccl_port,
         server_args=server_args,
     )
sglang/check_env.py CHANGED
@@ -20,7 +20,7 @@ def is_cuda_v2():
 PACKAGE_LIST = [
     "sglang",
     "sgl_kernel",
-    "flashinfer",
+    "flashinfer_python",
     "triton",
     "transformers",
     "torchao",
@@ -36,8 +36,8 @@ PACKAGE_LIST = [
     "packaging",
     "psutil",
     "pydantic",
-    "multipart",
-    "zmq",
+    "python-multipart",
+    "pyzmq",
     "torchao",
     "uvicorn",
     "uvloop",
sglang/srt/configs/__init__.py CHANGED
@@ -3,6 +3,8 @@ from sglang.srt.configs.dbrx import DbrxConfig
 from sglang.srt.configs.deepseekvl2 import DeepseekVL2Config
 from sglang.srt.configs.exaone import ExaoneConfig
 from sglang.srt.configs.janus_pro import MultiModalityConfig
+from sglang.srt.configs.kimi_vl import KimiVLConfig
+from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
 
 __all__ = [
     "ExaoneConfig",
@@ -10,4 +12,6 @@ __all__ = [
     "DbrxConfig",
     "DeepseekVL2Config",
     "MultiModalityConfig",
+    "KimiVLConfig",
+    "MoonViTConfig",
 ]
sglang/srt/configs/kimi_vl.py CHANGED
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
+from typing import Optional, Union
+
+from transformers.configuration_utils import PretrainedConfig
+
+from sglang.srt.configs.deepseekvl2 import DeepseekV2Config
+from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
+
+
+class KimiVLConfig(PretrainedConfig):
+    model_type = "kimi_vl"
+
+    def __init__(
+        self,
+        vision_config: Optional[Union[dict, MoonViTConfig]] = None,
+        text_config: Optional[Union[dict, DeepseekV2Config]] = None,
+        ignore_index: int = -100,
+        media_placeholder_token_id: int = 163605,
+        pad_token_id: int = 0,
+        **kwargs
+    ):
+        if vision_config is None:
+            vision_config = MoonViTConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = MoonViTConfig(**vision_config)
+        self.vision_config = vision_config
+
+        if text_config is None:
+            text_config = DeepseekV2Config()
+        elif isinstance(text_config, dict):
+            text_config = DeepseekV2Config(**text_config)
+        self.text_config = text_config
+
+        self.ignore_index = ignore_index
+        self.media_placeholder_token_id = media_placeholder_token_id
+
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
sglang/srt/configs/kimi_vl_moonvit.py CHANGED
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
+from transformers.configuration_utils import PretrainedConfig
+
+
+class MoonViTConfig(PretrainedConfig):
+    model_type = "moonvit"
+
+    def __init__(
+        self,
+        patch_size: int = 14,
+        init_pos_emb_height: int = 64,
+        init_pos_emb_width: int = 64,
+        num_attention_heads: int = 16,
+        num_hidden_layers: int = 27,
+        hidden_size: int = 1152,
+        intermediate_size: int = 4304,
+        merge_kernel_size: tuple[int, int] = (2, 2),
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.patch_size = patch_size
+        # Positional embedding config
+        self.init_pos_emb_height = init_pos_emb_height
+        self.init_pos_emb_width = init_pos_emb_width
+        # Transformer config
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        # Patch merger config
+        self.merge_kernel_size = merge_kernel_size
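For illustration, a minimal sketch (not part of the diff) of constructing the two new config classes; the field values passed below are made up, and the imports rely on the exports added in sglang/srt/configs/__init__.py above.

    from sglang.srt.configs import KimiVLConfig, MoonViTConfig

    # Build the vision tower config explicitly; let the text config fall back to defaults.
    vision = MoonViTConfig(patch_size=14, hidden_size=1152)
    cfg = KimiVLConfig(vision_config=vision, text_config={"hidden_size": 2048})
    print(cfg.model_type, cfg.vision_config.model_type)  # kimi_vl moonvit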
sglang/srt/configs/model_config.py CHANGED
@@ -47,6 +47,7 @@ class ModelConfig:
         dtype: str = "auto",
         quantization: Optional[str] = None,
         override_config_file: Optional[str] = None,
+        is_draft_model: bool = False,
     ) -> None:
 
         self.model_path = model_path
@@ -85,6 +86,12 @@ class ModelConfig:
         else:
             enable_multimodal = True
 
+        if (
+            is_draft_model
+            and self.hf_config.architectures[0] == "DeepseekV3ForCausalLM"
+        ):
+            self.hf_config.architectures[0] = "DeepseekV3ForCausalLMNextN"
+
         # Check model type
         self.is_generation = is_generation_model(
             self.hf_config.architectures, is_embedding
@@ -169,6 +176,13 @@ class ModelConfig:
             self.attention_arch = AttentionArch.MLA
             self.kv_lora_rank = self.hf_text_config.kv_lora_rank
             self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
+        elif "KimiVLForConditionalGeneration" in self.hf_config.architectures:
+            self.head_dim = 256
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_text_config.kv_lora_rank
+            self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
+            self.v_head_dim = self.hf_text_config.v_head_dim
+            self.qk_nope_head_dim = self.hf_text_config.qk_nope_head_dim
         else:
             self.attention_arch = AttentionArch.MHA
 
@@ -523,6 +537,7 @@ multimodal_model_archs = [
     "Qwen2VLForConditionalGeneration",
     "Qwen2_5_VLForConditionalGeneration",
     "CLIPModel",
+    "KimiVLForConditionalGeneration",
 ]
 
 
sglang/srt/conversation.py CHANGED
@@ -17,7 +17,7 @@
 # https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
 import dataclasses
 from enum import IntEnum, auto
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Callable, Dict, List, Optional, Tuple, Union
 
 from sglang.srt.openai_api.protocol import ChatCompletionRequest
 
@@ -407,6 +407,7 @@ class Conversation:
 
 # A global registry for all conversation templates
 chat_templates: Dict[str, Conversation] = {}
+matching_function_registry: List[Callable] = []
 
 
 def register_conv_template(template: Conversation, override: bool = False):
@@ -419,6 +420,18 @@ def register_conv_template(template: Conversation, override: bool = False):
     chat_templates[template.name] = template
 
 
+def register_conv_template_matching_function(func):
+    matching_function_registry.append(func)
+
+
+def get_conv_template_by_model_path(model_path):
+    for matching_func in matching_function_registry:
+        conv_name = matching_func(model_path)
+        if conv_name is not None:
+            return conv_name
+    return None
+
+
 def chat_template_exists(template_name: str) -> bool:
     return template_name in chat_templates
 
@@ -792,3 +805,111 @@ register_conv_template(
         audio_token="(<audio>./</audio>)",
     )
 )
+
+# Reference: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/chat_template.jinja
+register_conv_template(
+    Conversation(
+        name="kimi-vl",
+        system_message="You are a helpful assistant",
+        system_template="<|im_system|>system<|im_middle|>{system_message}",
+        roles=(
+            "<|im_user|>user<|im_middle|>",
+            "<|im_assistant|>assistant<|im_middle|>",
+        ),
+        messages=[],
+        sep="<|im_end|>",
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        stop_str="<|im_end|>",
+        image_token="<|media_start|>image<|media_content|><|media_pad|><|media_end|>",
+    )
+)
+
+
+@register_conv_template_matching_function
+def match_deepseek_janus_pro(model_path: str):
+    if (
+        "llama" in model_path.lower()
+        and "3.2" in model_path.lower()
+        and "vision" in model_path.lower()
+    ):
+        return "llama_3_vision"
+
+
+@register_conv_template_matching_function
+def match_deepseek_janus_pro(model_path: str):
+    if "janus" in model_path.lower():
+        return "janus-pro"
+
+
+@register_conv_template_matching_function
+def match_vicuna(model_path: str):
+    if "vicuna" in model_path.lower():
+        return "vicuna_v1.1"
+    if "llava-v1.5" in model_path.lower():
+        return "vicuna_v1.1"
+    if "llava-next-video-7b" in model_path.lower():
+        return "vicuna_v1.1"
+
+
+@register_conv_template_matching_function
+def match_llama2_chat(model_path: str):
+    model_path = model_path.lower()
+    if "llama-2" in model_path and "chat" in model_path:
+        return "llama-2"
+    if (
+        "mistral" in model_path or "mixtral" in model_path
+    ) and "instruct" in model_path:
+        return "llama-2"
+    if "codellama" in model_path and "instruct" in model_path:
+        return "llama-2"
+
+
+@register_conv_template_matching_function
+def match_deepseek_vl(model_path: str):
+    model_path = model_path.lower()
+    if "deepseek" in model_path and "vl2" in model_path:
+        return "deepseek-vl2"
+
+
+@register_conv_template_matching_function
+def match_chat_ml(model_path: str):
+    # import pdb;pdb.set_trace()
+    model_path = model_path.lower()
+    # Now the suffix for qwen2 chat model is "instruct"
+    if "gme" in model_path and "qwen" in model_path and "vl" in model_path:
+        return "gme-qwen2-vl"
+    if "qwen" in model_path and "vl" in model_path:
+        return "qwen2-vl"
+    if (
+        "llava-v1.6-34b" in model_path
+        or "llava-v1.6-yi-34b" in model_path
+        or "llava-next-video-34b" in model_path
+        or "llava-onevision-qwen2" in model_path
+    ):
+        return "chatml-llava"
+
+
+@register_conv_template_matching_function
+def match_gemma_it(model_path: str):
+    model_path = model_path.lower()
+    if "gemma" in model_path and "it" in model_path:
+        return "gemma-it"
+    if "gemma-3" in model_path and "1b" not in model_path:
+        # gemma-3-1b-it is completion model
+        return "gemma-it"
+
+
+@register_conv_template_matching_function
+def match_openbmb_minicpm(model_path: str):
+    model_path = model_path.lower()
+    if "minicpm-v" in model_path:
+        return "minicpmv"
+    elif "minicpm-o" in model_path:
+        return "minicpmo"
+
+
+@register_conv_template_matching_function
+def match_moonshot_kimivl(model_path: str):
+    model_path = model_path.lower()
+    if "kimi" in model_path and "vl" in model_path:
+        return "kimi-vl"
sglang/srt/entrypoints/engine.py CHANGED
@@ -58,7 +58,10 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
-from sglang.srt.openai_api.adapter import load_chat_template_for_openai_api
+from sglang.srt.openai_api.adapter import (
+    guess_chat_template_name_from_model_path,
+    load_chat_template_for_openai_api,
+)
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
 from sglang.srt.utils import (
@@ -123,7 +126,6 @@ class Engine(EngineBase):
             server_args=server_args,
             port_args=port_args,
         )
-
         self.server_args = server_args
         self.tokenizer_manager = tokenizer_manager
         self.scheduler_info = scheduler_info
@@ -298,7 +300,6 @@ class Engine(EngineBase):
         internal_states = loop.run_until_complete(
             self.tokenizer_manager.get_internal_state()
         )
-
         return {
             **dataclasses.asdict(self.tokenizer_manager.server_args),
             **self.scheduler_info,
@@ -450,7 +451,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.3",
+            "0.2.5",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -458,7 +459,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.1.0",
+            "0.1.1",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
@@ -517,25 +518,44 @@ def _launch_subprocesses(
         )
 
         scheduler_pipe_readers = []
-        tp_size_per_node = server_args.tp_size // server_args.nnodes
+
+        nnodes_per_tp_group = max(server_args.nnodes // server_args.pp_size, 1)
+        tp_size_per_node = server_args.tp_size // nnodes_per_tp_group
         tp_rank_range = range(
-            tp_size_per_node * server_args.node_rank,
-            tp_size_per_node * (server_args.node_rank + 1),
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group),
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group + 1),
         )
-        for tp_rank in tp_rank_range:
-            reader, writer = mp.Pipe(duplex=False)
-            gpu_id = (
-                server_args.base_gpu_id
-                + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
-            )
-            proc = mp.Process(
-                target=run_scheduler_process,
-                args=(server_args, port_args, gpu_id, tp_rank, None, writer),
-            )
-            with memory_saver_adapter.configure_subprocess():
-                proc.start()
-            scheduler_procs.append(proc)
-            scheduler_pipe_readers.append(reader)
+
+        pp_size_per_node = max(server_args.pp_size // server_args.nnodes, 1)
+        pp_rank_range = range(
+            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group),
+            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group + 1),
+        )
+
+        for pp_rank in pp_rank_range:
+            for tp_rank in tp_rank_range:
+                reader, writer = mp.Pipe(duplex=False)
+                gpu_id = (
+                    server_args.base_gpu_id
+                    + ((pp_rank % pp_size_per_node) * tp_size_per_node)
+                    + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
+                )
+                proc = mp.Process(
+                    target=run_scheduler_process,
+                    args=(
+                        server_args,
+                        port_args,
+                        gpu_id,
+                        tp_rank,
+                        pp_rank,
+                        None,
+                        writer,
+                    ),
+                )
+                with memory_saver_adapter.configure_subprocess():
+                    proc.start()
                scheduler_procs.append(proc)
                scheduler_pipe_readers.append(reader)
     else:
         # Launch the data parallel controller
         reader, writer = mp.Pipe(duplex=False)
@@ -584,6 +604,8 @@ def _launch_subprocesses(
         load_chat_template_for_openai_api(
             tokenizer_manager, server_args.chat_template, server_args.model_path
         )
+    else:
+        guess_chat_template_name_from_model_path(server_args.model_path)
 
     if server_args.completion_template:
         load_completion_template_for_openai_api(server_args.completion_template)
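To make the new rank layout in _launch_subprocesses concrete, a minimal standalone sketch (not part of the diff); the nnodes/tp_size/pp_size values are invented for illustration.

    # Hypothetical launch: 2 nodes, tp_size=8, pp_size=2, one full TP group per node.
    nnodes, node_rank = 2, 1
    tp_size, pp_size = 8, 2
    base_gpu_id, gpu_id_step = 0, 1

    nnodes_per_tp_group = max(nnodes // pp_size, 1)      # 1 node per TP group
    tp_size_per_node = tp_size // nnodes_per_tp_group    # 8
    tp_rank_range = range(
        tp_size_per_node * (node_rank % nnodes_per_tp_group),
        tp_size_per_node * (node_rank % nnodes_per_tp_group + 1),
    )                                                     # tp_ranks 0..7 on every node
    pp_size_per_node = max(pp_size // nnodes, 1)          # 1
    pp_rank_range = range(
        pp_size_per_node * (node_rank // nnodes_per_tp_group),
        pp_size_per_node * (node_rank // nnodes_per_tp_group + 1),
    )                                                     # node 1 owns pp_rank 1

    for pp_rank in pp_rank_range:
        for tp_rank in tp_rank_range:
            gpu_id = (
                base_gpu_id
                + (pp_rank % pp_size_per_node) * tp_size_per_node
                + (tp_rank % tp_size_per_node) * gpu_id_step
            )
            print(pp_rank, tp_rank, gpu_id)  # (1, 0, 0) ... (1, 7, 7): local GPUs 0-7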
sglang/srt/function_call_parser.py CHANGED
@@ -1,3 +1,4 @@
+import ast
 import json
 import logging
 import re
@@ -664,6 +665,101 @@ class MultiFormatParser:
         return final_normal_text, final_calls
 
 
+class PythonicDetector(BaseFormatDetector):
+    """
+    Detector for Llama-3.2 and Llama-4 models with pythonic tool call format.
+    Assumes function call format:
+        [tool1(arg1=val1, arg2=val2), tool2(arg1=val3)]
+    Arguments are Python literals (not JSON).
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.tool_call_regex = re.compile(
+            r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
+            re.DOTALL,
+        )
+
+    def has_tool_call(self, text: str) -> bool:
+        return bool(self.tool_call_regex.match(text.strip()))
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        # Try parsing the text as a Python list of function calls
+        text = text.strip()
+        if not (text.startswith("[") and text.endswith("]")):
+            # Not a pythonic tool call format
+            return StreamingParseResult(normal_text=text, calls=[])
+        try:
+            module = ast.parse(text)
+            parsed = getattr(module.body[0], "value", None)
+            if not (
+                isinstance(parsed, ast.List)
+                and all(isinstance(e, ast.Call) for e in parsed.elts)
+            ):
+                return StreamingParseResult(normal_text=text, calls=[])
+            calls = []
+            tool_indices = {
+                tool.function.name: i
+                for i, tool in enumerate(tools)
+                if tool.function.name
+            }
+            for call in parsed.elts:
+                if not isinstance(call.func, ast.Name):
+                    continue
+                function_name = call.func.id
+                arguments = {}
+                for keyword in call.keywords:
+                    arguments[keyword.arg] = self._get_parameter_value(keyword.value)
+                calls.append(
+                    ToolCallItem(
+                        tool_index=tool_indices.get(function_name, -1),
+                        name=function_name,
+                        parameters=json.dumps(arguments, ensure_ascii=False),
+                    )
+                )
+            return StreamingParseResult(normal_text="", calls=calls)
+        except Exception:
+            logger.exception("Error in pythonic tool call parsing.")
+            return StreamingParseResult(normal_text=text, calls=[])
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for pythonic tool calls.
+        Buffers input until a complete pythonic tool call (from [ to ]) is found,
+        then parses and emits any detected calls.
+        """
+        self._buffer += new_text
+        start = self._buffer.find("[")
+        end = self._buffer.find("]", start)
+        if start != -1 and end != -1:
+            call_text = self._buffer[start : end + 1]
+            result = self.detect_and_parse(call_text, tools)
+            self._buffer = self._buffer[end + 1 :]
+            return result
+        return StreamingParseResult(normal_text="")
+
+    def _get_parameter_value(self, val):
+        if isinstance(val, ast.Constant):
+            return val.value
+        elif isinstance(val, ast.Dict):
+            return {
+                k.value: self._get_parameter_value(v)
+                for k, v in zip(val.keys, val.values)
+            }
+        elif isinstance(val, ast.List):
+            return [self._get_parameter_value(v) for v in val.elts]
+        else:
+            raise ValueError("Tool call arguments must be literals")
+
+    def structure_info(self) -> _GetInfoFunc:
+        def info(name: str):
+            return StructureInfo(begin="[", end="]", trigger="")
+
+        return info
+
+
 class FunctionCallParser:
     """
     In streaming scenarios, each time new_text is received, it calls multi_format_parser.parse_streaming_increment
@@ -675,6 +771,7 @@ class FunctionCallParser:
         "qwen25": Qwen25Detector,
         "mistral": MistralDetector,
        "deepseekv3": DeepSeekV3Detector,
+        "pythonic": PythonicDetector,
     }
 
     def __init__(self, tools: List[Tool], tool_call_parser: str):
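A minimal sketch (not part of the diff) of what the new "pythonic" detector accepts; the tool-call string is made up, and an empty tool list is passed, so the reported tool_index is -1.

    from sglang.srt.function_call_parser import PythonicDetector

    det = PythonicDetector()
    text = '[get_weather(city="Paris", unit="celsius")]'
    print(det.has_tool_call(text))                     # True
    res = det.detect_and_parse(text, tools=[])
    print(res.calls[0].name, res.calls[0].parameters)  # get_weather {"city": "Paris", "unit": "celsius"}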
sglang/srt/hf_transformers_utils.py CHANGED
@@ -35,6 +35,7 @@ from sglang.srt.configs import (
     DbrxConfig,
     DeepseekVL2Config,
     ExaoneConfig,
+    KimiVLConfig,
     MultiModalityConfig,
 )
 from sglang.srt.connector import create_remote_connector
@@ -46,6 +47,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     ExaoneConfig.model_type: ExaoneConfig,
     DeepseekVL2Config.model_type: DeepseekVL2Config,
     MultiModalityConfig.model_type: MultiModalityConfig,
+    KimiVLConfig.model_type: KimiVLConfig,
 }
 
 for name, cls in _CONFIG_REGISTRY.items():
sglang/srt/layers/attention/cutlass_mla_backend.py CHANGED
@@ -268,7 +268,7 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
         reshape_q = q.view(-1, layer.tp_q_head_num, layer.head_dim)
 
         o = cutlass_mla_decode(
-            q_nope_and_q_pe=reshape_q,
+            q_nope_and_q_pe=reshape_q.to(self.q_data_type),
             kv_c_and_k_pe_cache=k_cache.view(-1, PAGE_SIZE, self.kv_cache_dim),
             seq_lens=forward_batch.seq_lens.to(torch.int32),
             page_table=self.forward_metadata.block_kv_indices,