sglang 0.4.6.post1__py3-none-any.whl → 0.4.6.post2__py3-none-any.whl
This diff compares the publicly released contents of the two package versions as they appear in their respective public registries; it is provided for informational purposes only.
- sglang/bench_one_batch.py +2 -0
- sglang/check_env.py +3 -3
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/kimi_vl.py +38 -0
- sglang/srt/configs/kimi_vl_moonvit.py +32 -0
- sglang/srt/configs/model_config.py +15 -0
- sglang/srt/conversation.py +122 -1
- sglang/srt/entrypoints/engine.py +44 -22
- sglang/srt/function_call_parser.py +97 -0
- sglang/srt/hf_transformers_utils.py +2 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +107 -82
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -16
- sglang/srt/layers/attention/flashmla_backend.py +3 -0
- sglang/srt/layers/dp_attention.py +5 -2
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +1 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +8 -6
- sglang/srt/layers/quantization/__init__.py +2 -2
- sglang/srt/layers/quantization/deep_gemm.py +1 -1
- sglang/srt/layers/utils.py +35 -0
- sglang/srt/lora/layers.py +35 -9
- sglang/srt/lora/lora_manager.py +84 -35
- sglang/srt/managers/data_parallel_controller.py +52 -34
- sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
- sglang/srt/managers/schedule_batch.py +25 -15
- sglang/srt/managers/scheduler.py +263 -59
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -1
- sglang/srt/managers/tp_worker.py +51 -16
- sglang/srt/managers/tp_worker_overlap_thread.py +9 -3
- sglang/srt/mem_cache/memory_pool.py +70 -36
- sglang/srt/model_executor/cuda_graph_runner.py +82 -19
- sglang/srt/model_executor/forward_batch_info.py +31 -1
- sglang/srt/model_executor/model_runner.py +115 -57
- sglang/srt/models/deepseek_nextn.py +1 -257
- sglang/srt/models/deepseek_v2.py +78 -18
- sglang/srt/models/kimi_vl.py +308 -0
- sglang/srt/models/kimi_vl_moonvit.py +639 -0
- sglang/srt/models/llama.py +92 -30
- sglang/srt/models/llama4.py +2 -1
- sglang/srt/models/llama_eagle.py +4 -1
- sglang/srt/models/llama_eagle3.py +4 -1
- sglang/srt/models/qwen2_moe.py +8 -3
- sglang/srt/models/qwen2_vl.py +0 -12
- sglang/srt/models/qwen3_moe.py +8 -3
- sglang/srt/openai_api/adapter.py +34 -22
- sglang/srt/openai_api/protocol.py +11 -1
- sglang/srt/server_args.py +67 -22
- sglang/srt/speculative/eagle_worker.py +3 -2
- sglang/srt/utils.py +88 -9
- sglang/test/runners.py +4 -0
- sglang/test/test_utils.py +29 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/METADATA +5 -4
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/RECORD +61 -51
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/WHEEL +1 -1
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py
CHANGED
sglang/check_env.py
CHANGED
@@ -20,7 +20,7 @@ def is_cuda_v2():
 PACKAGE_LIST = [
     "sglang",
     "sgl_kernel",
-    "
+    "flashinfer_python",
     "triton",
     "transformers",
     "torchao",
@@ -36,8 +36,8 @@ PACKAGE_LIST = [
     "packaging",
     "psutil",
     "pydantic",
-    "multipart",
-    "
+    "python-multipart",
+    "pyzmq",
     "torchao",
     "uvicorn",
     "uvloop",
sglang/srt/configs/__init__.py
CHANGED
@@ -3,6 +3,8 @@ from sglang.srt.configs.dbrx import DbrxConfig
 from sglang.srt.configs.deepseekvl2 import DeepseekVL2Config
 from sglang.srt.configs.exaone import ExaoneConfig
 from sglang.srt.configs.janus_pro import MultiModalityConfig
+from sglang.srt.configs.kimi_vl import KimiVLConfig
+from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig

 __all__ = [
     "ExaoneConfig",
@@ -10,4 +12,6 @@ __all__ = [
     "DbrxConfig",
     "DeepseekVL2Config",
     "MultiModalityConfig",
+    "KimiVLConfig",
+    "MoonViTConfig",
 ]
sglang/srt/configs/kimi_vl.py
ADDED
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
+from typing import Optional, Union
+
+from transformers.configuration_utils import PretrainedConfig
+
+from sglang.srt.configs.deepseekvl2 import DeepseekV2Config
+from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
+
+
+class KimiVLConfig(PretrainedConfig):
+    model_type = "kimi_vl"
+
+    def __init__(
+        self,
+        vision_config: Optional[Union[dict, MoonViTConfig]] = None,
+        text_config: Optional[Union[dict, DeepseekV2Config]] = None,
+        ignore_index: int = -100,
+        media_placeholder_token_id: int = 163605,
+        pad_token_id: int = 0,
+        **kwargs
+    ):
+        if vision_config is None:
+            vision_config = MoonViTConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = MoonViTConfig(**vision_config)
+        self.vision_config = vision_config
+
+        if text_config is None:
+            text_config = DeepseekV2Config()
+        elif isinstance(text_config, dict):
+            text_config = DeepseekV2Config(**text_config)
+        self.text_config = text_config
+
+        self.ignore_index = ignore_index
+        self.media_placeholder_token_id = media_placeholder_token_id
+
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
sglang/srt/configs/kimi_vl_moonvit.py
ADDED
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
+from transformers.configuration_utils import PretrainedConfig
+
+
+class MoonViTConfig(PretrainedConfig):
+    model_type = "moonvit"
+
+    def __init__(
+        self,
+        patch_size: int = 14,
+        init_pos_emb_height: int = 64,
+        init_pos_emb_width: int = 64,
+        num_attention_heads: int = 16,
+        num_hidden_layers: int = 27,
+        hidden_size: int = 1152,
+        intermediate_size: int = 4304,
+        merge_kernel_size: tuple[int, int] = (2, 2),
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.patch_size = patch_size
+        # Positional embedding config
+        self.init_pos_emb_height = init_pos_emb_height
+        self.init_pos_emb_width = init_pos_emb_width
+        # Transformer config
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        # Patch merger config
+        self.merge_kernel_size = merge_kernel_size
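Taken together, the two new config files mirror the Hugging Face `configuration_kimi_vl.py`: `KimiVLConfig` wraps a `MoonViTConfig` vision tower and a DeepSeek-V2-style text backbone, promoting plain dicts to config objects. A minimal usage sketch (assumes sglang 0.4.6.post2 is installed; the field values are illustrative, not taken from any checkpoint):

```python
# Minimal sketch: build a KimiVLConfig the same way its constructor does,
# passing a plain dict that gets promoted to a MoonViTConfig instance.
from sglang.srt.configs import KimiVLConfig, MoonViTConfig

cfg = KimiVLConfig(
    vision_config={"patch_size": 14, "hidden_size": 1152},  # dict -> MoonViTConfig
    # text_config omitted -> falls back to DeepseekV2Config() defaults
)

assert isinstance(cfg.vision_config, MoonViTConfig)
print(cfg.model_type)                  # "kimi_vl"
print(cfg.media_placeholder_token_id)  # 163605 (default shown above)
```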
sglang/srt/configs/model_config.py
CHANGED
@@ -47,6 +47,7 @@ class ModelConfig:
         dtype: str = "auto",
         quantization: Optional[str] = None,
         override_config_file: Optional[str] = None,
+        is_draft_model: bool = False,
     ) -> None:

         self.model_path = model_path
@@ -85,6 +86,12 @@ class ModelConfig:
         else:
             enable_multimodal = True

+        if (
+            is_draft_model
+            and self.hf_config.architectures[0] == "DeepseekV3ForCausalLM"
+        ):
+            self.hf_config.architectures[0] = "DeepseekV3ForCausalLMNextN"
+
         # Check model type
         self.is_generation = is_generation_model(
             self.hf_config.architectures, is_embedding
@@ -169,6 +176,13 @@ class ModelConfig:
             self.attention_arch = AttentionArch.MLA
             self.kv_lora_rank = self.hf_text_config.kv_lora_rank
             self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
+        elif "KimiVLForConditionalGeneration" in self.hf_config.architectures:
+            self.head_dim = 256
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_text_config.kv_lora_rank
+            self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
+            self.v_head_dim = self.hf_text_config.v_head_dim
+            self.qk_nope_head_dim = self.hf_text_config.qk_nope_head_dim
         else:
             self.attention_arch = AttentionArch.MHA

@@ -523,6 +537,7 @@ multimodal_model_archs = [
     "Qwen2VLForConditionalGeneration",
     "Qwen2_5_VLForConditionalGeneration",
     "CLIPModel",
+    "KimiVLForConditionalGeneration",
 ]

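For context on why the Kimi-VL branch selects `AttentionArch.MLA`: with MLA, the KV cache holds one compressed latent of `kv_lora_rank` elements plus a `qk_rope_head_dim` rotary key per token per layer, instead of full per-head keys and values. A back-of-the-envelope comparison with illustrative DeepSeek-V2-style numbers (assumed values, not read from the Kimi-VL checkpoint):

```python
# Rough per-token, per-layer KV-cache footprint: MLA vs. plain MHA.
# All numbers below are illustrative assumptions.
kv_lora_rank = 512         # compressed KV latent width
qk_rope_head_dim = 64      # decoupled rotary key width
mla_elems = kv_lora_rank + qk_rope_head_dim   # 576 elements per token per layer

num_kv_heads, head_dim = 16, 128              # hypothetical MHA layout
mha_elems = 2 * num_kv_heads * head_dim       # K + V = 4096 elements

print(f"MLA {mla_elems} vs MHA {mha_elems} (~{mha_elems / mla_elems:.1f}x smaller)")
```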
sglang/srt/conversation.py
CHANGED
@@ -17,7 +17,7 @@
 # https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
 import dataclasses
 from enum import IntEnum, auto
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Callable, Dict, List, Optional, Tuple, Union

 from sglang.srt.openai_api.protocol import ChatCompletionRequest

@@ -407,6 +407,7 @@ class Conversation:

 # A global registry for all conversation templates
 chat_templates: Dict[str, Conversation] = {}
+matching_function_registry: List[Callable] = []


 def register_conv_template(template: Conversation, override: bool = False):
@@ -419,6 +420,18 @@ def register_conv_template(template: Conversation, override: bool = False):
     chat_templates[template.name] = template


+def register_conv_template_matching_function(func):
+    matching_function_registry.append(func)
+
+
+def get_conv_template_by_model_path(model_path):
+    for matching_func in matching_function_registry:
+        conv_name = matching_func(model_path)
+        if conv_name is not None:
+            return conv_name
+    return None
+
+
 def chat_template_exists(template_name: str) -> bool:
     return template_name in chat_templates

@@ -792,3 +805,111 @@ register_conv_template(
         audio_token="(<audio>./</audio>)",
     )
 )
+
+# Reference: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/chat_template.jinja
+register_conv_template(
+    Conversation(
+        name="kimi-vl",
+        system_message="You are a helpful assistant",
+        system_template="<|im_system|>system<|im_middle|>{system_message}",
+        roles=(
+            "<|im_user|>user<|im_middle|>",
+            "<|im_assistant|>assistant<|im_middle|>",
+        ),
+        messages=[],
+        sep="<|im_end|>",
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        stop_str="<|im_end|>",
+        image_token="<|media_start|>image<|media_content|><|media_pad|><|media_end|>",
+    )
+)
+
+
+@register_conv_template_matching_function
+def match_deepseek_janus_pro(model_path: str):
+    if (
+        "llama" in model_path.lower()
+        and "3.2" in model_path.lower()
+        and "vision" in model_path.lower()
+    ):
+        return "llama_3_vision"
+
+
+@register_conv_template_matching_function
+def match_deepseek_janus_pro(model_path: str):
+    if "janus" in model_path.lower():
+        return "janus-pro"
+
+
+@register_conv_template_matching_function
+def match_vicuna(model_path: str):
+    if "vicuna" in model_path.lower():
+        return "vicuna_v1.1"
+    if "llava-v1.5" in model_path.lower():
+        return "vicuna_v1.1"
+    if "llava-next-video-7b" in model_path.lower():
+        return "vicuna_v1.1"
+
+
+@register_conv_template_matching_function
+def match_llama2_chat(model_path: str):
+    model_path = model_path.lower()
+    if "llama-2" in model_path and "chat" in model_path:
+        return "llama-2"
+    if (
+        "mistral" in model_path or "mixtral" in model_path
+    ) and "instruct" in model_path:
+        return "llama-2"
+    if "codellama" in model_path and "instruct" in model_path:
+        return "llama-2"
+
+
+@register_conv_template_matching_function
+def match_deepseek_vl(model_path: str):
+    model_path = model_path.lower()
+    if "deepseek" in model_path and "vl2" in model_path:
+        return "deepseek-vl2"
+
+
+@register_conv_template_matching_function
+def match_chat_ml(model_path: str):
+    # import pdb;pdb.set_trace()
+    model_path = model_path.lower()
+    # Now the suffix for qwen2 chat model is "instruct"
+    if "gme" in model_path and "qwen" in model_path and "vl" in model_path:
+        return "gme-qwen2-vl"
+    if "qwen" in model_path and "vl" in model_path:
+        return "qwen2-vl"
+    if (
+        "llava-v1.6-34b" in model_path
+        or "llava-v1.6-yi-34b" in model_path
+        or "llava-next-video-34b" in model_path
+        or "llava-onevision-qwen2" in model_path
+    ):
+        return "chatml-llava"
+
+
+@register_conv_template_matching_function
+def match_gemma_it(model_path: str):
+    model_path = model_path.lower()
+    if "gemma" in model_path and "it" in model_path:
+        return "gemma-it"
+    if "gemma-3" in model_path and "1b" not in model_path:
+        # gemma-3-1b-it is completion model
+        return "gemma-it"
+
+
+@register_conv_template_matching_function
+def match_openbmb_minicpm(model_path: str):
+    model_path = model_path.lower()
+    if "minicpm-v" in model_path:
+        return "minicpmv"
+    elif "minicpm-o" in model_path:
+        return "minicpmo"
+
+
+@register_conv_template_matching_function
+def match_moonshot_kimivl(model_path: str):
+    model_path = model_path.lower()
+    if "kimi" in model_path and "vl" in model_path:
+        return "kimi-vl"
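The matching-function registry added above replaces ad-hoc model-path checks: each decorated matcher inspects the path and returns a template name or `None`, and `get_conv_template_by_model_path` returns the first hit in registration order. A minimal sketch of using it (assumes sglang 0.4.6.post2; the model paths and the custom matcher are examples only):

```python
# Minimal sketch: resolve chat template names via the registry introduced above.
from sglang.srt.conversation import (
    get_conv_template_by_model_path,
    register_conv_template_matching_function,
)

print(get_conv_template_by_model_path("moonshotai/Kimi-VL-A3B-Instruct"))  # kimi-vl
print(get_conv_template_by_model_path("lmsys/vicuna-7b-v1.5"))             # vicuna_v1.1

# A deployment can register its own matcher; the first non-None answer wins.
@register_conv_template_matching_function
def match_my_internal_vlm(model_path: str):
    if "my-internal-vlm" in model_path.lower():
        return "chatml-llava"  # reuse a template already registered in conversation.py

print(get_conv_template_by_model_path("org/my-internal-vlm-8b"))           # chatml-llava
```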
sglang/srt/entrypoints/engine.py
CHANGED
@@ -58,7 +58,10 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
-from sglang.srt.openai_api.adapter import
+from sglang.srt.openai_api.adapter import (
+    guess_chat_template_name_from_model_path,
+    load_chat_template_for_openai_api,
+)
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
 from sglang.srt.utils import (
@@ -123,7 +126,6 @@ class Engine(EngineBase):
             server_args=server_args,
             port_args=port_args,
         )
-
         self.server_args = server_args
         self.tokenizer_manager = tokenizer_manager
         self.scheduler_info = scheduler_info
@@ -298,7 +300,6 @@ class Engine(EngineBase):
         internal_states = loop.run_until_complete(
             self.tokenizer_manager.get_internal_state()
         )
-
         return {
             **dataclasses.asdict(self.tokenizer_manager.server_args),
             **self.scheduler_info,
@@ -450,7 +451,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.
+            "0.2.5",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -458,7 +459,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.1.
+            "0.1.1",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )

@@ -517,25 +518,44 @@ def _launch_subprocesses(
         )

         scheduler_pipe_readers = []
-
+
+        nnodes_per_tp_group = max(server_args.nnodes // server_args.pp_size, 1)
+        tp_size_per_node = server_args.tp_size // nnodes_per_tp_group
         tp_rank_range = range(
-            tp_size_per_node * server_args.node_rank,
-            tp_size_per_node * (server_args.node_rank + 1),
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group),
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group + 1),
         )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        pp_size_per_node = max(server_args.pp_size // server_args.nnodes, 1)
+        pp_rank_range = range(
+            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group),
+            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group + 1),
+        )
+
+        for pp_rank in pp_rank_range:
+            for tp_rank in tp_rank_range:
+                reader, writer = mp.Pipe(duplex=False)
+                gpu_id = (
+                    server_args.base_gpu_id
+                    + ((pp_rank % pp_size_per_node) * tp_size_per_node)
+                    + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
+                )
+                proc = mp.Process(
+                    target=run_scheduler_process,
+                    args=(
+                        server_args,
+                        port_args,
+                        gpu_id,
+                        tp_rank,
+                        pp_rank,
+                        None,
+                        writer,
+                    ),
+                )
+                with memory_saver_adapter.configure_subprocess():
+                    proc.start()
+                scheduler_procs.append(proc)
+                scheduler_pipe_readers.append(reader)
     else:
         # Launch the data parallel controller
         reader, writer = mp.Pipe(duplex=False)
@@ -584,6 +604,8 @@ _launch_subprocesses(
         load_chat_template_for_openai_api(
            tokenizer_manager, server_args.chat_template, server_args.model_path
         )
+    else:
+        guess_chat_template_name_from_model_path(server_args.model_path)

     if server_args.completion_template:
         load_completion_template_for_openai_api(server_args.completion_template)
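The rewritten launch loop now iterates over both pipeline-parallel and tensor-parallel ranks, so each node starts one scheduler process per local (pp_rank, tp_rank) pair and maps it to a GPU. The standalone sketch below just replays that arithmetic for a hypothetical 2-node, tp_size=8, pp_size=2 deployment (illustrative values only, not part of sglang):

```python
# Standalone sketch mirroring the rank/GPU arithmetic from the hunk above,
# for a hypothetical deployment: nnodes=2, tp_size=8, pp_size=2.
nnodes, tp_size, pp_size = 2, 8, 2
base_gpu_id, gpu_id_step = 0, 1

nnodes_per_tp_group = max(nnodes // pp_size, 1)    # 1: a TP group fits on one node
tp_size_per_node = tp_size // nnodes_per_tp_group  # 8
pp_size_per_node = max(pp_size // nnodes, 1)       # 1: one PP stage per node

for node_rank in range(nnodes):
    tp_ranks = range(
        tp_size_per_node * (node_rank % nnodes_per_tp_group),
        tp_size_per_node * (node_rank % nnodes_per_tp_group + 1),
    )
    pp_ranks = range(
        pp_size_per_node * (node_rank // nnodes_per_tp_group),
        pp_size_per_node * (node_rank // nnodes_per_tp_group + 1),
    )
    for pp_rank in pp_ranks:
        for tp_rank in tp_ranks:
            gpu_id = (
                base_gpu_id
                + (pp_rank % pp_size_per_node) * tp_size_per_node
                + (tp_rank % tp_size_per_node) * gpu_id_step
            )
            print(f"node {node_rank}: pp_rank={pp_rank} tp_rank={tp_rank} -> GPU {gpu_id}")

# node 0 launches pp_rank 0, tp_ranks 0-7 on GPUs 0-7;
# node 1 launches pp_rank 1, tp_ranks 0-7 on GPUs 0-7.
```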
sglang/srt/function_call_parser.py
CHANGED
@@ -1,3 +1,4 @@
+import ast
 import json
 import logging
 import re
@@ -664,6 +665,101 @@ class MultiFormatParser:
         return final_normal_text, final_calls


+class PythonicDetector(BaseFormatDetector):
+    """
+    Detector for Llama-3.2 and Llama-4 models with pythonic tool call format.
+    Assumes function call format:
+        [tool1(arg1=val1, arg2=val2), tool2(arg1=val3)]
+    Arguments are Python literals (not JSON).
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.tool_call_regex = re.compile(
+            r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
+            re.DOTALL,
+        )
+
+    def has_tool_call(self, text: str) -> bool:
+        return bool(self.tool_call_regex.match(text.strip()))
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        # Try parsing the text as a Python list of function calls
+        text = text.strip()
+        if not (text.startswith("[") and text.endswith("]")):
+            # Not a pythonic tool call format
+            return StreamingParseResult(normal_text=text, calls=[])
+        try:
+            module = ast.parse(text)
+            parsed = getattr(module.body[0], "value", None)
+            if not (
+                isinstance(parsed, ast.List)
+                and all(isinstance(e, ast.Call) for e in parsed.elts)
+            ):
+                return StreamingParseResult(normal_text=text, calls=[])
+            calls = []
+            tool_indices = {
+                tool.function.name: i
+                for i, tool in enumerate(tools)
+                if tool.function.name
+            }
+            for call in parsed.elts:
+                if not isinstance(call.func, ast.Name):
+                    continue
+                function_name = call.func.id
+                arguments = {}
+                for keyword in call.keywords:
+                    arguments[keyword.arg] = self._get_parameter_value(keyword.value)
+                calls.append(
+                    ToolCallItem(
+                        tool_index=tool_indices.get(function_name, -1),
+                        name=function_name,
+                        parameters=json.dumps(arguments, ensure_ascii=False),
+                    )
+                )
+            return StreamingParseResult(normal_text="", calls=calls)
+        except Exception:
+            logger.exception("Error in pythonic tool call parsing.")
+            return StreamingParseResult(normal_text=text, calls=[])
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for pythonic tool calls.
+        Buffers input until a complete pythonic tool call (from [ to ]) is found,
+        then parses and emits any detected calls.
+        """
+        self._buffer += new_text
+        start = self._buffer.find("[")
+        end = self._buffer.find("]", start)
+        if start != -1 and end != -1:
+            call_text = self._buffer[start : end + 1]
+            result = self.detect_and_parse(call_text, tools)
+            self._buffer = self._buffer[end + 1 :]
+            return result
+        return StreamingParseResult(normal_text="")
+
+    def _get_parameter_value(self, val):
+        if isinstance(val, ast.Constant):
+            return val.value
+        elif isinstance(val, ast.Dict):
+            return {
+                k.value: self._get_parameter_value(v)
+                for k, v in zip(val.keys, val.values)
+            }
+        elif isinstance(val, ast.List):
+            return [self._get_parameter_value(v) for v in val.elts]
+        else:
+            raise ValueError("Tool call arguments must be literals")
+
+    def structure_info(self) -> _GetInfoFunc:
+        def info(name: str):
+            return StructureInfo(begin="[", end="]", trigger="")
+
+        return info
+
+
 class FunctionCallParser:
     """
     In streaming scenarios, each time new_text is received, it calls multi_format_parser.parse_streaming_increment
@@ -675,6 +771,7 @@ class FunctionCallParser:
         "qwen25": Qwen25Detector,
         "mistral": MistralDetector,
         "deepseekv3": DeepSeekV3Detector,
+        "pythonic": PythonicDetector,
     }

     def __init__(self, tools: List[Tool], tool_call_parser: str):
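Independent of sglang's detector classes, the core of `PythonicDetector` is plain `ast` parsing: the model output is a Python list of calls whose keyword arguments are literals. A self-contained sketch of the same technique (not the sglang API itself):

```python
# Self-contained sketch of pythonic tool-call parsing with ast, mirroring the
# approach of PythonicDetector above (not the sglang API).
import ast
import json


def parse_pythonic_calls(text: str) -> list[dict]:
    module = ast.parse(text.strip())
    call_list = module.body[0].value
    assert isinstance(call_list, ast.List), "expected a list of calls"
    calls = []
    for call in call_list.elts:
        assert isinstance(call, ast.Call) and isinstance(call.func, ast.Name)
        # Keyword arguments must be Python literals; literal_eval rejects anything else.
        args = {kw.arg: ast.literal_eval(kw.value) for kw in call.keywords}
        calls.append({"name": call.func.id, "arguments": json.dumps(args)})
    return calls


print(parse_pythonic_calls('[get_weather(city="Paris", unit="celsius")]'))
# [{'name': 'get_weather', 'arguments': '{"city": "Paris", "unit": "celsius"}'}]
```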
sglang/srt/hf_transformers_utils.py
CHANGED
@@ -35,6 +35,7 @@ from sglang.srt.configs import (
     DbrxConfig,
     DeepseekVL2Config,
     ExaoneConfig,
+    KimiVLConfig,
     MultiModalityConfig,
 )
 from sglang.srt.connector import create_remote_connector
@@ -46,6 +47,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     ExaoneConfig.model_type: ExaoneConfig,
     DeepseekVL2Config.model_type: DeepseekVL2Config,
     MultiModalityConfig.model_type: MultiModalityConfig,
+    KimiVLConfig.model_type: KimiVLConfig,
 }

 for name, cls in _CONFIG_REGISTRY.items():
sglang/srt/layers/attention/cutlass_mla_backend.py
CHANGED
@@ -268,7 +268,7 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
         reshape_q = q.view(-1, layer.tp_q_head_num, layer.head_dim)

         o = cutlass_mla_decode(
-            q_nope_and_q_pe=reshape_q,
+            q_nope_and_q_pe=reshape_q.to(self.q_data_type),
             kv_c_and_k_pe_cache=k_cache.view(-1, PAGE_SIZE, self.kv_cache_dim),
             seq_lens=forward_batch.seq_lens.to(torch.int32),
             page_table=self.forward_metadata.block_kv_indices,
|