sglang 0.4.9.post4__py3-none-any.whl → 0.4.9.post5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/lang/chat_template.py +21 -0
- sglang/srt/configs/internvl.py +3 -0
- sglang/srt/configs/model_config.py +4 -0
- sglang/srt/constrained/base_grammar_backend.py +10 -2
- sglang/srt/constrained/xgrammar_backend.py +7 -5
- sglang/srt/conversation.py +16 -1
- sglang/srt/debug_utils/__init__.py +0 -0
- sglang/srt/debug_utils/dump_comparator.py +131 -0
- sglang/srt/debug_utils/dumper.py +108 -0
- sglang/srt/debug_utils/text_comparator.py +172 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
- sglang/srt/disaggregation/mooncake/conn.py +16 -0
- sglang/srt/disaggregation/prefill.py +13 -1
- sglang/srt/entrypoints/engine.py +4 -2
- sglang/srt/entrypoints/openai/serving_chat.py +132 -79
- sglang/srt/function_call/ebnf_composer.py +10 -3
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +164 -0
- sglang/srt/function_call/qwen3_coder_detector.py +1 -0
- sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
- sglang/srt/layers/attention/vision.py +56 -8
- sglang/srt/layers/layernorm.py +26 -1
- sglang/srt/layers/logits_processor.py +14 -3
- sglang/srt/layers/moe/ep_moe/layer.py +172 -206
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
- sglang/srt/layers/moe/topk.py +84 -22
- sglang/srt/layers/multimodal.py +11 -8
- sglang/srt/layers/quantization/fp8.py +25 -247
- sglang/srt/layers/quantization/fp8_kernel.py +78 -48
- sglang/srt/layers/quantization/modelopt_quant.py +25 -10
- sglang/srt/layers/quantization/unquant.py +24 -76
- sglang/srt/layers/quantization/w4afp8.py +68 -17
- sglang/srt/lora/lora_registry.py +93 -29
- sglang/srt/managers/cache_controller.py +9 -7
- sglang/srt/managers/mm_utils.py +154 -35
- sglang/srt/managers/multimodal_processor.py +3 -14
- sglang/srt/managers/schedule_batch.py +14 -8
- sglang/srt/managers/scheduler.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +37 -6
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/mem_cache/hiradix_cache.py +5 -2
- sglang/srt/model_executor/model_runner.py +68 -14
- sglang/srt/models/deepseek_v2.py +62 -28
- sglang/srt/models/glm4_moe.py +1035 -0
- sglang/srt/models/glm4_moe_nextn.py +167 -0
- sglang/srt/models/interns1.py +328 -0
- sglang/srt/models/internvl.py +143 -47
- sglang/srt/models/llava.py +9 -5
- sglang/srt/models/minicpmo.py +4 -1
- sglang/srt/models/qwen2_moe.py +2 -2
- sglang/srt/models/qwen3_moe.py +5 -2
- sglang/srt/multimodal/processors/base_processor.py +20 -6
- sglang/srt/multimodal/processors/clip.py +2 -2
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
- sglang/srt/multimodal/processors/gemma3.py +2 -2
- sglang/srt/multimodal/processors/gemma3n.py +2 -2
- sglang/srt/multimodal/processors/internvl.py +21 -8
- sglang/srt/multimodal/processors/janus_pro.py +2 -2
- sglang/srt/multimodal/processors/kimi_vl.py +2 -2
- sglang/srt/multimodal/processors/llava.py +4 -4
- sglang/srt/multimodal/processors/minicpm.py +2 -3
- sglang/srt/multimodal/processors/mlama.py +2 -2
- sglang/srt/multimodal/processors/mllama4.py +18 -111
- sglang/srt/multimodal/processors/phi4mm.py +2 -2
- sglang/srt/multimodal/processors/pixtral.py +2 -2
- sglang/srt/multimodal/processors/qwen_audio.py +2 -2
- sglang/srt/multimodal/processors/qwen_vl.py +2 -2
- sglang/srt/multimodal/processors/vila.py +3 -1
- sglang/srt/reasoning_parser.py +2 -1
- sglang/srt/server_args.py +57 -6
- sglang/srt/utils.py +96 -1
- sglang/srt/weight_sync/utils.py +119 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_utils.py +65 -5
- sglang/utils.py +19 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/METADATA +4 -4
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/RECORD +83 -73
- sglang/srt/debug_utils.py +0 -74
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/top_level.txt +0 -0
sglang/lang/chat_template.py
CHANGED
@@ -448,6 +448,19 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="interns1",
+        default_system_prompt="You are an AI assistant whose name is Intern-S1 (书生大模型).\n- Intern-S1 (书生大模型) is a vision-language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n- Intern-S1 (书生大模型) can understand and communicate fluently in the language chosen by the user such as English and 中文.\nYou are an expert reasoner with extensive experience in all areas. You approach problems through systematic thinking and rigorous reasoning. Your response should reflect deep understanding and precise logical thinking, making your solution path and reasoning clear to others. Please put your thinking process within <think>...</think> tags.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        stop_str=["<|im_end|>", "<|action_end|>"],
+    )
+)
+
 register_chat_template(
     ChatTemplate(
         name="granite-3-instruct",
@@ -609,6 +622,14 @@ def match_internvl_chat(model_path: str):
         return "internvl-2-5"
 
 
+@register_chat_template_matching_function
+def match_interns1_chat(model_path: str):
+    if re.search(r"intern-s1", model_path, re.IGNORECASE):
+        return "interns1"
+    if re.search(r"interns1", model_path, re.IGNORECASE):
+        return "interns1"
+
+
 if __name__ == "__main__":
     messages = [
         {"role": "system", "content": None},  # None means default
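As a quick illustration of the template registered above, the sketch below is not part of the diff; render_interns1 is a hypothetical helper written only for this example. It shows how the interns1 role prefixes and suffixes compose into a prompt string:

role_prefix_and_suffix = {
    "system": ("<|im_start|>system\n", "<|im_end|>\n"),
    "user": ("<|im_start|>user\n", "<|im_end|>\n"),
    "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
}


def render_interns1(messages):
    # Wrap each message in its role prefix/suffix, then open an assistant
    # turn so generation continues from there (hypothetical helper).
    out = ""
    for role, content in messages:
        prefix, suffix = role_prefix_and_suffix[role]
        out += prefix + content + suffix
    return out + role_prefix_and_suffix["assistant"][0]


print(render_interns1([("system", "You are Intern-S1."), ("user", "Hi!")]))
# <|im_start|>system
# You are Intern-S1.<|im_end|>
# <|im_start|>user
# Hi!<|im_end|>
# <|im_start|>assistant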
sglang/srt/configs/internvl.py
CHANGED
@@ -10,6 +10,7 @@ from transformers import (
     PretrainedConfig,
     PreTrainedTokenizer,
     Qwen2Config,
+    Qwen3Config,
 )
 
 from sglang.utils import logger
@@ -314,6 +315,8 @@ class InternVLChatConfig(PretrainedConfig):
             self.llm_config = InternLM2Config(**llm_config)
         elif llm_config.get("architectures")[0] == "Qwen2ForCausalLM":
             self.llm_config = Qwen2Config(**llm_config)
+        elif llm_config.get("architectures")[0] == "Qwen3MoeForCausalLM":
+            self.llm_config = Qwen3Config(**llm_config)
         else:
             raise ValueError(
                 "Unsupported architecture: {}".format(
sglang/srt/configs/model_config.py
CHANGED
@@ -127,6 +127,9 @@ class ModelConfig:
         ):
             self.hf_config.architectures[0] = "DeepseekV3ForCausalLMNextN"
 
+        if is_draft_model and self.hf_config.architectures[0] == "Glm4MoeForCausalLM":
+            self.hf_config.architectures[0] = "Glm4MoeForCausalLMNextN"
+
         if is_draft_model and self.hf_config.architectures[0] == "MiMoForCausalLM":
             self.hf_config.architectures[0] = "MiMoMTP"
         # Check model type
@@ -635,6 +638,7 @@ multimodal_model_archs = [
     "Qwen2_5_VLForConditionalGeneration",
     "KimiVLForConditionalGeneration",
     "InternVLChatModel",
+    "InternS1ForConditionalGeneration",
     "Phi4MMForCausalLM",
     "VILAForConditionalGeneration",
 ]
sglang/srt/constrained/base_grammar_backend.py
CHANGED
@@ -168,7 +168,10 @@ class BaseGrammarBackend:
 
 
 def create_grammar_backend(
-    server_args: ServerArgs,
+    server_args: ServerArgs,
+    tokenizer,
+    vocab_size: int,
+    eos_token_ids: Optional[set] = None,
 ) -> Optional[BaseGrammarBackend]:
     if server_args.grammar_backend == "outlines":
         from sglang.srt.constrained.outlines_backend import OutlinesGrammarBackend
@@ -180,7 +183,12 @@ def create_grammar_backend(
     elif server_args.grammar_backend == "xgrammar":
        from sglang.srt.constrained.xgrammar_backend import XGrammarGrammarBackend
 
-
+        # Convert Set[int] to List[int] if needed
+        eos_list = list(eos_token_ids) if eos_token_ids else None
+
+        grammar_backend = XGrammarGrammarBackend(
+            tokenizer, vocab_size=vocab_size, model_eos_token_ids=eos_list
+        )
     elif server_args.grammar_backend == "llguidance":
         from sglang.srt.constrained.llguidance_backend import GuidanceBackend
 
sglang/srt/constrained/xgrammar_backend.py
CHANGED
@@ -150,14 +150,16 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
         self,
         tokenizer,
         vocab_size: int,
+        model_eos_token_ids: Optional[List[int]] = None,
     ):
         super().__init__()
 
-
-
-
-
-
+        # Create TokenizerInfo with model's EOS tokens as the authoritative stop tokens
+        # This ensures consistency between what the model considers EOS and what XGrammar uses
+        tokenizer_info = TokenizerInfo.from_huggingface(
+            tokenizer, vocab_size=vocab_size, stop_token_ids=model_eos_token_ids
+        )
+        override_stop_tokens = None
 
         self.grammar_compiler = GrammarCompiler(tokenizer_info=tokenizer_info)
         self.vocab_size = vocab_size
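Together, the two changes above thread the model's EOS token ids from create_grammar_backend into XGrammar's TokenizerInfo. Below is a minimal sketch of the new call path, assuming an already-constructed ServerArgs with grammar_backend="xgrammar"; the tokenizer, model name, and vocab size are illustrative, since in SGLang the scheduler supplies them from the loaded model:

from transformers import AutoTokenizer

from sglang.srt.constrained.base_grammar_backend import create_grammar_backend

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # illustrative
grammar_backend = create_grammar_backend(
    server_args,                             # assumed pre-existing ServerArgs instance
    tokenizer,
    vocab_size=len(tokenizer),               # illustrative; normally the model's vocab size
    eos_token_ids={tokenizer.eos_token_id},  # forwarded to XGrammar as stop tokens
)
# Inside the backend, the set is converted to a list and passed as
# TokenizerInfo.from_huggingface(tokenizer, vocab_size=..., stop_token_ids=eos_list).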
sglang/srt/conversation.py
CHANGED
@@ -623,7 +623,7 @@ def generate_chat_conv(
                 real_content += content.text
             elif content.type == "image_url":
                 # NOTE: works for llava and intervl2_5
-                if conv.name
+                if conv.name in ["internvl-2-5", "interns1"]:
                     real_content = image_token + real_content
                 else:
                     real_content += image_token
@@ -817,6 +817,19 @@ register_conv_template(
     )
 )
 
+register_conv_template(
+    Conversation(
+        name="interns1",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="You are an AI assistant whose name is Intern-S1 (书生大模型).\n- Intern-S1 (书生大模型) is a vision-language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n- Intern-S1 (书生大模型) can understand and communicate fluently in the language chosen by the user such as English and 中文.\nYou are an expert reasoner with extensive experience in all areas. You approach problems through systematic thinking and rigorous reasoning. Your response should reflect deep understanding and precise logical thinking, making your solution path and reasoning clear to others. Please put your thinking process within <think>...</think> tags.",
+        roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+        sep_style=SeparatorStyle.MPT,
+        sep="<|im_end|>\n",
+        stop_str=["<|im_end|>", "<|action_end|>"],
+        image_token="<image>",
+    )
+)
+
 # Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
 register_conv_template(
     Conversation(
@@ -986,6 +999,8 @@ register_conv_template(
 def match_internvl(model_path: str):
     if re.search(r"internvl", model_path, re.IGNORECASE):
         return "internvl-2-5"
+    if re.search(r"intern.*s1", model_path, re.IGNORECASE):
+        return "interns1"
 
 
 @register_conv_template_matching_function
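A small illustration of the generate_chat_conv branch changed above: for interns1 (like internvl-2-5) the image token is prepended to the text of an image_url part, while other templates append it. This is a plain sketch, not the actual function body:

image_token = "<image>"
text = "Describe this picture."

real_content_interns1 = image_token + text  # internvl-2-5 / interns1 branch
real_content_default = text + image_token   # every other template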
sglang/srt/debug_utils/__init__.py
ADDED
File without changes
sglang/srt/debug_utils/dump_comparator.py
ADDED
@@ -0,0 +1,131 @@
+import argparse
+import functools
+import re
+from pathlib import Path
+
+import polars as pl
+import torch
+
+from sglang.srt.debug_utils.dumper import get_truncated_value
+
+
+def main(args):
+    df_target = read_meta(args.target_path)
+    df_target = df_target.sort("rank", "dump_index")
+    df_target = df_target.filter(
+        (pl.col("forward_pass_id") >= args.start_id)
+        & (pl.col("forward_pass_id") <= args.end_id)
+    )
+    assert all(
+        c in df_target.columns
+        for c in ["rank", "forward_pass_id", "dump_index", "name"]
+    )
+
+    df_baseline = read_meta(args.baseline_path)
+    print("df_target", df_target)
+    print("df_baseline", df_baseline)
+
+    for row in df_target.iter_rows(named=True):
+        rows_baseline = df_baseline.filter(
+            (
+                pl.col("forward_pass_id")
+                == row["forward_pass_id"] - args.start_id + args.baseline_start_id
+            )
+            & functools.reduce(
+                lambda a, b: a & b,
+                [
+                    pl.col(col) == row[col]
+                    for col in row.keys()
+                    if col not in ["forward_pass_id", "dump_index", "filename"]
+                ],
+            )
+        )
+        assert len(rows_baseline) == 1, f"{rows_baseline=}"
+        row_baseline = rows_baseline.to_dicts()[0]
+
+        path_baseline = Path(args.baseline_path) / row_baseline["filename"]
+        path_target = Path(args.target_path) / row["filename"]
+        print(f"Check: target={str(path_target)} baseline={str(path_baseline)}")
+        check_tensor_pair(path_baseline=path_baseline, path_target=path_target)
+        print()
+
+
+def read_meta(directory):
+    directory = Path(directory)
+    assert directory.is_dir(), f"{directory=} should be a directory"
+
+    rows = []
+    for p in directory.glob("*.pt"):
+        full_kwargs = {}
+        for kv in p.stem.split("___"):
+            k, v = kv.split("=")
+            full_kwargs[k] = v
+        rows.append(
+            {
+                "filename": str(p.name),
+                **full_kwargs,
+            }
+        )
+
+    df = pl.DataFrame(rows)
+    df = df.with_columns(
+        pl.col("forward_pass_id").cast(int),
+        pl.col("rank").cast(int),
+    )
+    return df
+
+
+def check_tensor_pair(path_baseline, path_target):
+    x_baseline = torch.load(path_baseline, weights_only=True)
+    x_target = torch.load(path_target, weights_only=True)
+
+    print(
+        f"[shape] {x_baseline.shape} vs {x_target.shape}\t"
+        f"[dtype] {x_baseline.dtype} vs {x_target.dtype}"
+    )
+
+    if x_baseline.shape != x_target.shape:
+        print(f"❌ Shape mismatch")
+        return
+
+    raw_abs_diff = (x_target - x_baseline).abs()
+
+    max_abs_diff = raw_abs_diff.max().item()
+    mean_abs_diff = raw_abs_diff.mean().item()
+    rel_diff = _calc_rel_diff(x_target, x_baseline)
+
+    needs_print = max_abs_diff > 1e-3
+
+    print(
+        "\t".join(
+            f"{'❌' if value > 1e-3 else '✅'} {name}={value}"
+            for name, value in [
+                ("rel_diff", rel_diff),
+                ("max_abs_diff", max_abs_diff),
+                ("mean_abs_diff", mean_abs_diff),
+            ]
+        )
+    )
+
+    if needs_print:
+        print(f"x_baseline(sample)={get_truncated_value(x_baseline)}")
+        print(f"x_target(sample)={get_truncated_value(x_target)}")
+
+
+# Copied from DeepGEMM
+def _calc_rel_diff(x: torch.Tensor, y: torch.Tensor):
+    x, y = x.double(), y.double()
+    denominator = (x * x + y * y).sum()
+    sim = 2 * (x * y).sum() / denominator
+    return 1 - sim
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--baseline-path", type=str)
+    parser.add_argument("--target-path", type=str)
+    parser.add_argument("--start-id", type=int, default=0)
+    parser.add_argument("--end-id", type=int, default=1000000)
+    parser.add_argument("--baseline-start-id", type=int, default=0)
+    args = parser.parse_args()
+    main(args)
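Once two runs have been dumped with the dumper defined in the next file, a typical invocation of this comparator might look like the following (directory names and id offsets are illustrative):

python -m sglang.srt.debug_utils.dump_comparator --baseline-path /tmp/sglang_dump_<baseline_run> --target-path /tmp/sglang_dump_<target_run> --start-id 1 --end-id 3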
sglang/srt/debug_utils/dumper.py
ADDED
@@ -0,0 +1,108 @@
+import os
+import time
+from pathlib import Path
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+
+
+class _Dumper:
+    """Utility to dump tensors, which can be useful when comparison checking models.
+
+    Example usage:
+    dumper.on_forward_pass_start()
+    dumper.dump("layer_start__hidden_states", hidden_states, layer_id=self.layer_id)
+
+    Import from non-SGLang system:
+    ```
+    import sys
+    sys.path.append("/YOUR_PATH/sglang/python/sglang/srt/debug_utils")
+    from dumper import dumper
+    ```
+
+    Related: `sglang.srt.debug_utils.dump_comparator` for dump comparison
+    """
+
+    def __init__(self):
+        # Do not import `sglang` to make this file standalone
+        self._enable = bool(int(os.environ.get("SGLANG_DUMPER_ENABLE", "1")))
+        self._base_dir = Path(os.environ.get("SGLANG_DUMPER_DIR", "/tmp"))
+        self._enable_write_file = bool(
+            int(os.environ.get("SGLANG_DUMPER_WRITE_FILE", "1"))
+        )
+        self._partial_name: Optional[str] = None
+        self._dump_index = 0
+        self._forward_pass_id = 0
+
+    def on_forward_pass_start(self):
+        self._forward_pass_id += 1
+        print(
+            f"[Dumper] [{time.time()}] on_forward_pass_start id={self._forward_pass_id}"
+        )
+
+    def dump(self, name, value, **kwargs):
+        if not self._enable:
+            return
+
+        assert (
+            self._forward_pass_id >= 1
+        ), "Do you forget to call `dumper.on_forward_pass_start()`?"
+        self._dump_index += 1
+
+        if self._partial_name is None:
+            self._partial_name = _get_partial_name()
+
+        rank = dist.get_rank()
+        full_kwargs = dict(
+            forward_pass_id=self._forward_pass_id,
+            rank=rank,
+            name=name,
+            dump_index=self._dump_index,
+            **kwargs,
+        )
+        full_filename = "___".join(f"{k}={v}" for k, v in full_kwargs.items()) + ".pt"
+        path = self._base_dir / f"sglang_dump_{self._partial_name}" / full_filename
+
+        sample_value = get_truncated_value(value)
+
+        print(
+            f"[Dumper] [{rank}, {time.time()}] {path} "
+            f"type={type(value)} "
+            f"shape={value.shape if isinstance(value, torch.Tensor) else None} "
+            f"dtype={value.dtype if isinstance(value, torch.Tensor) else None} "
+            f"sample_value={sample_value}"
+        )
+
+        if self._enable_write_file:
+            path.parent.mkdir(parents=True, exist_ok=True)
+            torch.save(value, str(path))
+
+
+def _get_partial_name():
+    rank = dist.get_rank()
+    object_list = [str(time.time()) if rank == 0 else None]
+    dist.broadcast_object_list(object_list, device="cuda")
+    return object_list[0]
+
+
+def get_truncated_value(value):
+    if value is None:
+        return None
+
+    if isinstance(value, tuple):
+        return [get_truncated_value(x) for x in value]
+
+    if not isinstance(value, torch.Tensor):
+        return None
+
+    if value.numel() < 200:
+        return value
+
+    slices = [
+        slice(0, 5) if dim_size > 200 else slice(None) for dim_size in value.shape
+    ]
+    return value[tuple(slices)]
+
+
+dumper = _Dumper()
sglang/srt/debug_utils/text_comparator.py
ADDED
@@ -0,0 +1,172 @@
+import argparse
+import json
+from pathlib import Path
+
+import polars as pl
+
+_DESCRIPTION = """Compare and find differences to benchmark outputs.
+
+Supported inputs:
+* The samples jsonl from `lm_eval --log_samples --output_path FOLDER_NAME`
+* The output from `gsm8k/bench_sglang.py --raw-result-file FILE_NAME` (or mmlu)
+"""
+
+
+def main(args):
+    df_input = _transform_df_input(_compute_df_raw(args))
+    assert all(
+        c in df_input.columns
+        for c in ["category", "trial_index", "prompt_id", "prompt", "output", "correct"]
+    )
+
+    df_meta = _compute_df_meta(df_input)
+
+    df_correctness_per_trial = df_input.group_by(
+        "category", "trial_index", maintain_order=True
+    ).agg(pl.col("correct").mean())
+    df_correctness_delta = (
+        df_meta.group_by("correctness_delta").len().sort("correctness_delta")
+    )
+    df_good_to_bad = df_meta.filter(pl.col("correctness_delta") < 0)
+    df_bad_to_good = df_meta.filter(pl.col("correctness_delta") > 0)
+
+    print(f"Dump output to {args.output_path}")
+    Path(args.output_path).write_text(
+        json.dumps(
+            dict(
+                df_meta=df_meta.to_dicts(),
+                df_good_to_bad=df_good_to_bad.to_dicts(),
+                df_bad_to_good=df_bad_to_good.to_dicts(),
+            )
+        )
+    )
+
+    if not args.disable_print_details:
+        with pl.Config(
+            fmt_str_lengths=10000,
+            tbl_cols=-1,
+            tbl_rows=-1,
+            tbl_width_chars=-1,
+            tbl_formatting="UTF8_FULL",
+        ):
+            print("====== Correctness per trial ======")
+            print(df_correctness_per_trial)
+
+            print(
+                "====== Correctness Delta (-1.0 means all-right becomes all-wrong) ======"
+            )
+            print(df_correctness_delta)
+
+            for name, df in [
+                ("Good->Bad", df_good_to_bad),
+                ("Bad->Good", df_bad_to_good),
+            ]:
+                print(f"====== Concrete Examples: {name} ======")
+                print(df)
+
+
+def _compute_df_raw(args):
+    return pl.concat(
+        [
+            _read_df_raw(p, category=category, trial_index=i)
+            for category, paths in [
+                ("baseline", args.baseline_path),
+                ("target", args.target_path),
+            ]
+            for i, p in enumerate(paths)
+        ]
+    )
+
+
+def _read_df_raw(path: str, category: str, trial_index: int):
+    return pl.read_ndjson(path).with_columns(
+        category=pl.lit(category), trial_index=trial_index
+    )
+
+
+def _transform_df_input(df: pl.DataFrame):
+    if "doc_id" in df.columns:
+        print("Transform mode: lm_eval")
+
+        filter_names = df["filter"].unique(maintain_order=True).to_list()
+        if len(filter_names) > 1:
+            filter_name = filter_names[0]
+            print(f"Choose {filter_name=} among {filter_names}")
+            df = df.filter(pl.col("filter") == filter_name)
+
+        df = df.select(
+            pl.col("category"),
+            pl.col("trial_index"),
+            prompt_id=pl.col("doc_id"),
+            prompt=pl.col("arguments").struct.field("gen_args_0").struct.field("arg_0"),
+            output=pl.col("resps").list.get(0).list.get(0),
+            correct=pl.col("exact_match").cast(bool),
+        )
+
+        return df
+    elif "prompt_id" in df.columns:
+        print("Transform mode: SGLang bench")
+        return df
+    else:
+        raise Exception(f"Unknown data: {df.columns}")
+
+
+def _compute_df_meta(df_input: pl.DataFrame):
+    df_input = df_input.sort("prompt_id", "category", "trial_index")
+    df_meta = pl.DataFrame(
+        [
+            _handle_one_prompt(df_one_prompt)
+            for df_one_prompt in df_input.partition_by("prompt_id", maintain_order=True)
+        ]
+    )
+    df_meta = df_meta.with_columns(
+        correctness_delta=pl.col("correctness_target") - pl.col("correctness_baseline"),
+    )
+    df_meta = df_meta.sort("correctness_delta", "output_same_prefix_len")
+    return df_meta
+
+
+def _handle_one_prompt(df_one_prompt: pl.DataFrame):
+    assert len(set(df_one_prompt["prompt"])) == 1
+
+    df_baseline = df_one_prompt.filter(pl.col("category") == "baseline")
+    df_target = df_one_prompt.filter(pl.col("category") == "target")
+
+    outputs_baseline = df_baseline["output"].to_list()
+    outputs_target = df_target["output"].to_list()
+
+    output_same_prefix_len = max(
+        _compute_str_prefix_len(output_baseline, output_target)
+        for output_baseline in outputs_baseline
+        for output_target in outputs_target
+    )
+
+    return dict(
+        prompt_id=df_one_prompt[0, "prompt_id"],
+        correctness_baseline=df_baseline["correct"].mean(),
+        correctness_target=df_target["correct"].mean(),
+        output_same_prefix_len=output_same_prefix_len,
+        prompt=df_one_prompt[0, "prompt"],
+        outputs_baseline=outputs_baseline,
+        outputs_target=outputs_target,
+    )
+
+
+def _compute_str_prefix_len(a: str, b: str) -> int:
+    min_len = min(len(a), len(b))
+    for i in range(min_len):
+        if a[i] != b[i]:
+            return i
+    return min_len
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=_DESCRIPTION)
+    parser.add_argument("--baseline-path", type=str, nargs="+")
+    parser.add_argument("--target-path", type=str, nargs="+")
+    parser.add_argument(
+        "--output-path", type=str, default="/tmp/text_comparator_output.json"
+    )
+    parser.add_argument("--disable-print-details", action="store_true")
+    args = parser.parse_args()
+    main(args)
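For example, to compare one baseline lm_eval samples file against one target run (file names are illustrative; multiple files per side are accepted since both flags use nargs="+"):

python -m sglang.srt.debug_utils.text_comparator --baseline-path baseline_samples.jsonl --target-path target_samples.jsonl --output-path /tmp/text_comparator_output.json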
sglang/srt/disaggregation/decode_schedule_batch_mixin.py
CHANGED
@@ -1,10 +1,12 @@
 from __future__ import annotations
 
 import logging
+from http import HTTPStatus
 from typing import TYPE_CHECKING
 
 import torch
 
+from sglang.srt.disaggregation.utils import prepare_abort
 from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardMode
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 
@@ -102,7 +104,17 @@ class ScheduleBatchDisaggregationDecodeMixin:
             self.output_ids.append(req.output_ids[-1])
             self.tree_cache.cache_unfinished_req(req)
             if req.grammar is not None:
-
+                # FIXME: this try-except block is for handling unexpected xgrammar issue.
+                try:
+                    req.grammar.accept_token(req.output_ids[-1])
+                except ValueError as e:
+                    # Grammar accept_token can raise ValueError if the token is not in the grammar.
+                    # This can happen if the grammar is not set correctly or the token is invalid.
+                    error_message = f"Grammar accept_token failed for req {req.rid} with token {req.output_ids[-1]}: {e}"
+                    self.tree_cache.cache_finished_req(req)
+                    prepare_abort(
+                        req, error_message, status_code=HTTPStatus.INTERNAL_SERVER_ERROR
+                    )
                 req.grammar.finished = req.finished()
         self.output_ids = torch.tensor(self.output_ids, device=self.device)
 
sglang/srt/disaggregation/mooncake/conn.py
CHANGED
@@ -992,6 +992,14 @@ class MooncakeKVSender(BaseKVSender):
             )
             raise KVTransferError(self.bootstrap_room, failure_reason)
 
+    def abort(self):
+        self.kv_mgr.record_failure(
+            self.bootstrap_room,
+            "Aborted by AbortReq.",
+        )
+        # Explicitly set the status to failure since this request has been aborted
+        self.conclude_state = KVPoll.Failed
+
 
 class MooncakeKVReceiver(BaseKVReceiver):
     _ctx = zmq.Context()
@@ -1305,6 +1313,14 @@ class MooncakeKVReceiver(BaseKVReceiver):
             )
             raise KVTransferError(self.bootstrap_room, failure_reason)
 
+    def abort(self):
+        self.kv_mgr.record_failure(
+            self.bootstrap_room,
+            "Aborted by AbortReq.",
+        )
+        # Explicitly set the status to failure since this request has been aborted
+        self.conclude_state = KVPoll.Failed
+
 
 class MooncakeKVBootstrapServer(BaseKVBootstrapServer):
     def __init__(self, port: int):
sglang/srt/disaggregation/prefill.py
CHANGED
@@ -425,7 +425,19 @@ class SchedulerDisaggregationPrefillMixin:
                     self.send_kv_chunk(req, last_chunk=True)
 
                 if req.grammar is not None:
-
+                    # FIXME: this try-except block is for handling unexpected xgrammar issue.
+                    try:
+                        req.grammar.accept_token(next_token_id)
+                    except ValueError as e:
+                        # Grammar accept_token can raise ValueError if the token is not in the grammar.
+                        # This can happen if the grammar is not set correctly or the token is invalid.
+                        error_message = f"Grammar accept_token failed for req {req.rid} with token {next_token_id}: {e}"
+                        self.tree_cache.cache_finished_req(req)
+                        prepare_abort(
+                            req,
+                            error_message,
+                            status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+                        )
                     req.grammar.finished = req.finished()
             else:
                 # being chunked reqs' prefill is not finished
sglang/srt/entrypoints/engine.py
CHANGED
@@ -640,7 +640,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.
+            "0.2.9rc2",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -765,7 +765,9 @@ def _launch_subprocesses(
         # When using `Engine` as a Python API, we don't want to block here.
         return None, None, None
 
-    launch_dummy_health_check_server(
+    launch_dummy_health_check_server(
+        server_args.host, server_args.port, server_args.enable_metrics
+    )
 
     for proc in scheduler_procs:
         proc.join()