sglang 0.4.5.post2__py3-none-any.whl → 0.4.5.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +3 -2
- sglang/compile_deep_gemm.py +136 -0
- sglang/lang/backend/openai.py +5 -1
- sglang/lang/backend/runtime_endpoint.py +5 -1
- sglang/srt/configs/model_config.py +4 -1
- sglang/srt/constrained/xgrammar_backend.py +1 -0
- sglang/srt/disaggregation/decode.py +43 -0
- sglang/srt/disaggregation/mini_lb.py +69 -8
- sglang/srt/disaggregation/mooncake/conn.py +1 -1
- sglang/srt/disaggregation/nixl/__init__.py +1 -0
- sglang/srt/disaggregation/nixl/conn.py +622 -0
- sglang/srt/disaggregation/prefill.py +100 -16
- sglang/srt/disaggregation/utils.py +17 -0
- sglang/srt/entrypoints/engine.py +4 -0
- sglang/srt/entrypoints/http_server.py +3 -7
- sglang/srt/function_call_parser.py +60 -0
- sglang/srt/layers/activation.py +2 -2
- sglang/srt/layers/attention/flashattention_backend.py +781 -150
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +5 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -5
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +7 -3
- sglang/srt/layers/dp_attention.py +1 -1
- sglang/srt/layers/layernorm.py +19 -4
- sglang/srt/layers/moe/ep_moe/layer.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +153 -0
- sglang/srt/layers/quantization/deep_gemm.py +378 -0
- sglang/srt/layers/quantization/fp8_kernel.py +7 -38
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/gptq.py +13 -7
- sglang/srt/layers/quantization/modelopt_quant.py +2 -2
- sglang/srt/layers/quantization/w8a8_int8.py +3 -3
- sglang/srt/layers/rotary_embedding.py +6 -6
- sglang/srt/layers/sampler.py +2 -2
- sglang/srt/managers/data_parallel_controller.py +7 -1
- sglang/srt/managers/io_struct.py +14 -3
- sglang/srt/managers/schedule_batch.py +13 -0
- sglang/srt/managers/scheduler.py +16 -6
- sglang/srt/managers/tokenizer_manager.py +115 -29
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +40 -32
- sglang/srt/mem_cache/memory_pool.py +31 -13
- sglang/srt/model_executor/cuda_graph_runner.py +13 -8
- sglang/srt/model_executor/model_runner.py +19 -4
- sglang/srt/models/deepseek_v2.py +9 -6
- sglang/srt/models/minicpm3.py +2 -2
- sglang/srt/models/minicpmo.py +17 -6
- sglang/srt/openai_api/adapter.py +71 -4
- sglang/srt/openai_api/protocol.py +6 -1
- sglang/srt/server_args.py +52 -40
- sglang/srt/speculative/build_eagle_tree.py +2 -2
- sglang/srt/speculative/eagle_utils.py +2 -2
- sglang/srt/speculative/eagle_worker.py +2 -7
- sglang/srt/utils.py +46 -5
- sglang/test/test_utils.py +3 -1
- sglang/version.py +1 -1
- {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/METADATA +3 -3
- {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/RECORD +62 -57
- {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/WHEEL +0 -0
- {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -42,6 +42,10 @@ from sglang.srt.layers.dp_attention import (
 )
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.layers.quantization import monkey_patch_isinstance_for_vllm_base_layer
+from sglang.srt.layers.quantization.deep_gemm import (
+    _ENABLE_JIT_DEEPGEMM,
+    update_deep_gemm_config,
+)
 from sglang.srt.layers.sampler import Sampler
 from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
 from sglang.srt.lora.lora_manager import LoRAManager

@@ -169,6 +173,10 @@ class ModelRunner:
         # Get memory before model loading
         min_per_gpu_memory = self.init_torch_distributed()
 
+        # Update deep gemm configure
+        if _ENABLE_JIT_DEEPGEMM:
+            update_deep_gemm_config(gpu_id, server_args)
+
         # If it is a draft model tp_group can be different.
         self.initialize(min_per_gpu_memory)
 
@@ -221,7 +229,16 @@ class ModelRunner:
         server_args = self.server_args
 
         if server_args.attention_backend is None:
-
+            """
+            We auto select the fastest attention backend according to the current offering
+            1. Models with MHA Architecture (e.g: Llama, QWen)
+                1.1 We will turn on FA3 on hopper unless user use spec decode with topk > 1 or page_size > 1.
+                1.2 In other cases, we will use flashinfer if available, otherwise use triton.
+            2. Models with MLA Architecture and using FA3
+                2.1 We will use FA3 backend on hopper.
+                2.2 Otherwise, we will use triton backend.
+            """
+
             if not self.use_mla_backend:
                 if (
                     is_hopper_with_cuda_12_3()
@@ -234,9 +251,7 @@ class ModelRunner:
                         "flashinfer" if is_flashinfer_available() else "triton"
                     )
             else:
-                if is_hopper_with_cuda_12_3()
-                    server_args
-                ):
+                if is_hopper_with_cuda_12_3():
                     server_args.attention_backend = "fa3"
                 else:
                     server_args.attention_backend = "triton"

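Note on the backend auto-selection: the docstring added above describes the policy in prose. Below is a minimal standalone sketch of that decision order; the function and parameter names are illustrative stand-ins, not the module's real helpers.

def pick_attention_backend(
    use_mla: bool, on_hopper: bool, has_flashinfer: bool, spec_topk: int, page_size: int
) -> str:
    # MHA models: FA3 on Hopper unless spec decode uses topk > 1 or page_size > 1.
    if not use_mla:
        if on_hopper and spec_topk <= 1 and page_size <= 1:
            return "fa3"
        return "flashinfer" if has_flashinfer else "triton"
    # MLA models: FA3 on Hopper, triton elsewhere.
    return "fa3" if on_hopper else "triton"


print(pick_attention_backend(use_mla=False, on_hopper=True, has_flashinfer=True, spec_topk=1, page_size=1))  # fa3
print(pick_attention_backend(use_mla=True, on_hopper=False, has_flashinfer=True, spec_topk=1, page_size=1))  # triton
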
sglang/srt/models/deepseek_v2.py
CHANGED
@@ -57,8 +57,8 @@ from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.deep_gemm import _ENABLE_JIT_DEEPGEMM
 from sglang.srt.layers.quantization.fp8_kernel import (
-    _enable_jit_deepgemm_bmm,
     per_tensor_quant_mla_deep_gemm_masked_fp8,
     per_tensor_quant_mla_fp8,
 )

@@ -86,8 +86,11 @@ _is_hip = is_hip()
 _is_cuda = is_cuda()
 
 if _is_cuda:
-    from deep_gemm import m_grouped_gemm_fp8_fp8_bf16_nt_masked
     from sgl_kernel import awq_dequantize, bmm_fp8, merge_state_v2
+
+    from sglang.srt.layers.quantization.deep_gemm import (
+        grouped_gemm_nt_f8f8bf16_masked as deep_gemm_grouped_gemm_nt_f8f8bf16_masked,
+    )
 else:
     from vllm._custom_ops import awq_dequantize
 
@@ -702,7 +705,7 @@ class DeepseekV2AttentionMLA(nn.Module):
             q_nope_out = q_nope.new_empty(
                 (self.num_local_heads, aligned_m, self.kv_lora_rank)
             )
-
+            deep_gemm_grouped_gemm_nt_f8f8bf16_masked(
                 (q_nope_val, q_nope_scale),
                 (self.w_kc, self.w_scale_k),
                 q_nope_out,

@@ -751,7 +754,7 @@ class DeepseekV2AttentionMLA(nn.Module):
             attn_bmm_output = attn_output.new_empty(
                 (self.num_local_heads, aligned_m, self.v_head_dim)
             )
-
+            deep_gemm_grouped_gemm_nt_f8f8bf16_masked(
                 (attn_output_val, attn_output_scale),
                 (self.w_vc, self.w_scale_v),
                 attn_bmm_output,

@@ -1520,7 +1523,7 @@ class DeepseekV2ForCausalLM(nn.Module):
 
         if (
             _is_cuda
-            and
+            and _ENABLE_JIT_DEEPGEMM
             and weight_block_size[0] == 128
             and weight_block_size[1] == 128
             and model_dtype == torch.bfloat16

@@ -1628,7 +1631,7 @@ class DeepseekV2ForCausalLM(nn.Module):
                                 f"mlp.experts."
                                 f"{self.config.n_routed_experts + num_repeat}"
                                 f".{suffix}",
-                                weights_dict[shared_expert_weight_name]
+                                weights_dict[shared_expert_weight_name],
                             )
                         )
                         names_to_remove += [shared_expert_weight_name]

sglang/srt/models/minicpm3.py
CHANGED
@@ -40,9 +40,9 @@ from sglang.srt.layers.vocab_parallel_embedding import (
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
-from sglang.srt.utils import add_prefix,
+from sglang.srt.utils import add_prefix, is_cuda
 
-if
+if is_cuda():
     from sgl_kernel import bmm_fp8
 
 
sglang/srt/models/minicpmo.py
CHANGED
@@ -25,7 +25,7 @@ import torch.nn.functional as F
 import torch.nn.utils.parametrize as P
 import torch.types
 from torch import nn
-from torch.nn.utils import
+from torch.nn.utils import parametrizations
 from tqdm import tqdm
 from transformers import LlamaConfig, LlamaModel, PretrainedConfig, PreTrainedModel
 from transformers.activations import ACT2FN

@@ -585,7 +585,7 @@ class ConditionalChatTTS(PreTrainedModel):
         self.emb_text = nn.Embedding(config.num_text_tokens, config.hidden_size)
         self.head_code = nn.ModuleList(
             [
-                weight_norm(
+                parametrizations.weight_norm(
                     nn.Linear(config.hidden_size, config.num_audio_tokens, bias=False),
                     name="weight",
                 )

@@ -1859,11 +1859,22 @@ class MiniCPMO(MiniCPMBaseModel):
                 # the checkpoint. Skip them.
                 continue
 
-            #
+            # For weight_norm parametrization, handle both old and new formats
             if self.config.init_tts and "tts" in name:
-
-
-
+                # Handle loading from older checkpoints with weight_g/weight_v format
+                if ".weight_g" in name or ".weight_v" in name:
+                    name = name.replace(
+                        ".weight_g", ".parametrizations.weight.original0"
+                    )
+                    name = name.replace(
+                        ".weight_v", ".parametrizations.weight.original1"
+                    )
+                elif ".weight" in name and name not in params_dict:
+                    param_name = name.replace(
+                        ".weight", ".parametrizations.weight.original0"
+                    )
+                    if param_name in params_dict:
+                        name = param_name
 
             # adapt to VisionAttention
             if "vpm" in name:

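The remapping added above translates legacy `weight_g`/`weight_v` checkpoint keys into the names that `torch.nn.utils.parametrizations.weight_norm` registers. A small standalone sketch of the same substitutions; the sample keys are illustrative, not taken from a real MiniCPM-o checkpoint.

def remap_weight_norm_key(name: str) -> str:
    # Old-style weight_norm parameters map onto the parametrization originals.
    name = name.replace(".weight_g", ".parametrizations.weight.original0")
    name = name.replace(".weight_v", ".parametrizations.weight.original1")
    return name


print(remap_weight_norm_key("tts.head_code.0.weight_g"))
# -> tts.head_code.0.parametrizations.weight.original0
print(remap_weight_norm_key("tts.head_code.0.weight_v"))
# -> tts.head_code.0.parametrizations.weight.original1
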
sglang/srt/openai_api/adapter.py
CHANGED
@@ -938,6 +938,35 @@ def v1_chat_generate_request(
 
         if chat_template_name is None:
             openai_compatible_messages = []
+            if (
+                tools
+                and tokenizer_manager.server_args.tool_call_parser == "deepseekv3"
+            ):
+                # add function call prompt to deepseekv3
+                openai_compatible_messages.append(
+                    {
+                        "role": "system",
+                        "content": """You are a helpful Assistant.
+## Tools
+### Function
+You have the following functions available:
+"""
+                        + "".join(
+                            [
+                                f"""
+- `{tool['name']}`:
+```json
+{json.dumps(tool)}
+```
+"""
+                                for tool in tools
+                            ]
+                        ),
+                    }
+                )
+                # TODO fix the compatible issues with xgrammar
+                strict_tag = None
+
             for message in request.messages:
                 if isinstance(message.content, str):
                     openai_compatible_messages.append(

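The block above injects a system message that lists every available function as a JSON blob when the `deepseekv3` tool-call parser is selected. A standalone sketch of what that injected content looks like for one made-up tool definition:

import json

# A made-up tool definition, only to show the shape of the injected prompt.
tool = {"name": "get_weather", "parameters": {"type": "object", "properties": {"city": {"type": "string"}}}}

section = f"""
- `{tool['name']}`:
```json
{json.dumps(tool)}
```
"""
print(
    "You are a helpful Assistant.\n## Tools\n### Function\n"
    "You have the following functions available:\n" + section
)
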
@@ -950,9 +979,16 @@ def v1_chat_generate_request(
                     openai_compatible_messages.append(
                         {"role": message.role, "content": content["text"]}
                     )
-            if
-
-
+            if (
+                openai_compatible_messages
+                and openai_compatible_messages[-1]["role"] == "assistant"
+            ):
+                if request.continue_final_message:
+                    # Remove the final assistant message so its content can be continued.
+                    assistant_prefix = openai_compatible_messages[-1]["content"]
+                    openai_compatible_messages = openai_compatible_messages[:-1]
+                else:
+                    assistant_prefix = None
             else:
                 assistant_prefix = None
 
@@ -991,7 +1027,33 @@ def v1_chat_generate_request(
                 modalities = []
         else:
             conv = generate_chat_conv(request, chat_template_name)
-
+            # If we should continue the final assistant message, adjust the conversation.
+            if (
+                request.continue_final_message
+                and request.messages
+                and request.messages[-1].role == "assistant"
+            ):
+                # Remove the auto-added blank assistant turn, if present.
+                if conv.messages and conv.messages[-1][1] is None:
+                    conv.messages.pop()
+                # Rebuild the prompt from the conversation.
+                prompt = conv.get_prompt()
+                # Strip any trailing stop tokens or separators that indicate end-of-assistant.
+                if isinstance(conv.stop_str, list):
+                    for stop_token in conv.stop_str:
+                        if prompt.endswith(stop_token):
+                            prompt = prompt[: -len(stop_token)]
+                elif isinstance(conv.stop_str, str) and prompt.endswith(
+                    conv.stop_str
+                ):
+                    prompt = prompt[: -len(conv.stop_str)]
+                if conv.sep and prompt.endswith(conv.sep):
+                    prompt = prompt[: -len(conv.sep)]
+                if getattr(conv, "sep2", None) and prompt.endswith(conv.sep2):
+                    prompt = prompt[: -len(conv.sep2)]
+            else:
+                prompt = conv.get_prompt()
+
             image_data = conv.image_data
             audio_data = conv.audio_data
             modalities = conv.modalities

@@ -1003,6 +1065,7 @@ def v1_chat_generate_request(
             else:
                 stop.extend(request.stop)
             prompt_ids = tokenizer_manager.tokenizer.encode(prompt)
+
         else:
             # Use the raw prompt and stop strings if the messages is already a string.
             prompt_ids = request.messages

@@ -1042,6 +1105,8 @@ def v1_chat_generate_request(
             sampling_params["json_schema"] = convert_json_schema_to_str(
                 request.response_format.json_schema.schema_
             )
+        elif request.response_format and request.response_format.type == "json_object":
+            sampling_params["json_schema"] = '{"type": "object"}'
         elif (
             request.response_format and request.response_format.type == "structural_tag"
         ):

@@ -1109,6 +1174,8 @@ def v1_chat_generate_request(
             rid=request_ids,
             modalities=modalities_list,
             lora_path=lora_paths,
+            bootstrap_host=all_requests[0].bootstrap_host,
+            bootstrap_room=all_requests[0].bootstrap_room,
         )
 
     return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]

sglang/srt/openai_api/protocol.py
CHANGED
@@ -252,7 +252,7 @@ ChatCompletionMessageContentPart = Union[
 
 class ChatCompletionMessageGenericParam(BaseModel):
     role: Literal["system", "assistant", "tool"]
-    content: Union[str, List[ChatCompletionMessageContentTextPart]]
+    content: Union[str, List[ChatCompletionMessageContentTextPart], None]
 
 
 class ChatCompletionMessageUserParam(BaseModel):

@@ -355,12 +355,17 @@ class ChatCompletionRequest(BaseModel):
     stop_token_ids: Optional[List[int]] = None
     no_stop_trim: bool = False
     ignore_eos: bool = False
+    continue_final_message: bool = False
     skip_special_tokens: bool = True
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
     session_params: Optional[Dict] = None
     separate_reasoning: bool = True
     stream_reasoning: bool = True
 
+    # For PD disaggregation
+    bootstrap_host: Optional[str] = None
+    bootstrap_room: Optional[int] = None
+
 
 class FunctionResponse(BaseModel):
     """Function response."""

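Usage sketch for the new request fields: `continue_final_message` tells the server to treat a trailing assistant message as a prefix to extend rather than a finished turn, and `bootstrap_host`/`bootstrap_room` matter only when running prefill/decode (PD) disaggregation. The address, port, and model name below are placeholders for a locally running OpenAI-compatible server.

import requests

resp = requests.post(
    "http://localhost:30000/v1/chat/completions",  # placeholder address
    json={
        "model": "placeholder-model",
        "messages": [
            {"role": "user", "content": "Write a haiku about spring."},
            # This unfinished assistant turn is continued instead of being closed.
            {"role": "assistant", "content": "Cherry petals drift"},
        ],
        "continue_final_message": True,
        "max_tokens": 64,
    },
)
print(resp.json()["choices"][0]["message"]["content"])
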
sglang/srt/server_args.py
CHANGED
@@ -26,11 +26,8 @@ from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     configure_ipv6,
-    get_amdgpu_memory_capacity,
     get_device,
-
-    get_nvgpu_memory_capacity,
-    is_cuda,
+    get_device_memory_capacity,
     is_flashinfer_available,
     is_hip,
     is_port_available,

@@ -49,6 +46,7 @@ class ServerArgs:
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
     skip_tokenizer_init: bool = False
+    enable_tokenizer_batch_encode: bool = False
     load_format: str = "auto"
     trust_remote_code: bool = False
     dtype: str = "auto"

@@ -179,6 +177,8 @@ class ServerArgs:
     tool_call_parser: Optional[str] = None
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
+    hicache_size: int = 0
+    hicache_write_policy: str = "write_through_selective"
     flashinfer_mla_disable_ragged: bool = False
     warmups: Optional[str] = None
     moe_dense_tp_size: Optional[int] = None

@@ -218,28 +218,24 @@ class ServerArgs:
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)
 
-
-            gpu_mem = get_nvgpu_memory_capacity()
-        elif is_hip():
-            gpu_mem = get_amdgpu_memory_capacity()
-        elif self.device == "hpu":
-            gpu_mem = get_hpu_memory_capacity()
-        else:
-            # GPU memory is not known yet or no GPU is available.
-            gpu_mem = None
+        gpu_mem = get_device_memory_capacity(self.device)
 
         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
-            if
-                self.
-
-                self.
-
-                self.
-
-                self.
+            if gpu_mem <= 81920:
+                if self.tp_size >= 16:
+                    self.mem_fraction_static = 0.79
+                elif self.tp_size >= 8:
+                    self.mem_fraction_static = 0.81
+                elif self.tp_size >= 4:
+                    self.mem_fraction_static = 0.85
+                elif self.tp_size >= 2:
+                    self.mem_fraction_static = 0.87
+                else:
+                    self.mem_fraction_static = 0.88
             else:
-
+                # FIXME: more fine grained auto-selection polices
+                self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem
 
         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
@@ -268,8 +264,6 @@ class ServerArgs:
                 self.cuda_graph_max_bs = 8
             else:
                 self.cuda_graph_max_bs = 80
-        else:
-            self.cuda_graph_max_bs = 160
 
         # Set kernel backends for hpu device
         if self.device == "hpu":

@@ -291,13 +285,6 @@ class ServerArgs:
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
 
-        # Expert parallelism
-        if self.enable_ep_moe:
-            self.ep_size = self.tp_size
-            logger.info(
-                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
-            )
-
         self.enable_multimodal: Optional[bool] = self.enable_llama4_multimodal
 
         # Data parallelism attention

@@ -358,7 +345,18 @@ class ServerArgs:
 
         if self.page_size > 1 and self.speculative_eagle_topk > 1:
             self.speculative_eagle_topk = 1
-            logger.info(
+            logger.info(
+                "speculative_eagle_topk is adjusted to 1 when page_size > 1"
+            )
+
+        if (
+            self.speculative_eagle_topk == 1
+            and self.speculative_num_draft_tokens != self.speculative_num_steps + 1
+        ):
+            logger.info(
+                "speculative_num_draft_tokens is adjusted to speculative_num_steps + 1 when speculative_eagle_topk == 1"
+            )
+            self.speculative_num_draft_tokens = self.speculative_num_steps + 1
 
         # The token generated from the verify step is counted.
         # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
@@ -380,14 +378,10 @@ class ServerArgs:
         # PD disaggregation
         if self.disaggregation_mode == "prefill":
             self.disable_cuda_graph = True
-            logger.warning("
-            self.disable_overlap_schedule = True
-            logger.warning("Overlap scheduler is disabled for prefill server")
+            logger.warning("Cuda graph is disabled for prefill server")
         elif self.disaggregation_mode == "decode":
             self.disable_radix_cache = True
-            logger.warning("
-            self.disable_overlap_schedule = True
-            logger.warning("Overlap scheduler is disabled for decode server")
+            logger.warning("KV cache is forced as chunk cache for decode server")
 
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"

@@ -432,6 +426,11 @@ class ServerArgs:
             action="store_true",
             help="If set, skip init tokenizer and pass input_ids in generate request",
         )
+        parser.add_argument(
+            "--enable-tokenizer-batch-encode",
+            action="store_true",
+            help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
+        )
         parser.add_argument(
             "--load-format",
             type=str,

@@ -1087,7 +1086,7 @@ class ServerArgs:
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=["qwen25", "mistral", "llama3"],
+            choices=["qwen25", "mistral", "llama3", "deepseekv3"],
             default=ServerArgs.tool_call_parser,
             help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', and 'llama3'.",
         )

@@ -1099,10 +1098,22 @@ class ServerArgs:
         parser.add_argument(
             "--hicache-ratio",
             type=float,
-            required=False,
             default=ServerArgs.hicache_ratio,
             help="The ratio of the size of host KV cache memory pool to the size of device pool.",
         )
+        parser.add_argument(
+            "--hicache-size",
+            type=int,
+            default=ServerArgs.hicache_size,
+            help="The size of host KV cache memory pool in gigabytes, which will override the hicache_ratio if set.",
+        )
+        parser.add_argument(
+            "--hicache-write-policy",
+            type=str,
+            choices=["write_back", "write_through", "write_through_selective"],
+            default=ServerArgs.hicache_write_policy,
+            help="The write policy of hierarchical cache.",
+        )
         parser.add_argument(
             "--enable-deepep-moe",
             action="store_true",

@@ -1187,6 +1198,7 @@ class ServerArgs:
             "--disaggregation-transfer-backend",
             type=str,
             default=ServerArgs.disaggregation_transfer_backend,
+            choices=["mooncake", "nixl"],
             help="The backend for disaggregation transfer. Default is mooncake.",
         )
         parser.add_argument(

sglang/srt/speculative/build_eagle_tree.py
CHANGED
@@ -4,9 +4,9 @@ from typing import List
 
 import torch
 
-from sglang.srt.utils import
+from sglang.srt.utils import is_cuda, is_hip
 
-if
+if is_cuda() or is_hip():
     from sgl_kernel import (
         build_tree_kernel_efficient as sgl_build_tree_kernel_efficient,
     )

sglang/srt/speculative/eagle_utils.py
CHANGED
@@ -19,9 +19,9 @@ from sglang.srt.managers.schedule_batch import (
 from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
 from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode
 from sglang.srt.speculative.build_eagle_tree import build_tree_kernel_efficient
-from sglang.srt.utils import fast_topk,
+from sglang.srt.utils import fast_topk, is_cuda, is_hip, next_power_of_2
 
-if
+if is_cuda():
     from sgl_kernel import (
         top_k_renorm_prob,
         top_p_renorm_prob,

sglang/srt/speculative/eagle_worker.py
CHANGED
@@ -34,14 +34,9 @@ from sglang.srt.speculative.eagle_utils import (
     select_top_k_tokens,
 )
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
-from sglang.srt.utils import
-    empty_context,
-    fast_topk,
-    get_available_gpu_memory,
-    is_cuda_available,
-)
+from sglang.srt.utils import empty_context, fast_topk, get_available_gpu_memory, is_cuda
 
-if
+if is_cuda():
     from sgl_kernel import segment_packbits
 
 logger = logging.getLogger(__name__)

sglang/srt/utils.py
CHANGED
@@ -78,10 +78,34 @@ time_infos = {}
 
 HIP_FP8_E4M3_FNUZ_MAX = 224.0
 
+_warned_bool_env_var_keys = set()
+
 
 def get_bool_env_var(name: str, default: str = "false") -> bool:
     value = os.getenv(name, default)
-
+    value = value.lower()
+
+    truthy_values = ("true", "1")
+    falsy_values = ("false", "0")
+
+    if (value not in truthy_values) and (value not in falsy_values):
+        if value not in _warned_bool_env_var_keys:
+            logger.warning(
+                f"get_bool_env_var({name}) see non-understandable value={value} and treat as false"
+            )
+            _warned_bool_env_var_keys.add(value)
+
+    return value in truthy_values
+
+
+def get_int_env_var(name: str, default: int = 0) -> int:
+    value = os.getenv(name)
+    if value is None or not value.strip():
+        return default
+    try:
+        return int(value)
+    except ValueError:
+        return default
 
 
 # https://pytorch.org/docs/stable/notes/hip.html#checking-for-hip
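A short sketch of how the reworked `get_bool_env_var` and the new `get_int_env_var` behave; the environment variable names are made up for illustration.

import os

from sglang.srt.utils import get_bool_env_var, get_int_env_var

os.environ["SGLANG_DEMO_FLAG"] = "TRUE"  # case-insensitive truthy value
os.environ["SGLANG_DEMO_BAD"] = "yes"    # unrecognized value: warned once, treated as false
os.environ["SGLANG_DEMO_INT"] = "   "    # blank value falls back to the default

print(get_bool_env_var("SGLANG_DEMO_FLAG"))           # True
print(get_bool_env_var("SGLANG_DEMO_BAD"))            # False (with a one-time warning)
print(get_int_env_var("SGLANG_DEMO_INT", default=8))  # 8
print(get_int_env_var("SGLANG_DEMO_MISSING", 3))      # 3
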
@@ -130,10 +154,6 @@ def is_flashinfer_available():
     return importlib.util.find_spec("flashinfer") is not None and is_cuda()
 
 
-def is_cuda_available():
-    return is_cuda()
-
-
 _ENABLE_TORCH_INFERENCE_MODE = get_bool_env_var(
     "SGLANG_ENABLE_TORCH_INFERENCE_MODE", "false"
 )

@@ -774,6 +794,8 @@ def add_api_key_middleware(app, api_key: str):
             return await call_next(request)
         if request.url.path.startswith("/health"):
             return await call_next(request)
+        if request.url.path.startswith("/metrics"):
+            return await call_next(request)
         if request.headers.get("Authorization") != "Bearer " + api_key:
             return ORJSONResponse(content={"error": "Unauthorized"}, status_code=401)
         return await call_next(request)

@@ -930,6 +952,8 @@ def get_zmq_socket(
         buf_size = -1
 
     socket = context.socket(socket_type)
+    if endpoint.find("[") != -1:
+        socket.setsockopt(zmq.IPV6, 1)
 
     def set_send_opt():
         socket.setsockopt(zmq.SNDHWM, 0)
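The bracket check above turns on IPv6 whenever the endpoint uses the bracketed IPv6 literal form. A minimal standalone pyzmq sketch of the same idea; the address is an example.

import zmq

context = zmq.Context.instance()
socket = context.socket(zmq.PUSH)

endpoint = "tcp://[::1]:5555"  # bracketed IPv6 literal
if endpoint.find("[") != -1:
    # IPv6 is off by default in libzmq; enable it before binding.
    socket.setsockopt(zmq.IPV6, 1)
socket.bind(endpoint)
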
@@ -1146,6 +1170,20 @@ def get_hpu_memory_capacity():
     )
 
 
+def get_device_memory_capacity(device: str = None):
+    if is_cuda():
+        gpu_mem = get_nvgpu_memory_capacity()
+    elif is_hip():
+        gpu_mem = get_amdgpu_memory_capacity()
+    elif device == "hpu":
+        gpu_mem = get_hpu_memory_capacity()
+    else:
+        # GPU memory is not known yet or no GPU is available.
+        gpu_mem = None
+
+    return gpu_mem
+
+
 # Copy from pytorch and OpenRLHF to allow creating multiple main groups.
 # https://github.com/pytorch/pytorch/blob/main/torch/distributed/distributed_c10d.py
 # https://github.com/OpenRLHF/OpenRLHF/blob/main/openrlhf/utils/distributed_util.py

@@ -1913,6 +1951,8 @@ def is_page_size_one(server_args):
     return server_args.page_size == 1
 
 
+# TODO(hebiao064): Accelerate FA3 Spec Decode with topk > 1.
+# TODO(hebiao064): Improve the acc rate for FA3 Spec Decode with topk == 1 and page_size > 1.
 def is_no_spec_infer_or_topk_one(server_args):
     return server_args.speculative_eagle_topk is None or (
         server_args.speculative_eagle_topk is not None

@@ -1930,6 +1970,7 @@ def is_fa3_default_architecture(hf_config):
         "Llama4ForConditionalGeneration",
         "LlamaForCausalLM",
         "MistralForCausalLM",
+        "Gemma2ForCausalLM",
     }
     return architectures[0] in default_archs
 
sglang/test/test_utils.py
CHANGED
@@ -450,7 +450,9 @@ def popen_launch_server(
 
         return_code = process.poll()
         if return_code is not None:
-            raise Exception(
+            raise Exception(
+                f"Server unexpectedly exits ({return_code=}). Usually there will be error logs describing the cause far above this line."
+            )
 
         time.sleep(10)
 
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.5.post2"
+__version__ = "0.4.5.post3"