sglang 0.4.9.post5__py3-none-any.whl → 0.4.10__py3-none-any.whl
This diff compares the publicly released contents of the two package versions as they appear in their public registries, and is provided for informational purposes only.
- sglang/bench_one_batch.py +3 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/model_config.py +6 -0
- sglang/srt/configs/step3_vl.py +172 -0
- sglang/srt/conversation.py +23 -0
- sglang/srt/disaggregation/decode.py +2 -8
- sglang/srt/disaggregation/prefill.py +2 -6
- sglang/srt/distributed/parallel_state.py +86 -1
- sglang/srt/entrypoints/engine.py +14 -18
- sglang/srt/entrypoints/http_server.py +23 -3
- sglang/srt/entrypoints/openai/protocol.py +3 -1
- sglang/srt/entrypoints/openai/serving_base.py +5 -2
- sglang/srt/entrypoints/openai/serving_chat.py +2 -21
- sglang/srt/eplb/expert_distribution.py +5 -0
- sglang/srt/eplb/expert_location.py +17 -6
- sglang/srt/eplb/expert_location_dispatch.py +1 -0
- sglang/srt/eplb/expert_location_updater.py +2 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/step3_detector.py +436 -0
- sglang/srt/hf_transformers_utils.py +2 -0
- sglang/srt/jinja_template_utils.py +4 -1
- sglang/srt/layers/moe/cutlass_moe.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +98 -603
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +83 -118
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
- sglang/srt/layers/moe/fused_moe_triton/layer.py +97 -38
- sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +48 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +19 -0
- sglang/srt/layers/moe/topk.py +6 -2
- sglang/srt/layers/quantization/fp8.py +0 -18
- sglang/srt/layers/quantization/modelopt_quant.py +2 -0
- sglang/srt/layers/quantization/unquant.py +0 -8
- sglang/srt/layers/quantization/w4afp8.py +1 -0
- sglang/srt/managers/cache_controller.py +143 -45
- sglang/srt/managers/data_parallel_controller.py +6 -0
- sglang/srt/managers/io_struct.py +12 -2
- sglang/srt/managers/scheduler.py +116 -669
- sglang/srt/managers/scheduler_input_blocker.py +106 -0
- sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
- sglang/srt/managers/template_manager.py +62 -19
- sglang/srt/managers/tokenizer_manager.py +166 -83
- sglang/srt/managers/tp_worker.py +9 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
- sglang/srt/mem_cache/hicache_storage.py +45 -11
- sglang/srt/mem_cache/hiradix_cache.py +15 -4
- sglang/srt/mem_cache/memory_pool_host.py +73 -1
- sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
- sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +177 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
- sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
- sglang/srt/model_executor/model_runner.py +20 -13
- sglang/srt/models/arcee.py +532 -0
- sglang/srt/models/deepseek_v2.py +15 -56
- sglang/srt/models/glm4_moe.py +3 -1
- sglang/srt/models/granitemoe.py +3 -0
- sglang/srt/models/grok.py +3 -0
- sglang/srt/models/hunyuan.py +1 -0
- sglang/srt/models/llama4.py +3 -0
- sglang/srt/models/mixtral.py +3 -0
- sglang/srt/models/olmoe.py +3 -0
- sglang/srt/models/phimoe.py +1 -0
- sglang/srt/models/qwen3_moe.py +12 -69
- sglang/srt/models/step3_vl.py +994 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -16
- sglang/srt/multimodal/processors/step3_vl.py +515 -0
- sglang/srt/poll_based_barrier.py +31 -0
- sglang/srt/reasoning_parser.py +2 -1
- sglang/srt/server_args.py +18 -13
- sglang/srt/speculative/eagle_worker.py +2 -0
- sglang/srt/two_batch_overlap.py +8 -3
- sglang/test/test_utils.py +53 -0
- sglang/utils.py +0 -11
- sglang/version.py +1 -1
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/METADATA +4 -4
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/RECORD +84 -64
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/top_level.txt +0 -0
sglang/srt/managers/scheduler_update_weights_mixin.py (new file)

```diff
@@ -0,0 +1,142 @@
+import logging
+from typing import Tuple
+
+import torch
+
+from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE, GPU_MEMORY_TYPE_WEIGHTS
+from sglang.srt.managers.io_struct import (
+    GetWeightsByNameReqInput,
+    GetWeightsByNameReqOutput,
+    InitWeightsUpdateGroupReqInput,
+    InitWeightsUpdateGroupReqOutput,
+    ReleaseMemoryOccupationReqInput,
+    ReleaseMemoryOccupationReqOutput,
+    ResumeMemoryOccupationReqInput,
+    ResumeMemoryOccupationReqOutput,
+    UpdateWeightFromDiskReqInput,
+    UpdateWeightFromDiskReqOutput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromDistributedReqOutput,
+    UpdateWeightsFromTensorReqInput,
+    UpdateWeightsFromTensorReqOutput,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class SchedulerUpdateWeightsMixin:
+
+    def update_weights_from_disk(self, recv_req: UpdateWeightFromDiskReqInput):
+        """In-place update of the weights from disk."""
+        success, message = self.tp_worker.update_weights_from_disk(recv_req)
+        if success:
+            flush_cache_success = self.flush_cache()
+            assert flush_cache_success, "Cache flush failed after updating weights"
+        else:
+            logger.error(message)
+        return UpdateWeightFromDiskReqOutput(success, message, 0)
+
+    def init_weights_update_group(self, recv_req: InitWeightsUpdateGroupReqInput):
+        """Initialize the online model parameter update group."""
+        success, message = self.tp_worker.init_weights_update_group(recv_req)
+        return InitWeightsUpdateGroupReqOutput(success, message)
+
+    def update_weights_from_distributed(
+        self,
+        recv_req: UpdateWeightsFromDistributedReqInput,
+    ) -> Tuple[bool, str]:
+        """Update the online model parameter."""
+        success, message = self.tp_worker.update_weights_from_distributed(recv_req)
+        if success:
+            if recv_req.flush_cache:
+                flush_cache_success = self.flush_cache()
+                assert flush_cache_success, "Cache flush failed after updating weights"
+        else:
+            logger.error(message)
+        return UpdateWeightsFromDistributedReqOutput(success, message)
+
+    def update_weights_from_tensor(self, recv_req: UpdateWeightsFromTensorReqInput):
+        """Update the online model parameter from tensors."""
+        success, message = self.tp_worker.update_weights_from_tensor(recv_req)
+        # TODO extract common code b/t update_weights_from_distributed and update_weights_from_tensor later
+        if success:
+            if recv_req.flush_cache:
+                flush_cache_success = self.flush_cache()
+                assert flush_cache_success, "Cache flush failed after updating weights"
+        else:
+            logger.error(message)
+        torch.distributed.barrier(group=self.tp_cpu_group)
+        return UpdateWeightsFromTensorReqOutput(success, message)
+
+    def get_weights_by_name(self, recv_req: GetWeightsByNameReqInput):
+        parameter = self.tp_worker.get_weights_by_name(recv_req)
+        return GetWeightsByNameReqOutput(parameter)
+
+    def release_memory_occupation(self, recv_req: ReleaseMemoryOccupationReqInput):
+        tags = recv_req.tags
+
+        if tags is None or len(tags) == 0:
+            tags = [GPU_MEMORY_TYPE_WEIGHTS, GPU_MEMORY_TYPE_KV_CACHE]
+
+        if GPU_MEMORY_TYPE_KV_CACHE in tags:
+            self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_KV_CACHE)
+            self.flush_cache()
+
+        if GPU_MEMORY_TYPE_WEIGHTS in tags:
+            self.stashed_model_static_state = _export_static_state(
+                self.tp_worker.worker.model_runner.model
+            )
+            torch.distributed.barrier(self.tp_cpu_group)
+            self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_WEIGHTS)
+
+        return ReleaseMemoryOccupationReqOutput()
+
+    def resume_memory_occupation(self, recv_req: ResumeMemoryOccupationReqInput):
+        tags = recv_req.tags
+
+        if tags is None or len(tags) == 0:
+            tags = [GPU_MEMORY_TYPE_WEIGHTS, GPU_MEMORY_TYPE_KV_CACHE]
+
+        if GPU_MEMORY_TYPE_WEIGHTS in tags:
+            self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_WEIGHTS)
+            torch.distributed.barrier(self.tp_cpu_group)
+            _import_static_state(
+                self.tp_worker.worker.model_runner.model,
+                self.stashed_model_static_state,
+            )
+            del self.stashed_model_static_state
+
+        if GPU_MEMORY_TYPE_KV_CACHE in tags:
+            self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_KV_CACHE)
+
+        return ResumeMemoryOccupationReqOutput()
+
+    def save_remote_model(self, params):
+        url = params["url"]
+
+        worker = self.tp_worker.worker
+
+        worker.model_runner.save_remote_model(url)
+
+    def save_sharded_model(self, params):
+        worker = self.tp_worker.worker
+
+        worker.model_runner.save_sharded_model(
+            path=params["path"],
+            pattern=params["pattern"],
+            max_size=params["max_size"],
+        )
+
+
+def _export_static_state(model):
+    return dict(
+        buffers=[
+            (name, buffer.detach().clone()) for name, buffer in model.named_buffers()
+        ]
+    )
+
+
+def _import_static_state(model, static_params):
+    self_named_buffers = dict(model.named_buffers())
+    for name, tensor in static_params["buffers"]:
+        self_named_buffers[name][...] = tensor
```
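The most intricate part of the new mixin is the release/resume pair: before weight memory is paused, `_export_static_state` clones every registered buffer, and `_import_static_state` copies the clones back in place once memory is resumed. Below is a minimal self-contained sketch of that stash/restore pattern; the toy `BatchNorm1d` module is invented for illustration, whereas the real code applies this to `model_runner.model` and synchronizes the TP group with `torch.distributed` barriers.

```python
# Sketch of the buffer stash/restore pattern used by the mixin above.
# Toy module for illustration only; sglang applies this to the full model.
import torch
from torch import nn


def export_static_state(model: nn.Module) -> dict:
    # Clone every registered buffer (running stats, RoPE caches, ...)
    # so the values survive while the weight memory is released.
    return {
        "buffers": [(n, b.detach().clone()) for n, b in model.named_buffers()]
    }


def import_static_state(model: nn.Module, stash: dict) -> None:
    # Write the stashed values back into the re-allocated buffers in place.
    named = dict(model.named_buffers())
    for name, tensor in stash["buffers"]:
        named[name][...] = tensor


model = nn.BatchNorm1d(4)
model(torch.randn(8, 4))            # populate running_mean / running_var
stash = export_static_state(model)

model.running_mean.zero_()          # simulate buffer contents being lost
import_static_state(model, stash)
assert not torch.all(model.running_mean == 0)
```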
sglang/srt/managers/template_manager.py

```diff
@@ -53,7 +53,7 @@ class TemplateManager:
     def __init__(self):
         self._chat_template_name: Optional[str] = None
         self._completion_template_name: Optional[str] = None
-        self._jinja_template_content_format: Optional[str] = None
+        self._jinja_template_content_format: Optional[str] = "openai"
 
     @property
     def chat_template_name(self) -> Optional[str]:
@@ -71,31 +71,60 @@ class TemplateManager:
         return self._jinja_template_content_format
 
     def load_chat_template(
-        self, tokenizer_manager, chat_template_arg: str, model_path: str
+        self, tokenizer_manager, chat_template_arg: Optional[str], model_path: str
     ) -> None:
         """
         Load a chat template from various sources.
 
         Args:
             tokenizer_manager: The tokenizer manager instance
-            chat_template_arg: Template name or
+            chat_template_arg: Template name, file path, or None to auto-detect
             model_path: Path to the model
         """
-
+        if chat_template_arg:
+            self._load_explicit_chat_template(tokenizer_manager, chat_template_arg)
+        else:
+            # Try HuggingFace template first
+            hf_template = self._resolve_hf_chat_template(tokenizer_manager)
+            if hf_template:
+                self._jinja_template_content_format = (
+                    detect_jinja_template_content_format(hf_template)
+                )
+                logger.info(
+                    f"Using default HuggingFace chat template with detected content format: {self._jinja_template_content_format}"
+                )
+                return
 
-
-
-
-
-
+            # Fallback to SGLang template guessing
+            self.guess_chat_template_from_model_path(model_path)
+
+            # Set default format if no template was found
+            if self._chat_template_name is None:
+                self._jinja_template_content_format = "string"
+                logger.info(
+                    "No chat template found, defaulting to 'string' content format"
                 )
 
-
-
-
-
-
+    def _load_explicit_chat_template(
+        self, tokenizer_manager, chat_template_arg: str
+    ) -> None:
+        """Load explicitly specified chat template."""
+        logger.info(f"Loading chat template from argument: {chat_template_arg}")
+
+        if chat_template_exists(chat_template_arg):
             self._chat_template_name = chat_template_arg
+            return
+
+        if not os.path.exists(chat_template_arg):
+            raise RuntimeError(
+                f"Chat template {chat_template_arg} is not a built-in template name "
+                "or a valid chat template file path."
+            )
+
+        if chat_template_arg.endswith(".jinja"):
+            self._load_jinja_template(tokenizer_manager, chat_template_arg)
+        else:
+            self._load_json_chat_template(chat_template_arg)
 
     def guess_chat_template_from_model_path(self, model_path: str) -> None:
         """
```
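The `_jinja_template_content_format` value this logic selects ("openai" vs. "string") describes the shape of message content the Jinja template expects. A minimal illustration of the two shapes follows; the payloads are invented for illustration, and the actual detection lives in `detect_jinja_template_content_format` in `sglang.srt.jinja_template_utils`.

```python
# "string" content format: message content is a plain string.
string_msg = {"role": "user", "content": "Describe this image."}

# "openai" content format: content is a list of typed parts, as in the
# OpenAI chat API. This is now the default assumption unless detection
# or the model-path fallback decides otherwise.
openai_msg = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe this image."},
        {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
    ],
}
```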
sglang/srt/managers/template_manager.py (continued)

```diff
@@ -146,10 +175,7 @@ class TemplateManager:
             completion_template: Optional completion template name/path
         """
         # Load chat template
-        if chat_template:
-            self.load_chat_template(tokenizer_manager, chat_template, model_path)
-        else:
-            self.guess_chat_template_from_model_path(model_path)
+        self.load_chat_template(tokenizer_manager, chat_template, model_path)
 
         # Load completion template
         if completion_template:
@@ -166,7 +192,7 @@ class TemplateManager:
                 chat_template
             )
             logger.info(
-                f"Detected chat template content format: {self._jinja_template_content_format}"
+                f"Detected user specified Jinja chat template with content format: {self._jinja_template_content_format}"
             )
 
     def _load_json_chat_template(self, template_path: str) -> None:
@@ -224,3 +250,20 @@ class TemplateManager:
                 override=True,
             )
             self._completion_template_name = template["name"]
+
+    def _resolve_hf_chat_template(self, tokenizer_manager) -> Optional[str]:
+        """
+        Resolve HuggingFace chat template.
+
+        Returns the chat template string if found, None otherwise.
+        """
+        tokenizer = tokenizer_manager.tokenizer
+
+        # Try to get AutoTokenizer chat template
+        try:
+            return tokenizer.get_chat_template()
+        except Exception as e:
+            logger.debug(f"Error getting chat template via get_chat_template(): {e}")
+
+        logger.debug("No HuggingFace chat template found")
+        return None
```
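For reference, the new `_resolve_hf_chat_template` leans on the tokenizer's own `get_chat_template()`, which raises when no template is defined, hence the broad `except`. A standalone sketch of that probe using plain `transformers` is below; the model name is illustrative, not taken from the diff.

```python
# Standalone probe mirroring _resolve_hf_chat_template above.
# Model name is illustrative; any HF model with a chat template works.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
try:
    template = tokenizer.get_chat_template()  # raises if none is defined
    print(template[:80], "...")
except Exception as e:
    print(f"No HuggingFace chat template found: {e}")
```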