sglang 0.5.1.post3__py3-none-any.whl → 0.5.2rc0__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries and is provided for informational purposes only.
- sglang/bench_one_batch.py +3 -0
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +12 -0
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/launch_lb.py +0 -13
- sglang/srt/disaggregation/mini_lb.py +33 -8
- sglang/srt/disaggregation/prefill.py +1 -1
- sglang/srt/distributed/parallel_state.py +24 -14
- sglang/srt/entrypoints/engine.py +19 -12
- sglang/srt/entrypoints/http_server.py +174 -34
- sglang/srt/entrypoints/openai/protocol.py +60 -0
- sglang/srt/eplb/eplb_manager.py +26 -2
- sglang/srt/eplb/expert_distribution.py +29 -2
- sglang/srt/hf_transformers_utils.py +10 -0
- sglang/srt/layers/activation.py +12 -0
- sglang/srt/layers/attention/ascend_backend.py +240 -109
- sglang/srt/layers/attention/hybrid_attn_backend.py +53 -21
- sglang/srt/layers/attention/trtllm_mla_backend.py +25 -10
- sglang/srt/layers/layernorm.py +28 -3
- sglang/srt/layers/linear.py +3 -2
- sglang/srt/layers/logits_processor.py +1 -1
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +12 -6
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/topk.py +35 -12
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -3
- sglang/srt/layers/quantization/modelopt_quant.py +7 -0
- sglang/srt/layers/quantization/mxfp4.py +9 -4
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w8a8_int8.py +7 -3
- sglang/srt/layers/rotary_embedding.py +28 -1
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/managers/cache_controller.py +62 -96
- sglang/srt/managers/detokenizer_manager.py +43 -2
- sglang/srt/managers/io_struct.py +27 -0
- sglang/srt/managers/mm_utils.py +5 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +591 -0
- sglang/srt/managers/scheduler.py +36 -2
- sglang/srt/managers/scheduler_output_processor_mixin.py +20 -18
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/tokenizer_manager.py +86 -39
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +20 -3
- sglang/srt/mem_cache/hiradix_cache.py +75 -68
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +4 -0
- sglang/srt/mem_cache/memory_pool_host.py +2 -4
- sglang/srt/mem_cache/radix_cache.py +5 -4
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +33 -7
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +2 -1
- sglang/srt/mem_cache/swa_radix_cache.py +1 -1
- sglang/srt/model_executor/model_runner.py +5 -4
- sglang/srt/model_loader/loader.py +15 -24
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/models/deepseek_v2.py +26 -10
- sglang/srt/models/gpt_oss.py +0 -14
- sglang/srt/models/llama_eagle3.py +4 -0
- sglang/srt/models/longcat_flash.py +1015 -0
- sglang/srt/models/longcat_flash_nextn.py +691 -0
- sglang/srt/models/qwen2.py +26 -3
- sglang/srt/models/qwen2_5_vl.py +65 -41
- sglang/srt/models/qwen2_moe.py +22 -2
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/server_args.py +112 -55
- sglang/srt/speculative/eagle_worker.py +28 -8
- sglang/srt/utils.py +14 -0
- sglang/test/attention/test_trtllm_mla_backend.py +12 -3
- sglang/version.py +1 -1
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc0.dist-info}/METADATA +5 -5
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc0.dist-info}/RECORD +83 -78
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2rc0.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/http_server.py
CHANGED
@@ -23,6 +23,7 @@ import json
 import logging
 import multiprocessing as multiprocessing
 import os
+import tempfile
 import threading
 import time
 from http import HTTPStatus
@@ -91,11 +92,18 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightVersionReqInput,
     VertexGenerateReqInput,
 )
+from sglang.srt.managers.multi_tokenizer_mixin import (
+    MultiTokenizerManager,
+    deserialize_data,
+    get_main_process_id,
+    read_from_shared_memory,
+    write_data_for_multi_tokenizer,
+)
 from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import ServerStatus, TokenizerManager
 from sglang.srt.metrics.func_timer import enable_func_timer
 from sglang.srt.reasoning_parser import ReasoningParser
-from sglang.srt.server_args import ServerArgs
+from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     add_api_key_middleware,
     add_prometheus_middleware,
@@ -130,8 +138,79 @@ def set_global_state(global_state: _GlobalState):
     _global_state = global_state


+# Function to set up all middlewares for multi-tokenizer compatibility
+def setup_middlewares(api_key: Optional[str], enable_metrics: bool):
+    """Setup all middlewares for both single and multi-process modes"""
+    worker_pid = os.getpid()
+
+    if api_key:
+        add_api_key_middleware(app, api_key)
+        logger.info(f"Worker {worker_pid} added API key middleware")
+
+    if enable_metrics:
+        add_prometheus_middleware(app)
+        enable_func_timer()
+        logger.info(f"Worker {worker_pid} added prometheus middleware")
+
+
+async def init_multi_tokenizer() -> ServerArgs:
+    """Read args information from shm and init tokenizer manager for current process"""
+    pid = os.getpid()
+    main_pid = get_main_process_id()
+    logger.info(f"current worker_id: {pid}, main processID: {main_pid}")
+
+    # Read configuration from shared memory
+    port_args_data = read_from_shared_memory(f"port_args_{main_pid}")
+    server_args_data = read_from_shared_memory(f"server_args_{main_pid}")
+    scheduler_info_data = read_from_shared_memory(f"scheduler_info_{main_pid}")
+    port_args, server_args = deserialize_data(port_args_data, server_args_data)
+    scheduler_info = scheduler_info_data
+
+    port_args.tokenizer_ipc_name = (
+        f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+    )
+
+    # Launch multi-tokenizer manager process
+    tokenizer_manager = MultiTokenizerManager(server_args, port_args)
+    template_manager = TemplateManager()
+    template_manager.initialize_templates(
+        tokenizer_manager=tokenizer_manager,
+        model_path=server_args.model_path,
+        chat_template=server_args.chat_template,
+        completion_template=server_args.completion_template,
+    )
+    # Register this tokenizer with the main tokenizer manager
+    await tokenizer_manager.register_to_main_tokenizer_manager()
+
+    tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
+    set_global_state(
+        _GlobalState(
+            tokenizer_manager=tokenizer_manager,
+            template_manager=template_manager,
+            scheduler_info=scheduler_info,
+        )
+    )
+    return server_args
+
+
 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
+    server_args = getattr(fast_api_app, "server_args", None)
+    if server_args is None:
+        # Initialize multi-tokenizer support for worker processes
+        fast_api_app.server_args = await init_multi_tokenizer()
+        setup_middlewares(
+            fast_api_app.server_args.api_key, fast_api_app.server_args.enable_metrics
+        )
+        fast_api_app.warmup_thread = threading.Thread(
+            target=_wait_and_warmup,
+            args=(
+                fast_api_app.server_args,
+                None, # pipe_finish_writer not needed in worker
+                None, # launch_callback not needed in worker
+            ),
+        )
+
     # Initialize OpenAI serving handlers
     fast_api_app.state.openai_serving_completion = OpenAIServingCompletion(
         _global_state.tokenizer_manager, _global_state.template_manager
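The worker-side init_multi_tokenizer() above reads PortArgs, ServerArgs, and the scheduler info back out of shared memory that the parent process fills in through write_data_for_multi_tokenizer(); both helpers live in the new sglang/srt/managers/multi_tokenizer_mixin.py, which this diff does not expand. A minimal sketch of such a pickle-over-shared-memory round trip, using hypothetical helper names rather than the mixin's real implementation, could look like:

    # Sketch only: the real helpers (write_data_for_multi_tokenizer,
    # read_from_shared_memory, deserialize_data) may differ in detail.
    import pickle
    from multiprocessing import shared_memory

    def write_to_shm(name: str, obj) -> shared_memory.SharedMemory:
        """Pickle obj into a named shared-memory block that workers can attach to."""
        payload = pickle.dumps(obj)
        shm = shared_memory.SharedMemory(name=name, create=True, size=len(payload))
        shm.buf[: len(payload)] = payload
        return shm  # writer keeps the handle so it can unlink() on shutdown

    def read_from_shm(name: str):
        """Attach to an existing block by name and unpickle its contents."""
        shm = shared_memory.SharedMemory(name=name)
        obj = pickle.loads(bytes(shm.buf))  # pickle ignores trailing padding bytes
        shm.close()
        return obj

    if __name__ == "__main__":
        handle = write_to_shm("port_args_demo", {"tokenizer_ipc_name": "ipc:///tmp/x"})
        print(read_from_shm("port_args_demo"))
        handle.unlink()  # mirrors the *_shm.unlink() calls in launch_server's finally block

This is also why launch_server keeps port_args_shm, server_args_shm, and scheduler_info_shm around until shutdown: something has to own the blocks long enough for every uvicorn worker to read them.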
@@ -191,7 +270,15 @@ async def lifespan(fast_api_app: FastAPI):
     warmup_thread = getattr(fast_api_app, "warmup_thread", None)
     if warmup_thread is not None:
         warmup_thread.start()
-    yield
+
+    try:
+        yield
+    finally:
+        if server_args.tokenizer_worker_num > 1:
+            pid = os.getpid()
+            logger.info(f"uvicorn worker {pid} ending...")
+            warmup_thread.join()
+            logger.info(f"uvicorn worker {pid} ended.")


 # Fast API
@@ -480,6 +567,16 @@ async def flush_cache():
     )


+@app.api_route("/clear_hicache_storage_backend", methods=["GET", "POST"])
+async def clear_hicache_storage_backend():
+    """Clear the hierarchical cache storage backend."""
+    ret = await _global_state.tokenizer_manager.clear_hicache_storage()
+    return Response(
+        content="Hierarchical cache storage backend cleared.\n",
+        status_code=200 if ret.success else HTTPStatus.BAD_REQUEST,
+    )
+
+
 @app.api_route("/start_profile", methods=["GET", "POST"])
 async def start_profile_async(obj: Optional[ProfileReqInput] = None):
     """Start profiling."""
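A quick way to exercise the new maintenance route once a server is running; a minimal sketch assuming the default local host and port and the requests package:

    import requests

    # Adjust the base URL to your deployment; 30000 is sglang's default port.
    resp = requests.post("http://127.0.0.1:30000/clear_hicache_storage_backend")
    print(resp.status_code, resp.text)  # "Hierarchical cache storage backend cleared." on success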
@@ -1068,9 +1165,19 @@ def launch_server(
     1. The HTTP server, Engine, and TokenizerManager both run in the main process.
     2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """
-    tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
-        server_args=server_args,
-    )
+    if server_args.tokenizer_worker_num > 1:
+        port_args = PortArgs.init_new(server_args)
+        port_args.tokenizer_worker_ipc_name = (
+            f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+        )
+        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
+            server_args=server_args, port_args=port_args
+        )
+    else:
+        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
+            server_args=server_args,
+        )
+
     set_global_state(
         _GlobalState(
             tokenizer_manager=tokenizer_manager,
@@ -1079,42 +1186,75 @@ def launch_server(
         )
     )

-    # Add api key authorization
-    if server_args.api_key:
-        add_api_key_middleware(app, server_args.api_key)
-
-    # Add prometheus middleware
-    if server_args.enable_metrics:
-        add_prometheus_middleware(app)
-        enable_func_timer()
-
-    # Send a warmup request - we will create the thread launch it
-    # in the lifespan after all other warmups have fired.
-    warmup_thread = threading.Thread(
-        target=_wait_and_warmup,
-        args=(
-            server_args,
-            pipe_finish_writer,
-            launch_callback,
-        ),
-    )
-    app.warmup_thread = warmup_thread
+    if server_args.tokenizer_worker_num > 1:
+        port_args_shm, server_args_shm, scheduler_info_shm = (
+            write_data_for_multi_tokenizer(
+                port_args,
+                server_args,
+                scheduler_info,
+            )
+        )
+    else:
+        # Add api key authorization
+        if server_args.api_key:
+            add_api_key_middleware(app, server_args.api_key)
+
+        # Add prometheus middleware
+        if server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+        # Send a warmup request - we will create the thread launch it
+        # in the lifespan after all other warmups have fired.
+        warmup_thread = threading.Thread(
+            target=_wait_and_warmup,
+            args=(
+                server_args,
+                pipe_finish_writer,
+                launch_callback,
+            ),
+        )
+        app.warmup_thread = warmup_thread

     try:
         # Update logging configs
         set_uvicorn_logging_configs()
         app.server_args = server_args
         # Listen for HTTP requests
-        uvicorn.run(
-            app,
-            host=server_args.host,
-            port=server_args.port,
-            log_level=server_args.log_level_http or server_args.log_level,
-            timeout_keep_alive=5,
-            loop="uvloop",
-        )
+        if server_args.tokenizer_worker_num > 1:
+            from uvicorn.config import LOGGING_CONFIG
+
+            LOGGING_CONFIG["loggers"]["sglang.srt.entrypoints.http_server"] = {
+                "handlers": ["default"],
+                "level": "INFO",
+                "propagate": False,
+            }
+            uvicorn.run(
+                "sglang.srt.entrypoints.http_server:app",
+                host=server_args.host,
+                port=server_args.port,
+                log_level=server_args.log_level_http or server_args.log_level,
+                timeout_keep_alive=5,
+                loop="uvloop",
+                workers=server_args.tokenizer_worker_num,
+            )
+        else:
+            uvicorn.run(
+                app,
+                host=server_args.host,
+                port=server_args.port,
+                log_level=server_args.log_level_http or server_args.log_level,
+                timeout_keep_alive=5,
+                loop="uvloop",
+            )
     finally:
-        warmup_thread.join()
+        if server_args.tokenizer_worker_num > 1:
+            port_args_shm.unlink()
+            server_args_shm.unlink()
+            scheduler_info_shm.unlink()
+            _global_state.tokenizer_manager.clear_tokenizer_mapping()
+        else:
+            warmup_thread.join()


 def _execute_server_warmup(
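All of the branching above keys off server_args.tokenizer_worker_num. A minimal sketch of launching in multi-tokenizer mode from Python, assuming ServerArgs accepts these fields as keyword arguments and leaving everything else at its defaults (the model path is only an example):

    from sglang.srt.entrypoints.http_server import launch_server
    from sglang.srt.server_args import ServerArgs

    if __name__ == "__main__":
        server_args = ServerArgs(
            model_path="meta-llama/Llama-3.1-8B-Instruct",  # substitute your own model
            tokenizer_worker_num=4,  # > 1 switches uvicorn into multi-worker mode
        )
        launch_server(server_args)  # blocks until the server exits

With more than one worker, uvicorn imports the app by its module string and spawns worker processes, and each worker rebuilds its own tokenizer state through the init_multi_tokenizer() path shown earlier.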
sglang/srt/entrypoints/openai/protocol.py
CHANGED
@@ -460,6 +460,66 @@ class ChatCompletionRequest(BaseModel):
             values["tool_choice"] = "auto"
         return values

+    @model_validator(mode="before")
+    @classmethod
+    def normalize_reasoning_inputs(cls, values: Dict):
+        r = values.get("reasoning")
+        if r is None:
+            return values
+
+        if isinstance(r, dict):
+            effort = r.get("effort") or r.get("reasoning_effort")
+            if effort in {"low", "medium", "high"}:
+                values["reasoning_effort"] = effort
+
+            enabled = (
+                r.get("enabled")
+                if r.get("enabled") is not None
+                else r.get("enable", False)
+            )
+            if isinstance(enabled, str):
+                enabled = enabled.strip().lower() in {"1", "true", "yes", "y", "on"}
+            if enabled:
+                ctk = values.get("chat_template_kwargs")
+                if not isinstance(ctk, dict):
+                    ctk = {}
+                ctk.setdefault("thinking", True)
+                values["chat_template_kwargs"] = ctk
+
+        return values
+
+    @model_validator(mode="before")
+    @classmethod
+    def set_json_schema(cls, values):
+        response_format = values.get("response_format")
+        if not response_format:
+            return values
+
+        if response_format.get("type") != "json_schema":
+            return values
+
+        schema = response_format.pop("schema", None)
+        json_schema = response_format.get("json_schema")
+
+        if json_schema:
+            return values
+
+        if schema:
+            name_ = schema.get("title", "Schema")
+            strict_ = False
+            if "properties" in schema and "strict" in schema["properties"]:
+                item = schema["properties"].pop("strict", None)
+                if item and item.get("default", False):
+                    strict_ = True
+
+            response_format["json_schema"] = {
+                "name": name_,
+                "schema": schema,
+                "strict": strict_,
+            }
+
+        return values
+
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     top_k: int = -1
     min_p: float = 0.0
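The normalize_reasoning_inputs validator maps an OpenAI-style reasoning object onto the existing reasoning_effort and chat_template_kwargs fields. The snippet below restates that mapping on a plain dict purely for illustration; it is not an sglang API:

    # What a request body containing "reasoning" is normalized into.
    payload = {
        "model": "any-model",
        "messages": [{"role": "user", "content": "hi"}],
        "reasoning": {"effort": "high", "enabled": "true"},
    }

    r = payload["reasoning"]
    if r.get("effort") in {"low", "medium", "high"}:
        payload["reasoning_effort"] = r["effort"]
    enabled = r.get("enabled") if r.get("enabled") is not None else r.get("enable", False)
    if isinstance(enabled, str):
        enabled = enabled.strip().lower() in {"1", "true", "yes", "y", "on"}
    if enabled:
        payload.setdefault("chat_template_kwargs", {}).setdefault("thinking", True)

    print(payload["reasoning_effort"])      # high
    print(payload["chat_template_kwargs"])  # {'thinking': True}

set_json_schema performs a similar courtesy conversion, turning a bare response_format={"type": "json_schema", "schema": {...}} into the nested json_schema form the backend expects.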
sglang/srt/eplb/eplb_manager.py
CHANGED
@@ -58,9 +58,18 @@ class EPLBManager:
         torch.cuda.synchronize()
         time_start = time.time()

-        logical_count = get_global_expert_distribution_recorder().dump_record(
+        dump_record_output = get_global_expert_distribution_recorder().dump_record(
             output_mode="object"
-        )["logical_count"]
+        )
+        logical_count = dump_record_output["logical_count"]
+        average_utilization_rate_over_window = dump_record_output[
+            "average_utilization_rate_over_window"
+        ]
+
+        # Check whether rebalancing is needed
+        if not self._check_rebalance_needed(average_utilization_rate_over_window):
+            return
+
         expert_location_metadata = ExpertLocationMetadata.init_by_eplb(
             self._server_args, self._model_runner.model_config, logical_count
         )
@@ -81,6 +90,21 @@ class EPLBManager:
         msg += f" time={time_end - time_start:.3f}s"
         logger.info(msg)

+    def _check_rebalance_needed(self, average_utilization_rate_over_window):
+        if average_utilization_rate_over_window is None:
+            return True
+
+        if (
+            average_utilization_rate_over_window
+            > self._server_args.eplb_min_rebalancing_utilization_threshold
+        ):
+            logger.info(
+                f"[EPLBManager] Skipped ep rebalancing: current GPU utilization {average_utilization_rate_over_window:.2f} > minimum rebalance threshold {self._server_args.eplb_min_rebalancing_utilization_threshold:.2f}"
+            )
+            return False
+
+        return True
+
     def _compute_update_layer_ids_chunks(self) -> List[List[int]]:
         all_layer_ids = sorted(
             list(self._model_runner.model.routed_experts_weights_of_layer.keys())
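The gating rule itself is simple: skip rebalancing when the utilization averaged over the recording window is already above eplb_min_rebalancing_utilization_threshold, and fall back to the old always-rebalance behaviour when the metric is unavailable (None). A standalone restatement for illustration, not an sglang API:

    def rebalance_needed(avg_utilization, threshold):
        # None means the metric is disabled (or the threshold is left at 1.0),
        # in which case rebalancing proceeds as before.
        if avg_utilization is None:
            return True
        return avg_utilization <= threshold

    print(rebalance_needed(0.93, 0.90))  # False: GPUs already busy, skip the rebalance
    print(rebalance_needed(0.75, 0.90))  # True: utilization low enough to rebalance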
sglang/srt/eplb/expert_distribution.py
CHANGED
@@ -12,6 +12,7 @@
 # limitations under the License.
 # ==============================================================================
 import logging
+import math
 import os
 import time
 from abc import ABC
@@ -614,8 +615,8 @@ class _UtilizationRateAccumulatorMixin(_Accumulator):
         self._enable = self._server_args.enable_expert_distribution_metrics

         if self._enable:
-            window_sizes = [10, 100, 1000]
-            self._history = _DequeCollection(maxlens=window_sizes)
+            self.window_sizes = [10, 100, 1000]
+            self._history = _DequeCollection(maxlens=self.window_sizes)
             self._rank = torch.distributed.get_rank()

     def append(
@@ -787,6 +788,7 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):
         output = dict(
             rank=self._rank,
             logical_count=logical_count_of_buffered_step,
+            average_utilization_rate_over_window=self._get_global_average_utilization_rate(),
         )

         if output_mode == "file":
@@ -797,6 +799,31 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):
         else:
             raise NotImplementedError

+    def _get_global_average_utilization_rate(self):
+        if not self._enable or math.isclose(
+            self._server_args.eplb_min_rebalancing_utilization_threshold, 1.0
+        ):
+            return None
+
+        if self._rank == 0:
+            utilization_mean_rates = self._history.mean()
+            window_index = self.window_sizes[-1]
+            average_utilization_rate_over_window = (
+                utilization_mean_rates[window_index]
+                if window_index in utilization_mean_rates
+                else 0
+            )
+
+            avg_rate_tensor = torch.tensor(
+                [average_utilization_rate_over_window],
+                dtype=torch.float32,
+                device="cuda",
+            )
+        else:
+            avg_rate_tensor = torch.empty(1, dtype=torch.float32, device="cuda")
+        torch.distributed.broadcast(avg_rate_tensor, src=0)
+        return avg_rate_tensor.item()
+

 def _dump_to_file(name, data):
     save_dir = Path(os.environ.get("SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR", "/tmp"))
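Only rank 0 owns the windowed history, so the mean it computes is broadcast to every rank before EPLBManager acts on it. The pattern in isolation, as a runnable single-process gloo sketch (the real code broadcasts a CUDA tensor across the actual process group):

    import torch
    import torch.distributed as dist

    dist.init_process_group(
        backend="gloo", init_method="tcp://127.0.0.1:29501", rank=0, world_size=1
    )
    if dist.get_rank() == 0:
        value = torch.tensor([0.87], dtype=torch.float32)  # e.g. mean utilization over the window
    else:
        value = torch.empty(1, dtype=torch.float32)
    dist.broadcast(value, src=0)  # every rank now holds the same figure
    print(value.item())
    dist.destroy_process_group()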
sglang/srt/hf_transformers_utils.py
CHANGED
@@ -40,6 +40,7 @@ from sglang.srt.configs import (
     DeepseekVL2Config,
     ExaoneConfig,
     KimiVLConfig,
+    LongcatFlashConfig,
     MultiModalityConfig,
     Step3VLConfig,
 )
@@ -56,6 +57,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     KimiVLConfig.model_type: KimiVLConfig,
     InternVLChatConfig.model_type: InternVLChatConfig,
     Step3VLConfig.model_type: Step3VLConfig,
+    LongcatFlashConfig.model_type: LongcatFlashConfig,
 }

 for name, cls in _CONFIG_REGISTRY.items():
@@ -126,6 +128,14 @@ def get_config(
         kwargs["gguf_file"] = model
         model = Path(model).parent

+    if is_remote_url(model):
+        # BaseConnector implements __del__() to clean up the local dir.
+        # Since config files need to exist all the time, so we DO NOT use
+        # with statement to avoid closing the client.
+        client = create_remote_connector(model)
+        client.pull_files(ignore_pattern=["*.pt", "*.safetensors", "*.bin"])
+        model = client.get_local_dir()
+
     config = AutoConfig.from_pretrained(
         model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
     )
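With this change get_config can be pointed at a remote location: the connector pulls every file except the weights into a local directory first, and AutoConfig then loads from that directory. A usage sketch; the redis:// URL is a placeholder, and the accepted schemes depend on what is_remote_url and create_remote_connector support in this release:

    from sglang.srt.hf_transformers_utils import get_config

    config = get_config(
        "redis://localhost:6379/my-model",  # hypothetical remote model location
        trust_remote_code=True,
        revision=None,
    )
    print(type(config).__name__)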
sglang/srt/layers/activation.py
CHANGED
@@ -103,6 +103,15 @@ class GeluAndMul(CustomOp):
             raise RuntimeError("GeluAndMul only support tanh or none")
         return out

+    def forward_npu(self, x: torch.Tensor) -> torch.Tensor:
+        y_npu, gelu_npu = torch_npu.npu_geglu(
+            x,
+            dim=-1,
+            approximate=1 if self.approximate == "tanh" else 0,
+            activate_left=True,
+        )
+        return y_npu
+

 class NewGELU(CustomOp):
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
@@ -137,6 +146,9 @@ class QuickGELU(CustomOp):
         gelu_quick(x, out)
         return out

+    def forward_npu(self, x: torch.Tensor) -> torch.Tensor:
+        return torch_npu.npu_fast_gelu(x)
+

 class ScaledActivation(nn.Module):
     """An activation function with post-scale parameters.
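Both additions rely on CustomOp routing each call to a backend-specific forward_* implementation. The toy module below illustrates that dispatch idea with a QuickGELU-style op; it is a sketch only, not sglang's actual CustomOp base class, and torch_npu is imported only when an Ascend build is present:

    import torch

    class QuickGELUSketch(torch.nn.Module):
        """Toy CustomOp-style module: forward() routes to a per-backend implementation."""

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # Prefer the NPU kernel when an Ascend device is available.
            if hasattr(torch, "npu") and torch.npu.is_available():
                return self.forward_npu(x)
            return self.forward_native(x)

        def forward_native(self, x: torch.Tensor) -> torch.Tensor:
            return x * torch.sigmoid(1.702 * x)  # reference QuickGELU

        def forward_npu(self, x: torch.Tensor) -> torch.Tensor:
            import torch_npu  # only present in Ascend builds

            return torch_npu.npu_fast_gelu(x)

    print(QuickGELUSketch()(torch.randn(4)))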