sglang 0.4.9.post5__py3-none-any.whl → 0.4.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/model_config.py +6 -0
- sglang/srt/configs/step3_vl.py +172 -0
- sglang/srt/conversation.py +23 -0
- sglang/srt/disaggregation/decode.py +2 -8
- sglang/srt/disaggregation/prefill.py +2 -6
- sglang/srt/distributed/parallel_state.py +86 -1
- sglang/srt/entrypoints/engine.py +14 -18
- sglang/srt/entrypoints/http_server.py +23 -3
- sglang/srt/entrypoints/openai/protocol.py +3 -1
- sglang/srt/entrypoints/openai/serving_base.py +5 -2
- sglang/srt/entrypoints/openai/serving_chat.py +2 -21
- sglang/srt/eplb/expert_distribution.py +5 -0
- sglang/srt/eplb/expert_location.py +17 -6
- sglang/srt/eplb/expert_location_dispatch.py +1 -0
- sglang/srt/eplb/expert_location_updater.py +2 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/step3_detector.py +436 -0
- sglang/srt/hf_transformers_utils.py +2 -0
- sglang/srt/jinja_template_utils.py +4 -1
- sglang/srt/layers/moe/cutlass_moe.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +98 -603
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +83 -118
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
- sglang/srt/layers/moe/fused_moe_triton/layer.py +97 -38
- sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +48 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +19 -0
- sglang/srt/layers/moe/topk.py +6 -2
- sglang/srt/layers/quantization/fp8.py +0 -18
- sglang/srt/layers/quantization/modelopt_quant.py +2 -0
- sglang/srt/layers/quantization/unquant.py +0 -8
- sglang/srt/layers/quantization/w4afp8.py +1 -0
- sglang/srt/managers/cache_controller.py +143 -45
- sglang/srt/managers/data_parallel_controller.py +6 -0
- sglang/srt/managers/io_struct.py +12 -2
- sglang/srt/managers/scheduler.py +116 -669
- sglang/srt/managers/scheduler_input_blocker.py +106 -0
- sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
- sglang/srt/managers/template_manager.py +62 -19
- sglang/srt/managers/tokenizer_manager.py +166 -83
- sglang/srt/managers/tp_worker.py +9 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
- sglang/srt/mem_cache/hicache_storage.py +45 -11
- sglang/srt/mem_cache/hiradix_cache.py +15 -4
- sglang/srt/mem_cache/memory_pool_host.py +73 -1
- sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
- sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +177 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
- sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
- sglang/srt/model_executor/model_runner.py +20 -13
- sglang/srt/models/arcee.py +532 -0
- sglang/srt/models/deepseek_v2.py +15 -56
- sglang/srt/models/glm4_moe.py +3 -1
- sglang/srt/models/granitemoe.py +3 -0
- sglang/srt/models/grok.py +3 -0
- sglang/srt/models/hunyuan.py +1 -0
- sglang/srt/models/llama4.py +3 -0
- sglang/srt/models/mixtral.py +3 -0
- sglang/srt/models/olmoe.py +3 -0
- sglang/srt/models/phimoe.py +1 -0
- sglang/srt/models/qwen3_moe.py +12 -69
- sglang/srt/models/step3_vl.py +994 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -16
- sglang/srt/multimodal/processors/step3_vl.py +515 -0
- sglang/srt/poll_based_barrier.py +31 -0
- sglang/srt/reasoning_parser.py +2 -1
- sglang/srt/server_args.py +18 -13
- sglang/srt/speculative/eagle_worker.py +2 -0
- sglang/srt/two_batch_overlap.py +8 -3
- sglang/test/test_utils.py +53 -0
- sglang/utils.py +0 -11
- sglang/version.py +1 -1
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/METADATA +4 -4
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/RECORD +84 -64
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -19,6 +19,7 @@ import json
|
|
19
19
|
import logging
|
20
20
|
import os
|
21
21
|
import random
|
22
|
+
import sys
|
22
23
|
import tempfile
|
23
24
|
from typing import List, Literal, Optional, Union
|
24
25
|
|
@@ -74,6 +75,7 @@ class ServerArgs:
|
|
74
75
|
# Memory and scheduling
|
75
76
|
mem_fraction_static: Optional[float] = None
|
76
77
|
max_running_requests: Optional[int] = None
|
78
|
+
max_queued_requests: Optional[int] = sys.maxsize
|
77
79
|
max_total_tokens: Optional[int] = None
|
78
80
|
chunked_prefill_size: Optional[int] = None
|
79
81
|
max_prefill_tokens: int = 16384
|
@@ -268,14 +270,6 @@ class ServerArgs:
|
|
268
270
|
sm_group_num: int = 3
|
269
271
|
|
270
272
|
def __post_init__(self):
|
271
|
-
# Expert parallelism
|
272
|
-
# We put it here first due to some internal ckpt conversation issues.
|
273
|
-
if self.enable_ep_moe:
|
274
|
-
self.ep_size = self.tp_size
|
275
|
-
logger.warning(
|
276
|
-
f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
|
277
|
-
)
|
278
|
-
|
279
273
|
# Set missing default values
|
280
274
|
if self.tokenizer_path is None:
|
281
275
|
self.tokenizer_path = self.model_path
|
@@ -805,6 +799,12 @@ class ServerArgs:
|
|
805
799
|
default=ServerArgs.max_running_requests,
|
806
800
|
help="The maximum number of running requests.",
|
807
801
|
)
|
802
|
+
parser.add_argument(
|
803
|
+
"--max-queued-requests",
|
804
|
+
type=int,
|
805
|
+
default=ServerArgs.max_queued_requests,
|
806
|
+
help="The maximum number of queued requests. This option is ignored when using disaggregation-mode.",
|
807
|
+
)
|
808
808
|
parser.add_argument(
|
809
809
|
"--max-total-tokens",
|
810
810
|
type=int,
|
@@ -1109,9 +1109,10 @@ class ServerArgs:
|
|
1109
1109
|
"kimi_k2",
|
1110
1110
|
"qwen3_coder",
|
1111
1111
|
"glm45",
|
1112
|
+
"step3",
|
1112
1113
|
],
|
1113
1114
|
default=ServerArgs.tool_call_parser,
|
1114
|
-
help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', and '
|
1115
|
+
help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', 'qwen3_coder', 'glm45', and 'step3'.",
|
1115
1116
|
)
|
1116
1117
|
|
1117
1118
|
# Data parallelism
|
@@ -1326,6 +1327,7 @@ class ServerArgs:
|
|
1326
1327
|
parser.add_argument(
|
1327
1328
|
"--expert-parallel-size",
|
1328
1329
|
"--ep-size",
|
1330
|
+
"--ep",
|
1329
1331
|
type=int,
|
1330
1332
|
default=ServerArgs.ep_size,
|
1331
1333
|
help="The expert parallelism size.",
|
@@ -1468,7 +1470,7 @@ class ServerArgs:
|
|
1468
1470
|
parser.add_argument(
|
1469
1471
|
"--hicache-storage-backend",
|
1470
1472
|
type=str,
|
1471
|
-
choices=["file"],
|
1473
|
+
choices=["file", "mooncake", "hf3fs"],
|
1472
1474
|
default=ServerArgs.hicache_storage_backend,
|
1473
1475
|
help="The storage backend for hierarchical KV cache.",
|
1474
1476
|
)
|
@@ -2063,6 +2065,9 @@ class PortArgs:
|
|
2063
2065
|
|
2064
2066
|
dist_init_host, dist_init_port = dist_init_addr
|
2065
2067
|
port_base = int(dist_init_port) + 1
|
2068
|
+
detokenizer_port = port_base + 1
|
2069
|
+
rpc_port = port_base + 2
|
2070
|
+
metrics_ipc_name = port_base + 3
|
2066
2071
|
if dp_rank is None:
|
2067
2072
|
# TokenizerManager to DataParallelController
|
2068
2073
|
scheduler_input_port = port_base + 4
|
@@ -2072,10 +2077,10 @@ class PortArgs:
|
|
2072
2077
|
return PortArgs(
|
2073
2078
|
tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
|
2074
2079
|
scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",
|
2075
|
-
detokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base + 1}",
|
2080
|
+
detokenizer_ipc_name=f"tcp://{dist_init_host}:{detokenizer_port}",
|
2076
2081
|
nccl_port=nccl_port,
|
2077
|
-
rpc_ipc_name=f"tcp://{dist_init_host}:{port_base + 2}",
|
2078
|
-
metrics_ipc_name=f"tcp://{dist_init_host}:{port_base + 3}",
|
2082
|
+
rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
|
2083
|
+
metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
|
2079
2084
|
)
|
2080
2085
|
|
2081
2086
|
|
@@ -73,6 +73,7 @@ class EAGLEWorker(TpModelWorker):
|
|
73
73
|
gpu_id: int,
|
74
74
|
tp_rank: int,
|
75
75
|
dp_rank: Optional[int],
|
76
|
+
moe_ep_rank: int,
|
76
77
|
nccl_port: int,
|
77
78
|
target_worker: TpModelWorker,
|
78
79
|
):
|
@@ -127,6 +128,7 @@ class EAGLEWorker(TpModelWorker):
|
|
127
128
|
tp_rank=tp_rank,
|
128
129
|
pp_rank=0, # FIXME
|
129
130
|
dp_rank=dp_rank,
|
131
|
+
moe_ep_rank=moe_ep_rank,
|
130
132
|
nccl_port=nccl_port,
|
131
133
|
is_draft_worker=True,
|
132
134
|
req_to_token_pool=self.req_to_token_pool,
|
sglang/srt/two_batch_overlap.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
1
3
|
import dataclasses
|
2
4
|
import logging
|
3
5
|
from dataclasses import replace
|
4
|
-
from typing import Dict, List, Optional, Sequence, Union
|
6
|
+
from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Union
|
5
7
|
|
6
8
|
import torch
|
7
9
|
|
@@ -20,6 +22,9 @@ from sglang.srt.operations_strategy import OperationsStrategy
|
|
20
22
|
from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
|
21
23
|
from sglang.srt.utils import BumpAllocator, DeepEPMode, get_bool_env_var
|
22
24
|
|
25
|
+
if TYPE_CHECKING:
|
26
|
+
from sglang.srt.layers.moe.ep_moe.token_dispatcher import DispatchOutput
|
27
|
+
|
23
28
|
_tbo_debug = get_bool_env_var("SGLANG_TBO_DEBUG")
|
24
29
|
|
25
30
|
logger = logging.getLogger(__name__)
|
@@ -802,7 +807,7 @@ class MaybeTboDeepEPDispatcher:
|
|
802
807
|
def _execute(self, name, tbo_subbatch_index: Optional[int] = None, **kwargs):
|
803
808
|
return getattr(self._inners[tbo_subbatch_index or 0], name)(**kwargs)
|
804
809
|
|
805
|
-
def dispatch(self, **kwargs):
|
810
|
+
def dispatch(self, **kwargs) -> DispatchOutput:
|
806
811
|
return self._execute("dispatch", **kwargs)
|
807
812
|
|
808
813
|
def dispatch_a(self, **kwargs):
|
@@ -811,7 +816,7 @@ class MaybeTboDeepEPDispatcher:
|
|
811
816
|
def dispatch_b(self, **kwargs):
|
812
817
|
return self._execute("dispatch_b", **kwargs)
|
813
818
|
|
814
|
-
def combine(self, **kwargs):
|
819
|
+
def combine(self, **kwargs) -> torch.Tensor:
|
815
820
|
return self._execute("combine", **kwargs)
|
816
821
|
|
817
822
|
def combine_a(self, **kwargs):
|
sglang/test/test_utils.py
CHANGED
@@ -19,6 +19,7 @@ from pathlib import Path
|
|
19
19
|
from types import SimpleNamespace
|
20
20
|
from typing import Awaitable, Callable, List, Optional, Tuple
|
21
21
|
|
22
|
+
import aiohttp
|
22
23
|
import numpy as np
|
23
24
|
import requests
|
24
25
|
import torch
|
@@ -1303,6 +1304,58 @@ def run_logprob_check(self: unittest.TestCase, arg: Tuple):
|
|
1303
1304
|
raise
|
1304
1305
|
|
1305
1306
|
|
1307
|
+
def send_generate_requests(base_url: str, num_requests: int) -> List[str]:
|
1308
|
+
"""Sends generate request serially and returns status codes. Max concurrency is 1."""
|
1309
|
+
|
1310
|
+
def generate():
|
1311
|
+
prompt = """
|
1312
|
+
System: You are a helpful assistant.
|
1313
|
+
User: What is the capital of France?
|
1314
|
+
Assistant: The capital of France is
|
1315
|
+
"""
|
1316
|
+
response = requests.post(
|
1317
|
+
f"{base_url}/generate",
|
1318
|
+
json={
|
1319
|
+
"text": prompt,
|
1320
|
+
"sampling_params": {
|
1321
|
+
"temperature": 0,
|
1322
|
+
"max_new_tokens": 50,
|
1323
|
+
},
|
1324
|
+
},
|
1325
|
+
)
|
1326
|
+
return response.status_code
|
1327
|
+
|
1328
|
+
return [generate() for _ in range(num_requests)]
|
1329
|
+
|
1330
|
+
|
1331
|
+
async def send_concurrent_generate_requests(
|
1332
|
+
base_url: str, num_requests: int
|
1333
|
+
) -> List[str]:
|
1334
|
+
"""Sends generate request concurrently and returns status codes. Max concurrency is num_requests."""
|
1335
|
+
|
1336
|
+
async def async_generate():
|
1337
|
+
async with aiohttp.ClientSession() as session:
|
1338
|
+
prompt = """
|
1339
|
+
System: You are a helpful assistant.
|
1340
|
+
User: What is the capital of France?
|
1341
|
+
Assistant: The capital of France is
|
1342
|
+
"""
|
1343
|
+
async with session.post(
|
1344
|
+
f"{base_url}/generate",
|
1345
|
+
json={
|
1346
|
+
"text": prompt,
|
1347
|
+
"sampling_params": {
|
1348
|
+
"temperature": 0,
|
1349
|
+
"max_new_tokens": 50,
|
1350
|
+
},
|
1351
|
+
},
|
1352
|
+
) as response:
|
1353
|
+
return response.status
|
1354
|
+
|
1355
|
+
tasks = [asyncio.create_task(async_generate()) for _ in range(num_requests)]
|
1356
|
+
return await asyncio.gather(*tasks)
|
1357
|
+
|
1358
|
+
|
1306
1359
|
class CustomTestCase(unittest.TestCase):
|
1307
1360
|
def _callTestMethod(self, method):
|
1308
1361
|
max_retry = int(
|
sglang/utils.py
CHANGED
@@ -291,17 +291,6 @@ def find_printable_text(text: str):
|
|
291
291
|
return text[: text.rfind(" ") + 1]
|
292
292
|
|
293
293
|
|
294
|
-
def graceful_registry(sub_module_name: str):
|
295
|
-
def graceful_shutdown(signum, frame):
|
296
|
-
logger.info(
|
297
|
-
f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
|
298
|
-
)
|
299
|
-
if signum == signal.SIGTERM:
|
300
|
-
logger.info(f"{sub_module_name} receive sigterm")
|
301
|
-
|
302
|
-
signal.signal(signal.SIGTERM, graceful_shutdown)
|
303
|
-
|
304
|
-
|
305
294
|
class LazyImport:
|
306
295
|
"""Lazy import to make `import sglang` run faster."""
|
307
296
|
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.4.9.post5"
|
1
|
+
__version__ = "0.4.10"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.4.9.post5
|
3
|
+
Version: 0.4.10
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -246,14 +246,14 @@ Requires-Dist: sentencepiece; extra == "runtime-common"
|
|
246
246
|
Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
|
247
247
|
Requires-Dist: scipy; extra == "runtime-common"
|
248
248
|
Requires-Dist: torchao==0.9.0; extra == "runtime-common"
|
249
|
-
Requires-Dist: transformers==4.54.
|
249
|
+
Requires-Dist: transformers==4.54.1; extra == "runtime-common"
|
250
250
|
Requires-Dist: timm==1.0.16; extra == "runtime-common"
|
251
251
|
Requires-Dist: uvicorn; extra == "runtime-common"
|
252
252
|
Requires-Dist: uvloop; extra == "runtime-common"
|
253
253
|
Requires-Dist: xgrammar==0.1.21; extra == "runtime-common"
|
254
254
|
Provides-Extra: srt
|
255
255
|
Requires-Dist: sglang[runtime_common]; extra == "srt"
|
256
|
-
Requires-Dist: sgl-kernel==0.2.
|
256
|
+
Requires-Dist: sgl-kernel==0.2.8; extra == "srt"
|
257
257
|
Requires-Dist: torch==2.7.1; extra == "srt"
|
258
258
|
Requires-Dist: torchaudio==2.7.1; extra == "srt"
|
259
259
|
Requires-Dist: torchvision==0.22.1; extra == "srt"
|
@@ -269,6 +269,7 @@ Requires-Dist: torchvision==0.22.1; extra == "blackwell"
|
|
269
269
|
Requires-Dist: cuda-python; extra == "blackwell"
|
270
270
|
Requires-Dist: einops; extra == "blackwell"
|
271
271
|
Requires-Dist: flashinfer_python==0.2.9rc2; extra == "blackwell"
|
272
|
+
Requires-Dist: tiktoken; extra == "blackwell"
|
272
273
|
Provides-Extra: srt-hip
|
273
274
|
Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
|
274
275
|
Requires-Dist: torch; extra == "srt-hip"
|
@@ -426,7 +427,6 @@ SGLang has been deployed at large scale, generating trillions of tokens in produ
|
|
426
427
|
<img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
|
427
428
|
|
428
429
|
## Contact Us
|
429
|
-
|
430
430
|
For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
|
431
431
|
|
432
432
|
## Acknowledgment
|