sglang 0.4.9.post5__py3-none-any.whl → 0.4.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/srt/configs/__init__.py +8 -0
  3. sglang/srt/configs/model_config.py +6 -0
  4. sglang/srt/configs/step3_vl.py +172 -0
  5. sglang/srt/conversation.py +23 -0
  6. sglang/srt/disaggregation/decode.py +2 -8
  7. sglang/srt/disaggregation/prefill.py +2 -6
  8. sglang/srt/distributed/parallel_state.py +86 -1
  9. sglang/srt/entrypoints/engine.py +14 -18
  10. sglang/srt/entrypoints/http_server.py +23 -3
  11. sglang/srt/entrypoints/openai/protocol.py +3 -1
  12. sglang/srt/entrypoints/openai/serving_base.py +5 -2
  13. sglang/srt/entrypoints/openai/serving_chat.py +2 -21
  14. sglang/srt/eplb/expert_distribution.py +5 -0
  15. sglang/srt/eplb/expert_location.py +17 -6
  16. sglang/srt/eplb/expert_location_dispatch.py +1 -0
  17. sglang/srt/eplb/expert_location_updater.py +2 -0
  18. sglang/srt/function_call/function_call_parser.py +2 -0
  19. sglang/srt/function_call/step3_detector.py +436 -0
  20. sglang/srt/hf_transformers_utils.py +2 -0
  21. sglang/srt/jinja_template_utils.py +4 -1
  22. sglang/srt/layers/moe/cutlass_moe.py +2 -1
  23. sglang/srt/layers/moe/ep_moe/layer.py +98 -603
  24. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +83 -118
  25. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
  28. sglang/srt/layers/moe/fused_moe_triton/layer.py +97 -38
  29. sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
  30. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +48 -0
  31. sglang/srt/layers/moe/token_dispatcher/standard.py +19 -0
  32. sglang/srt/layers/moe/topk.py +6 -2
  33. sglang/srt/layers/quantization/fp8.py +0 -18
  34. sglang/srt/layers/quantization/modelopt_quant.py +2 -0
  35. sglang/srt/layers/quantization/unquant.py +0 -8
  36. sglang/srt/layers/quantization/w4afp8.py +1 -0
  37. sglang/srt/managers/cache_controller.py +143 -45
  38. sglang/srt/managers/data_parallel_controller.py +6 -0
  39. sglang/srt/managers/io_struct.py +12 -2
  40. sglang/srt/managers/scheduler.py +116 -669
  41. sglang/srt/managers/scheduler_input_blocker.py +106 -0
  42. sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
  43. sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
  44. sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
  45. sglang/srt/managers/template_manager.py +62 -19
  46. sglang/srt/managers/tokenizer_manager.py +166 -83
  47. sglang/srt/managers/tp_worker.py +9 -0
  48. sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
  49. sglang/srt/mem_cache/hicache_storage.py +45 -11
  50. sglang/srt/mem_cache/hiradix_cache.py +15 -4
  51. sglang/srt/mem_cache/memory_pool_host.py +73 -1
  52. sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
  53. sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
  54. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +177 -0
  55. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
  56. sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
  57. sglang/srt/model_executor/model_runner.py +20 -13
  58. sglang/srt/models/arcee.py +532 -0
  59. sglang/srt/models/deepseek_v2.py +15 -56
  60. sglang/srt/models/glm4_moe.py +3 -1
  61. sglang/srt/models/granitemoe.py +3 -0
  62. sglang/srt/models/grok.py +3 -0
  63. sglang/srt/models/hunyuan.py +1 -0
  64. sglang/srt/models/llama4.py +3 -0
  65. sglang/srt/models/mixtral.py +3 -0
  66. sglang/srt/models/olmoe.py +3 -0
  67. sglang/srt/models/phimoe.py +1 -0
  68. sglang/srt/models/qwen3_moe.py +12 -69
  69. sglang/srt/models/step3_vl.py +994 -0
  70. sglang/srt/multimodal/processors/base_processor.py +15 -16
  71. sglang/srt/multimodal/processors/step3_vl.py +515 -0
  72. sglang/srt/poll_based_barrier.py +31 -0
  73. sglang/srt/reasoning_parser.py +2 -1
  74. sglang/srt/server_args.py +18 -13
  75. sglang/srt/speculative/eagle_worker.py +2 -0
  76. sglang/srt/two_batch_overlap.py +8 -3
  77. sglang/test/test_utils.py +53 -0
  78. sglang/utils.py +0 -11
  79. sglang/version.py +1 -1
  80. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/METADATA +4 -4
  81. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/RECORD +84 -64
  82. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/WHEEL +0 -0
  83. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/licenses/LICENSE +0 -0
  84. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -19,6 +19,7 @@ import json
19
19
  import logging
20
20
  import os
21
21
  import random
22
+ import sys
22
23
  import tempfile
23
24
  from typing import List, Literal, Optional, Union
24
25
 
@@ -74,6 +75,7 @@ class ServerArgs:
74
75
  # Memory and scheduling
75
76
  mem_fraction_static: Optional[float] = None
76
77
  max_running_requests: Optional[int] = None
78
+ max_queued_requests: Optional[int] = sys.maxsize
77
79
  max_total_tokens: Optional[int] = None
78
80
  chunked_prefill_size: Optional[int] = None
79
81
  max_prefill_tokens: int = 16384
@@ -268,14 +270,6 @@ class ServerArgs:
268
270
  sm_group_num: int = 3
269
271
 
270
272
  def __post_init__(self):
271
- # Expert parallelism
272
- # We put it here first due to some internal ckpt conversation issues.
273
- if self.enable_ep_moe:
274
- self.ep_size = self.tp_size
275
- logger.warning(
276
- f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
277
- )
278
-
279
273
  # Set missing default values
280
274
  if self.tokenizer_path is None:
281
275
  self.tokenizer_path = self.model_path
@@ -805,6 +799,12 @@ class ServerArgs:
805
799
  default=ServerArgs.max_running_requests,
806
800
  help="The maximum number of running requests.",
807
801
  )
802
+ parser.add_argument(
803
+ "--max-queued-requests",
804
+ type=int,
805
+ default=ServerArgs.max_queued_requests,
806
+ help="The maximum number of queued requests. This option is ignored when using disaggregation-mode.",
807
+ )
808
808
  parser.add_argument(
809
809
  "--max-total-tokens",
810
810
  type=int,
@@ -1109,9 +1109,10 @@ class ServerArgs:
1109
1109
  "kimi_k2",
1110
1110
  "qwen3_coder",
1111
1111
  "glm45",
1112
+ "step3",
1112
1113
  ],
1113
1114
  default=ServerArgs.tool_call_parser,
1114
- help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', and 'qwen3_coder'.",
1115
+ help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', 'qwen3_coder', 'glm45', and 'step3'.",
1115
1116
  )
1116
1117
 
1117
1118
  # Data parallelism
@@ -1326,6 +1327,7 @@ class ServerArgs:
1326
1327
  parser.add_argument(
1327
1328
  "--expert-parallel-size",
1328
1329
  "--ep-size",
1330
+ "--ep",
1329
1331
  type=int,
1330
1332
  default=ServerArgs.ep_size,
1331
1333
  help="The expert parallelism size.",
@@ -1468,7 +1470,7 @@ class ServerArgs:
1468
1470
  parser.add_argument(
1469
1471
  "--hicache-storage-backend",
1470
1472
  type=str,
1471
- choices=["file"], # todo, mooncake
1473
+ choices=["file", "mooncake", "hf3fs"],
1472
1474
  default=ServerArgs.hicache_storage_backend,
1473
1475
  help="The storage backend for hierarchical KV cache.",
1474
1476
  )
@@ -2063,6 +2065,9 @@ class PortArgs:
2063
2065
 
2064
2066
  dist_init_host, dist_init_port = dist_init_addr
2065
2067
  port_base = int(dist_init_port) + 1
2068
+ detokenizer_port = port_base + 1
2069
+ rpc_port = port_base + 2
2070
+ metrics_ipc_name = port_base + 3
2066
2071
  if dp_rank is None:
2067
2072
  # TokenizerManager to DataParallelController
2068
2073
  scheduler_input_port = port_base + 4
@@ -2072,10 +2077,10 @@ class PortArgs:
2072
2077
  return PortArgs(
2073
2078
  tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
2074
2079
  scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",
2075
- detokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base + 1}",
2080
+ detokenizer_ipc_name=f"tcp://{dist_init_host}:{detokenizer_port}",
2076
2081
  nccl_port=nccl_port,
2077
- rpc_ipc_name=f"tcp://{dist_init_host}:{port_base + 2}",
2078
- metrics_ipc_name=f"tcp://{dist_init_host}:{port_base + 3}",
2082
+ rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
2083
+ metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
2079
2084
  )
2080
2085
 
2081
2086
 
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -73,6 +73,7 @@ class EAGLEWorker(TpModelWorker):
73
73
  gpu_id: int,
74
74
  tp_rank: int,
75
75
  dp_rank: Optional[int],
76
+ moe_ep_rank: int,
76
77
  nccl_port: int,
77
78
  target_worker: TpModelWorker,
78
79
  ):
@@ -127,6 +128,7 @@ class EAGLEWorker(TpModelWorker):
127
128
  tp_rank=tp_rank,
128
129
  pp_rank=0, # FIXME
129
130
  dp_rank=dp_rank,
131
+ moe_ep_rank=moe_ep_rank,
130
132
  nccl_port=nccl_port,
131
133
  is_draft_worker=True,
132
134
  req_to_token_pool=self.req_to_token_pool,
sglang/srt/two_batch_overlap.py CHANGED
@@ -1,7 +1,9 @@
1
+ from __future__ import annotations
2
+
1
3
  import dataclasses
2
4
  import logging
3
5
  from dataclasses import replace
4
- from typing import Dict, List, Optional, Sequence, Union
6
+ from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Union
5
7
 
6
8
  import torch
7
9
 
@@ -20,6 +22,9 @@ from sglang.srt.operations_strategy import OperationsStrategy
20
22
  from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
21
23
  from sglang.srt.utils import BumpAllocator, DeepEPMode, get_bool_env_var
22
24
 
25
+ if TYPE_CHECKING:
26
+ from sglang.srt.layers.moe.ep_moe.token_dispatcher import DispatchOutput
27
+
23
28
  _tbo_debug = get_bool_env_var("SGLANG_TBO_DEBUG")
24
29
 
25
30
  logger = logging.getLogger(__name__)
@@ -802,7 +807,7 @@ class MaybeTboDeepEPDispatcher:
802
807
  def _execute(self, name, tbo_subbatch_index: Optional[int] = None, **kwargs):
803
808
  return getattr(self._inners[tbo_subbatch_index or 0], name)(**kwargs)
804
809
 
805
- def dispatch(self, **kwargs):
810
+ def dispatch(self, **kwargs) -> DispatchOutput:
806
811
  return self._execute("dispatch", **kwargs)
807
812
 
808
813
  def dispatch_a(self, **kwargs):
@@ -811,7 +816,7 @@ class MaybeTboDeepEPDispatcher:
811
816
  def dispatch_b(self, **kwargs):
812
817
  return self._execute("dispatch_b", **kwargs)
813
818
 
814
- def combine(self, **kwargs):
819
+ def combine(self, **kwargs) -> torch.Tensor:
815
820
  return self._execute("combine", **kwargs)
816
821
 
817
822
  def combine_a(self, **kwargs):
sglang/test/test_utils.py CHANGED
@@ -19,6 +19,7 @@ from pathlib import Path
19
19
  from types import SimpleNamespace
20
20
  from typing import Awaitable, Callable, List, Optional, Tuple
21
21
 
22
+ import aiohttp
22
23
  import numpy as np
23
24
  import requests
24
25
  import torch
@@ -1303,6 +1304,58 @@ def run_logprob_check(self: unittest.TestCase, arg: Tuple):
1303
1304
  raise
1304
1305
 
1305
1306
 
1307
+ def send_generate_requests(base_url: str, num_requests: int) -> List[str]:
1308
+ """Sends generate request serially and returns status codes. Max concurrency is 1."""
1309
+
1310
+ def generate():
1311
+ prompt = """
1312
+ System: You are a helpful assistant.
1313
+ User: What is the capital of France?
1314
+ Assistant: The capital of France is
1315
+ """
1316
+ response = requests.post(
1317
+ f"{base_url}/generate",
1318
+ json={
1319
+ "text": prompt,
1320
+ "sampling_params": {
1321
+ "temperature": 0,
1322
+ "max_new_tokens": 50,
1323
+ },
1324
+ },
1325
+ )
1326
+ return response.status_code
1327
+
1328
+ return [generate() for _ in range(num_requests)]
1329
+
1330
+
1331
+ async def send_concurrent_generate_requests(
1332
+ base_url: str, num_requests: int
1333
+ ) -> List[str]:
1334
+ """Sends generate request concurrently and returns status codes. Max concurrency is num_requests."""
1335
+
1336
+ async def async_generate():
1337
+ async with aiohttp.ClientSession() as session:
1338
+ prompt = """
1339
+ System: You are a helpful assistant.
1340
+ User: What is the capital of France?
1341
+ Assistant: The capital of France is
1342
+ """
1343
+ async with session.post(
1344
+ f"{base_url}/generate",
1345
+ json={
1346
+ "text": prompt,
1347
+ "sampling_params": {
1348
+ "temperature": 0,
1349
+ "max_new_tokens": 50,
1350
+ },
1351
+ },
1352
+ ) as response:
1353
+ return response.status
1354
+
1355
+ tasks = [asyncio.create_task(async_generate()) for _ in range(num_requests)]
1356
+ return await asyncio.gather(*tasks)
1357
+
1358
+
1306
1359
  class CustomTestCase(unittest.TestCase):
1307
1360
  def _callTestMethod(self, method):
1308
1361
  max_retry = int(
sglang/utils.py CHANGED
@@ -291,17 +291,6 @@ def find_printable_text(text: str):
291
291
  return text[: text.rfind(" ") + 1]
292
292
 
293
293
 
294
- def graceful_registry(sub_module_name: str):
295
- def graceful_shutdown(signum, frame):
296
- logger.info(
297
- f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
298
- )
299
- if signum == signal.SIGTERM:
300
- logger.info(f"{sub_module_name} receive sigterm")
301
-
302
- signal.signal(signal.SIGTERM, graceful_shutdown)
303
-
304
-
305
294
  class LazyImport:
306
295
  """Lazy import to make `import sglang` run faster."""
307
296
 
sglang/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.4.9.post5"
1
+ __version__ = "0.4.10"
{sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sglang
3
- Version: 0.4.9.post5
3
+ Version: 0.4.10
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -246,14 +246,14 @@ Requires-Dist: sentencepiece; extra == "runtime-common"
246
246
  Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
247
247
  Requires-Dist: scipy; extra == "runtime-common"
248
248
  Requires-Dist: torchao==0.9.0; extra == "runtime-common"
249
- Requires-Dist: transformers==4.54.0; extra == "runtime-common"
249
+ Requires-Dist: transformers==4.54.1; extra == "runtime-common"
250
250
  Requires-Dist: timm==1.0.16; extra == "runtime-common"
251
251
  Requires-Dist: uvicorn; extra == "runtime-common"
252
252
  Requires-Dist: uvloop; extra == "runtime-common"
253
253
  Requires-Dist: xgrammar==0.1.21; extra == "runtime-common"
254
254
  Provides-Extra: srt
255
255
  Requires-Dist: sglang[runtime_common]; extra == "srt"
256
- Requires-Dist: sgl-kernel==0.2.7; extra == "srt"
256
+ Requires-Dist: sgl-kernel==0.2.8; extra == "srt"
257
257
  Requires-Dist: torch==2.7.1; extra == "srt"
258
258
  Requires-Dist: torchaudio==2.7.1; extra == "srt"
259
259
  Requires-Dist: torchvision==0.22.1; extra == "srt"
@@ -269,6 +269,7 @@ Requires-Dist: torchvision==0.22.1; extra == "blackwell"
269
269
  Requires-Dist: cuda-python; extra == "blackwell"
270
270
  Requires-Dist: einops; extra == "blackwell"
271
271
  Requires-Dist: flashinfer_python==0.2.9rc2; extra == "blackwell"
272
+ Requires-Dist: tiktoken; extra == "blackwell"
272
273
  Provides-Extra: srt-hip
273
274
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
274
275
  Requires-Dist: torch; extra == "srt-hip"
@@ -426,7 +427,6 @@ SGLang has been deployed at large scale, generating trillions of tokens in produ
426
427
  <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>
427
428
 
428
429
  ## Contact Us
429
-
430
430
  For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
431
431
 
432
432
  ## Acknowledgment