sglang 0.5.4__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
Files changed (88)
  1. sglang/bench_serving.py +56 -12
  2. sglang/launch_server.py +2 -0
  3. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +101 -4
  4. sglang/srt/compilation/backend.py +1 -1
  5. sglang/srt/configs/model_config.py +5 -5
  6. sglang/srt/distributed/parallel_state.py +0 -7
  7. sglang/srt/entrypoints/engine.py +18 -15
  8. sglang/srt/entrypoints/grpc_server.py +0 -1
  9. sglang/srt/entrypoints/http_server.py +75 -94
  10. sglang/srt/environ.py +16 -2
  11. sglang/srt/eplb/expert_distribution.py +30 -0
  12. sglang/srt/function_call/function_call_parser.py +2 -0
  13. sglang/srt/function_call/minimax_m2.py +367 -0
  14. sglang/srt/layers/activation.py +6 -0
  15. sglang/srt/layers/attention/flashattention_backend.py +12 -2
  16. sglang/srt/layers/attention/flashinfer_backend.py +10 -1
  17. sglang/srt/layers/attention/flashinfer_mla_backend.py +18 -10
  18. sglang/srt/layers/attention/trtllm_mla_backend.py +1 -13
  19. sglang/srt/layers/attention/utils.py +78 -0
  20. sglang/srt/layers/communicator.py +1 -0
  21. sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
  22. sglang/srt/layers/layernorm.py +19 -4
  23. sglang/srt/layers/logits_processor.py +5 -0
  24. sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
  25. sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
  26. sglang/srt/layers/moe/ep_moe/layer.py +79 -272
  27. sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
  28. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
  29. sglang/srt/layers/moe/moe_runner/deep_gemm.py +287 -22
  30. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  31. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  32. sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
  33. sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
  34. sglang/srt/layers/moe/token_dispatcher/deepep.py +18 -14
  35. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  36. sglang/srt/layers/moe/topk.py +4 -4
  37. sglang/srt/layers/moe/utils.py +3 -4
  38. sglang/srt/layers/quantization/__init__.py +3 -5
  39. sglang/srt/layers/quantization/awq.py +0 -3
  40. sglang/srt/layers/quantization/base_config.py +7 -0
  41. sglang/srt/layers/quantization/fp8.py +68 -63
  42. sglang/srt/layers/quantization/gguf.py +566 -0
  43. sglang/srt/layers/quantization/mxfp4.py +30 -38
  44. sglang/srt/layers/quantization/unquant.py +23 -45
  45. sglang/srt/layers/quantization/w4afp8.py +38 -2
  46. sglang/srt/layers/radix_attention.py +5 -2
  47. sglang/srt/layers/rotary_embedding.py +13 -1
  48. sglang/srt/layers/sampler.py +12 -1
  49. sglang/srt/managers/io_struct.py +3 -0
  50. sglang/srt/managers/multi_tokenizer_mixin.py +17 -1
  51. sglang/srt/managers/scheduler.py +21 -15
  52. sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
  53. sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
  54. sglang/srt/managers/tokenizer_manager.py +11 -19
  55. sglang/srt/mem_cache/hicache_storage.py +7 -1
  56. sglang/srt/mem_cache/memory_pool.py +82 -0
  57. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  58. sglang/srt/model_executor/forward_batch_info.py +44 -3
  59. sglang/srt/model_executor/model_runner.py +1 -149
  60. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
  61. sglang/srt/models/deepseek_v2.py +147 -44
  62. sglang/srt/models/glm4_moe.py +322 -354
  63. sglang/srt/models/glm4_moe_nextn.py +4 -14
  64. sglang/srt/models/glm4v_moe.py +29 -196
  65. sglang/srt/models/minimax_m2.py +922 -0
  66. sglang/srt/models/nvila.py +355 -0
  67. sglang/srt/models/nvila_lite.py +184 -0
  68. sglang/srt/models/qwen2.py +22 -1
  69. sglang/srt/models/qwen3.py +34 -4
  70. sglang/srt/models/qwen3_moe.py +2 -4
  71. sglang/srt/multimodal/processors/base_processor.py +1 -0
  72. sglang/srt/multimodal/processors/glm4v.py +1 -1
  73. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  74. sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
  75. sglang/srt/parser/reasoning_parser.py +28 -1
  76. sglang/srt/server_args.py +365 -186
  77. sglang/srt/single_batch_overlap.py +2 -7
  78. sglang/srt/utils/common.py +87 -42
  79. sglang/srt/utils/hf_transformers_utils.py +7 -3
  80. sglang/test/test_deterministic.py +235 -12
  81. sglang/test/test_deterministic_utils.py +2 -1
  82. sglang/version.py +1 -1
  83. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +7 -6
  84. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +87 -82
  85. sglang/srt/models/vila.py +0 -306
  86. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  87. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  88. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/http_server.py CHANGED
@@ -20,7 +20,7 @@ This file implements HTTP APIs for the inference engine via fastapi.
 import asyncio
 import dataclasses
 import logging
-import multiprocessing as multiprocessing
+import multiprocessing
 import os
 import tempfile
 import threading
@@ -165,6 +165,7 @@ async def init_multi_tokenizer() -> ServerArgs:
         server_args.api_key is None
     ), "API key is not supported in multi-tokenizer mode"
 
+    # Create a new ipc name for the current process
     port_args.tokenizer_ipc_name = (
         f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
     )
@@ -184,6 +185,7 @@ async def init_multi_tokenizer() -> ServerArgs:
     )
 
     tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
+
     set_global_state(
         _GlobalState(
             tokenizer_manager=tokenizer_manager,
@@ -192,36 +194,35 @@ async def init_multi_tokenizer() -> ServerArgs:
         )
     )
 
-    if server_args.enable_trace:
-        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
-        if server_args.disaggregation_mode == "null":
-            thread_label = f"MultiTokenizer-{tokenizer_manager.worker_id}"
-            trace_set_thread_info(thread_label)
-
     return server_args
 
 
 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
-    if not getattr(fast_api_app, "is_single_tokenizer_mode", False):
+    if getattr(fast_api_app, "is_single_tokenizer_mode", False):
+        server_args = fast_api_app.server_args
+        warmup_thread_args = fast_api_app.warmup_thread_args
+        thread_label = "Tokenizer"
+    else:
         # Initialize multi-tokenizer support for worker processes
-        fast_api_app.server_args: ServerArgs = await init_multi_tokenizer()
-
-        # only metrics middleware is supported in multi-tokenizer mode
-        worker_pid = os.getpid()
-        if fast_api_app.server_args.enable_metrics:
-            add_prometheus_middleware(app)
-            enable_func_timer()
-
-        logger.info(f"Worker {worker_pid} added prometheus middleware")
-        fast_api_app.warmup_thread = threading.Thread(
-            target=_wait_and_warmup,
-            args=(
-                fast_api_app.server_args,
-                None,  # pipe_finish_writer not needed in worker
-                None,  # launch_callback not needed in worker
-            ),
+        server_args = await init_multi_tokenizer()
+        warmup_thread_args = (
+            server_args,
+            None,
+            None,
         )
+        thread_label = f"MultiTokenizer-{_global_state.tokenizer_manager.worker_id}"
+
+    # Add prometheus middleware
+    if server_args.enable_metrics:
+        add_prometheus_middleware(app)
+        enable_func_timer()
+
+    # Init tracing
+    if server_args.enable_trace:
+        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "null":
+            trace_set_thread_info(thread_label)
 
     # Initialize OpenAI serving handlers
     fast_api_app.state.openai_serving_completion = OpenAIServingCompletion(
@@ -249,8 +250,7 @@ async def lifespan(fast_api_app: FastAPI):
         _global_state.tokenizer_manager
     )
 
-    server_args: ServerArgs = fast_api_app.server_args
-
+    # Launch tool server
     tool_server = None
     if server_args.tool_server == "demo":
         from sglang.srt.entrypoints.openai.tool_server import DemoToolServer
@@ -274,12 +274,11 @@ async def lifespan(fast_api_app: FastAPI):
             enable_force_include_usage=True,
             tool_server=tool_server,
         )
-    except Exception as e:
-        import traceback
-
-        traceback.print_exc()
-        logger.warning(f"Can not initialize OpenAIServingResponses, error: {e}")
+    except Exception:
+        traceback = get_exception_traceback()
+        logger.warning(f"Can not initialize OpenAIServingResponses, error: {traceback}")
 
+    # Execute custom warmups
     if server_args.warmups is not None:
         await execute_warmups(
             server_args.disaggregation_mode,
@@ -288,18 +287,18 @@ async def lifespan(fast_api_app: FastAPI):
         )
         logger.info("Warmup ended")
 
-    warmup_thread = getattr(fast_api_app, "warmup_thread", None)
-    if warmup_thread is not None:
-        warmup_thread.start()
+    # Execute the general warmup
+    warmup_thread = threading.Thread(
+        target=_wait_and_warmup,
+        args=warmup_thread_args,
+    )
+    warmup_thread.start()
 
+    # Start the HTTP server
     try:
         yield
     finally:
-        if server_args.tokenizer_worker_num > 1:
-            pid = os.getpid()
-            logger.info(f"uvicorn worker {pid} ending...")
-            warmup_thread.join()
-            logger.info(f"uvicorn worker {pid} ended.")
+        warmup_thread.join()
 
 
 # Fast API
@@ -499,6 +498,11 @@ async def get_server_info():
     internal_states: List[Dict[Any, Any]] = (
         await _global_state.tokenizer_manager.get_internal_state()
     )
+
+    # This field is not serializable.
+    if hasattr(_global_state.tokenizer_manager.server_args, "model_config"):
+        del _global_state.tokenizer_manager.server_args.model_config
+
     return {
         **dataclasses.asdict(_global_state.tokenizer_manager.server_args),
         **_global_state.scheduler_info,
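
The new `hasattr`/`del` guard exists because `dataclasses.asdict(...)` feeds straight into a JSON response, and a `model_config` object attached to the args at runtime is not JSON-safe. A minimal standalone sketch of the failure mode (the `ServerArgsLike` dataclass below is hypothetical, not sglang's real `ServerArgs`):

```python
# Sketch of why get_server_info drops the field before serializing.
import dataclasses
import json


@dataclasses.dataclass
class ServerArgsLike:  # hypothetical stand-in for ServerArgs
    model_path: str
    model_config: object = None  # populated at runtime with a non-JSON-safe object


args = ServerArgsLike(model_path="my-model", model_config=object())

try:
    json.dumps(dataclasses.asdict(args))
except TypeError as e:
    print(f"not serializable: {e}")

# Deleting the instance attribute, as the patch does, falls back to the
# JSON-safe class default and keeps the response serializable.
del args.model_config
print(json.dumps(dataclasses.asdict(args)))
```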
@@ -1328,27 +1332,12 @@ def launch_server(
     3. DetokenizerManager (subprocess): Detokenizes the output tokens and sends the result back to the Tokenizer Manager.
 
     Note:
-    1. The HTTP server, Engine, and TokenizerManager both run in the main process.
+    1. The HTTP server, Engine, and TokenizerManager all run in the main process.
     2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """
-    if server_args.tokenizer_worker_num > 1:
-        port_args = PortArgs.init_new(server_args)
-        port_args.tokenizer_worker_ipc_name = (
-            f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
-        )
-        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
-            server_args=server_args, port_args=port_args
-        )
-    else:
-        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
-            server_args=server_args,
-        )
-
-    if server_args.enable_trace:
-        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
-        if server_args.disaggregation_mode == "null":
-            thread_label = "Tokenizer"
-            trace_set_thread_info(thread_label)
+    tokenizer_manager, template_manager, scheduler_info, port_args = (
+        _launch_subprocesses(server_args=server_args)
+    )
 
     set_global_state(
         _GlobalState(
@@ -1358,40 +1347,45 @@ def launch_server(
         )
     )
 
-    if server_args.tokenizer_worker_num > 1:
-        multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
-            port_args,
+    # Pass additional arguments to the lifespan function.
+    # They will be used for additional initialization setups.
+    if server_args.tokenizer_worker_num == 1:
+        # If it is single tokenizer mode, we can pass the arguments by attributes of the app object.
+        app.is_single_tokenizer_mode = True
+        app.server_args = server_args
+        app.warmup_thread_args = (
             server_args,
-            scheduler_info,
+            pipe_finish_writer,
+            launch_callback,
         )
-    else:
+
         # Add api key authorization
+        # This is only supported in single tokenizer mode.
         if server_args.api_key:
             add_api_key_middleware(app, server_args.api_key)
-
-        # Add prometheus middleware
-        if server_args.enable_metrics:
-            add_prometheus_middleware(app)
-            enable_func_timer()
-
-        # Send a warmup request - we will create the thread launch it
-        # in the lifespan after all other warmups have fired.
-        warmup_thread = threading.Thread(
-            target=_wait_and_warmup,
-            args=(
-                server_args,
-                pipe_finish_writer,
-                launch_callback,
-            ),
+    else:
+        # If it is multi-tokenizer mode, we need to write the arguments to shared memory
+        # for other worker processes to read.
+        app.is_single_tokenizer_mode = False
+        multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
+            port_args, server_args, scheduler_info
         )
-        app.warmup_thread = warmup_thread
 
     try:
         # Update logging configs
         set_uvicorn_logging_configs()
-        app.server_args = server_args
+
         # Listen for HTTP requests
-        if server_args.tokenizer_worker_num > 1:
+        if server_args.tokenizer_worker_num == 1:
+            uvicorn.run(
+                app,
+                host=server_args.host,
+                port=server_args.port,
+                log_level=server_args.log_level_http or server_args.log_level,
+                timeout_keep_alive=5,
+                loop="uvloop",
+            )
+        else:
             from uvicorn.config import LOGGING_CONFIG
 
             LOGGING_CONFIG["loggers"]["sglang.srt.entrypoints.http_server"] = {
@@ -1399,7 +1393,6 @@ def launch_server(
                 "level": "INFO",
                 "propagate": False,
             }
-
             monkey_patch_uvicorn_multiprocessing()
 
             uvicorn.run(
@@ -1411,22 +1404,10 @@ def launch_server(
                 loop="uvloop",
                 workers=server_args.tokenizer_worker_num,
             )
-        else:
-            app.is_single_tokenizer_mode = True
-            uvicorn.run(
-                app,
-                host=server_args.host,
-                port=server_args.port,
-                log_level=server_args.log_level_http or server_args.log_level,
-                timeout_keep_alive=5,
-                loop="uvloop",
-            )
     finally:
         if server_args.tokenizer_worker_num > 1:
             multi_tokenizer_args_shm.unlink()
             _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
-        else:
-            warmup_thread.join()
 
 
 def _execute_server_warmup(
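
Taken together, these hunks collapse the two tokenizer modes into one `lifespan` code path: the branches only decide where `server_args` and the warmup arguments come from, after which middleware, tracing, and exactly one warmup thread are handled uniformly, and the thread is joined unconditionally on shutdown. A minimal standalone sketch of that pattern (stub names only; the real code wires in sglang's `_wait_and_warmup` and global state):

```python
import threading
from contextlib import asynccontextmanager

from fastapi import FastAPI


def _wait_and_warmup_stub(*args):  # stand-in for sglang's _wait_and_warmup
    print("warmup called with", args)


@asynccontextmanager
async def lifespan(fast_api_app: FastAPI):
    # Branches only choose where the arguments come from.
    if getattr(fast_api_app, "is_single_tokenizer_mode", False):
        warmup_args = fast_api_app.warmup_thread_args  # set by launch_server
    else:
        warmup_args = (None, None)  # worker processes build their own args

    # One warmup thread in either mode ...
    warmup_thread = threading.Thread(target=_wait_and_warmup_stub, args=warmup_args)
    warmup_thread.start()
    try:
        yield
    finally:
        # ... joined unconditionally, replacing the old per-mode cleanup branches.
        warmup_thread.join()


app = FastAPI(lifespan=lifespan)
app.is_single_tokenizer_mode = True
app.warmup_thread_args = ("server-args-placeholder", None)
```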
sglang/srt/environ.py CHANGED
@@ -111,18 +111,21 @@ class Envs:
     # Model & File Download
     SGLANG_USE_MODELSCOPE = EnvBool(False)
 
+    # Logging Options
+    SGLANG_LOG_GC = EnvBool(False)
+    SGLANG_LOG_FORWARD_ITERS = EnvBool(False)
+    SGLANG_DISABLE_REQUEST_LOGGING = EnvBool(False)
+
     # Test & Debug
     SGLANG_IS_IN_CI = EnvBool(False)
     SGLANG_IS_IN_CI_AMD = EnvBool(False)
     SGLANG_SET_CPU_AFFINITY = EnvBool(False)
     SGLANG_PROFILE_WITH_STACK = EnvBool(True)
     SGLANG_RECORD_STEP_TIME = EnvBool(False)
-    SGLANG_GC_LOG = EnvBool(False)
     SGLANG_FORCE_SHUTDOWN = EnvBool(False)
     SGLANG_DEBUG_MEMORY_POOL = EnvBool(False)
     SGLANG_TEST_REQUEST_TIME_STATS = EnvBool(False)
     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK = EnvBool(False)
-    SGLANG_DISABLE_REQUEST_LOGGING = EnvBool(False)
     SGLANG_SIMULATE_ACC_LEN = EnvFloat(-1)
     SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial")
     SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp")
@@ -228,6 +231,7 @@ class Envs:
     SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)
 
     # Overlap Spec V2
+    SGLANG_ENABLE_SPEC_V2 = EnvBool(False)
     SGLANG_ENABLE_OVERLAP_PLAN_STREAM = EnvBool(False)
 
     # VLM
@@ -251,7 +255,17 @@
 envs = Envs()
 
 
+def _print_deprecated_env(new_name: str, old_name: str):
+    if old_name in os.environ:
+        warnings.warn(
+            f"Environment variable {old_name} will be deprecated, please use {new_name} instead"
+        )
+        os.environ[new_name] = os.environ[old_name]
+
+
 def _convert_SGL_to_SGLANG():
+    _print_deprecated_env("SGLANG_LOG_GC", "SGLANG_GC_LOG")
+
     for key, value in os.environ.items():
         if key.startswith("SGL_"):
             new_key = key.replace("SGL_", "SGLANG_", 1)
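
`SGLANG_GC_LOG` is renamed to `SGLANG_LOG_GC` (and grouped under the new "Logging Options" block), with `_print_deprecated_env` keeping the old name working by mirroring it onto the new one at import time. A self-contained sketch of the shim's behavior:

```python
# Standalone replica of the deprecation shim added above.
import os
import warnings


def _print_deprecated_env(new_name: str, old_name: str) -> None:
    if old_name in os.environ:
        warnings.warn(
            f"Environment variable {old_name} will be deprecated, "
            f"please use {new_name} instead"
        )
        # Mirror the old variable onto the new name so downstream readers
        # only ever have to look at the new one.
        os.environ[new_name] = os.environ[old_name]


os.environ["SGLANG_GC_LOG"] = "1"  # a user still exporting the old name
_print_deprecated_env("SGLANG_LOG_GC", "SGLANG_GC_LOG")
assert os.environ["SGLANG_LOG_GC"] == "1"  # new name now visible to Envs
```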
sglang/srt/eplb/expert_distribution.py CHANGED
@@ -415,10 +415,19 @@ class _DetailSinglePassGatherer(_SinglePassGatherer):
 
     def collect(self) -> Dict:
         num_tokens = len(self._metadata["input_ids"])
+
+        global_physical_count = _convert_per_token_to_global_physical_count(
+            num_tokens,
+            num_layers=self._expert_location_metadata.num_layers,
+            num_physical_experts=self._expert_location_metadata.num_physical_experts,
+            _topk_ids_of_layer=self._topk_ids_of_layer,
+        )
+
         return dict(
             **self._metadata,
             topk_ids_of_layer=self._topk_ids_of_layer[:, :num_tokens, :].clone().cpu(),
             misc_objects=self._misc_objects,
+            global_physical_count=global_physical_count,
         )
 
 
@@ -547,6 +556,27 @@ class _DeepepLowLatencySinglePassGatherer(_LayerBasedGpuSinglePassGatherer):
         self._data[layer_idx, :] += local_physical_count_of_layer
 
 
+def _convert_per_token_to_global_physical_count(
+    num_tokens: int,
+    num_layers: int,
+    num_physical_experts: int,
+    _topk_ids_of_layer: torch.Tensor,
+) -> torch.Tensor:
+    topk_ids_layer_major = _topk_ids_of_layer[:, :num_tokens, :].reshape(num_layers, -1)
+    mask = topk_ids_layer_major != -1
+
+    index = topk_ids_layer_major.masked_fill(~mask, 0).long()
+    src = mask.int()
+
+    ans = torch.zeros(
+        (num_layers, num_physical_experts),
+        dtype=_topk_ids_of_layer.dtype,
+        device=_topk_ids_of_layer.device,
+    )
+    ans.scatter_add_(dim=1, index=index, src=src)
+    return ans
+
+
 def _convert_local_to_global_physical_count(
     local_physical_count: torch.Tensor,
     rank: int,
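
The new helper is effectively a per-layer bincount of expert ids: it flattens the `(num_layers, num_tokens, topk)` id tensor, masks padded slots (id `-1`), and accumulates with `scatter_add_`. A tiny standalone check of that logic (sizes made up for the example):

```python
# Numeric check mirroring _convert_per_token_to_global_physical_count.
import torch

num_layers, max_tokens, topk, num_experts = 2, 4, 2, 5
# -1 marks padded/unused topk slots, exactly as in the gatherer.
topk_ids = torch.full((num_layers, max_tokens, topk), -1, dtype=torch.int32)
topk_ids[0, 0] = torch.tensor([1, 3])
topk_ids[0, 1] = torch.tensor([1, 1])
topk_ids[1, 0] = torch.tensor([4, 0])

num_tokens = 2  # only the first two token slots are real
flat = topk_ids[:, :num_tokens, :].reshape(num_layers, -1)
mask = flat != -1
# Padded slots are redirected to index 0 but contribute src=0, so they
# never distort the counts.
index = flat.masked_fill(~mask, 0).long()

counts = torch.zeros((num_layers, num_experts), dtype=topk_ids.dtype)
counts.scatter_add_(dim=1, index=index, src=mask.int())
print(counts)
# tensor([[0, 3, 0, 1, 0],
#         [1, 0, 0, 0, 1]], dtype=torch.int32)
```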
sglang/srt/function_call/function_call_parser.py CHANGED
@@ -16,6 +16,7 @@ from sglang.srt.function_call.glm4_moe_detector import Glm4MoeDetector
 from sglang.srt.function_call.gpt_oss_detector import GptOssDetector
 from sglang.srt.function_call.kimik2_detector import KimiK2Detector
 from sglang.srt.function_call.llama32_detector import Llama32Detector
+from sglang.srt.function_call.minimax_m2 import MinimaxM2Detector
 from sglang.srt.function_call.mistral_detector import MistralDetector
 from sglang.srt.function_call.pythonic_detector import PythonicDetector
 from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
@@ -49,6 +50,7 @@ class FunctionCallParser:
         "qwen25": Qwen25Detector,
         "qwen3_coder": Qwen3CoderDetector,
         "step3": Step3Detector,
+        "minimax-m2": MinimaxM2Detector,
     }
 
     def __init__(self, tools: List[Tool], tool_call_parser: str):
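
Registering `MinimaxM2Detector` under the `"minimax-m2"` key makes the new detector selectable purely by name. A hedged usage sketch (the empty `tools` list is illustrative; in practice it is built from the request's tool definitions), which on the server side would typically correspond to launching with `--tool-call-parser minimax-m2`:

```python
from sglang.srt.function_call.function_call_parser import FunctionCallParser

tools = []  # illustrative; normally a List[Tool] parsed from the OpenAI payload
parser = FunctionCallParser(tools=tools, tool_call_parser="minimax-m2")
```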