sglang 0.4.6__py3-none-any.whl → 0.4.6.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. sglang/bench_one_batch.py +2 -0
  2. sglang/check_env.py +3 -3
  3. sglang/srt/configs/__init__.py +4 -0
  4. sglang/srt/configs/kimi_vl.py +38 -0
  5. sglang/srt/configs/kimi_vl_moonvit.py +32 -0
  6. sglang/srt/configs/model_config.py +15 -0
  7. sglang/srt/conversation.py +122 -1
  8. sglang/srt/disaggregation/decode.py +8 -2
  9. sglang/srt/disaggregation/fake/__init__.py +1 -0
  10. sglang/srt/disaggregation/fake/conn.py +88 -0
  11. sglang/srt/disaggregation/prefill.py +12 -3
  12. sglang/srt/disaggregation/utils.py +16 -2
  13. sglang/srt/entrypoints/engine.py +52 -21
  14. sglang/srt/entrypoints/http_server.py +27 -2
  15. sglang/srt/function_call_parser.py +97 -0
  16. sglang/srt/hf_transformers_utils.py +2 -0
  17. sglang/srt/layers/attention/cutlass_mla_backend.py +278 -0
  18. sglang/srt/layers/attention/flashinfer_backend.py +107 -82
  19. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -16
  20. sglang/srt/layers/attention/flashmla_backend.py +3 -0
  21. sglang/srt/layers/attention/utils.py +1 -1
  22. sglang/srt/layers/dp_attention.py +5 -2
  23. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +1 -3
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  37. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  38. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  39. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  40. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +10 -8
  41. sglang/srt/layers/moe/fused_moe_triton/layer.py +15 -17
  42. sglang/srt/layers/quantization/__init__.py +2 -2
  43. sglang/srt/layers/quantization/deep_gemm.py +1 -1
  44. sglang/srt/layers/quantization/fp8.py +20 -22
  45. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  46. sglang/srt/layers/utils.py +35 -0
  47. sglang/srt/lora/layers.py +35 -9
  48. sglang/srt/lora/lora_manager.py +84 -35
  49. sglang/srt/managers/data_parallel_controller.py +52 -34
  50. sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
  51. sglang/srt/managers/schedule_batch.py +34 -15
  52. sglang/srt/managers/scheduler.py +273 -67
  53. sglang/srt/managers/scheduler_output_processor_mixin.py +26 -10
  54. sglang/srt/managers/tp_worker.py +52 -17
  55. sglang/srt/managers/tp_worker_overlap_thread.py +18 -7
  56. sglang/srt/mem_cache/memory_pool.py +70 -36
  57. sglang/srt/model_executor/cuda_graph_runner.py +82 -19
  58. sglang/srt/model_executor/forward_batch_info.py +31 -1
  59. sglang/srt/model_executor/model_runner.py +123 -58
  60. sglang/srt/models/deepseek_nextn.py +1 -257
  61. sglang/srt/models/deepseek_v2.py +78 -18
  62. sglang/srt/models/kimi_vl.py +308 -0
  63. sglang/srt/models/kimi_vl_moonvit.py +639 -0
  64. sglang/srt/models/llama.py +92 -30
  65. sglang/srt/models/llama4.py +2 -1
  66. sglang/srt/models/llama_eagle.py +4 -1
  67. sglang/srt/models/llama_eagle3.py +4 -1
  68. sglang/srt/models/qwen2_moe.py +8 -3
  69. sglang/srt/models/qwen2_vl.py +0 -12
  70. sglang/srt/models/qwen3_moe.py +8 -3
  71. sglang/srt/openai_api/adapter.py +49 -8
  72. sglang/srt/openai_api/protocol.py +13 -1
  73. sglang/srt/reasoning_parser.py +25 -1
  74. sglang/srt/server_args.py +83 -24
  75. sglang/srt/speculative/eagle_worker.py +3 -2
  76. sglang/srt/utils.py +91 -9
  77. sglang/test/runners.py +4 -0
  78. sglang/test/send_one.py +84 -28
  79. sglang/test/test_utils.py +67 -0
  80. sglang/version.py +1 -1
  81. {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/METADATA +5 -4
  82. {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/RECORD +85 -60
  83. {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/WHEEL +1 -1
  84. {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/licenses/LICENSE +0 -0
  85. {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -22,7 +22,7 @@ import random
  import tempfile
  from typing import List, Literal, Optional

- from sglang.srt.hf_transformers_utils import check_gguf_file
+ from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
  from sglang.srt.reasoning_parser import ReasoningParser
  from sglang.srt.utils import (
      configure_ipv6,
@@ -78,6 +78,8 @@ class ServerArgs:

      # Other runtime options
      tp_size: int = 1
+     pp_size: int = 1
+     max_micro_batch_size: Optional[int] = None
      stream_interval: int = 1
      stream_output: bool = False
      random_seed: Optional[int] = None
@@ -222,25 +224,34 @@

          # Set mem fraction static, which depends on the tensor parallelism size
          if self.mem_fraction_static is None:
+             parallel_size = self.tp_size * self.pp_size
              if gpu_mem <= 81920:
-                 if self.tp_size >= 16:
+                 if parallel_size >= 16:
                      self.mem_fraction_static = 0.79
-                 elif self.tp_size >= 8:
+                 elif parallel_size >= 8:
                      self.mem_fraction_static = 0.81
-                 elif self.tp_size >= 4:
+                 elif parallel_size >= 4:
                      self.mem_fraction_static = 0.85
-                 elif self.tp_size >= 2:
+                 elif parallel_size >= 2:
                      self.mem_fraction_static = 0.87
                  else:
                      self.mem_fraction_static = 0.88
              else:
-                 # FIXME: more fine grained auto-selection polices
-                 self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem
+                 self.mem_fraction_static = 0.88
+                 if gpu_mem > 96 * 1024:
+                     mem_fraction = self.mem_fraction_static
+                     self.mem_fraction_static = min(
+                         mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
+                         (gpu_mem - 1024 * 18)
+                         / gpu_mem,  # 15 GB + additional 3GB for cuda graph
+                     )

          # Set chunked prefill size, which depends on the gpu memory capacity
          if self.chunked_prefill_size is None:
              if gpu_mem is not None and gpu_mem < 25_000:
                  self.chunked_prefill_size = 2048
+             elif self.disaggregation_mode != "null":
+                 self.chunked_prefill_size = 16384
              else:
                  self.chunked_prefill_size = 8192
          assert self.chunked_prefill_size % self.page_size == 0
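Note: as a worked example of the new auto-selection branch for GPUs with more than 96 GiB of memory (the 141 GiB device size below is illustrative; gpu_mem is in MiB):

    # Illustrative arithmetic only; mirrors the formula added above.
    gpu_mem = 141 * 1024  # MiB, e.g. a ~141 GiB device
    mem_fraction = 0.88
    mem_fraction_static = min(
        mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,  # ~0.9209
        (gpu_mem - 1024 * 18) / gpu_mem,  # keep ~18 GiB headroom -> ~0.8723
    )
    print(round(mem_fraction_static, 4))  # 0.8723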
@@ -256,6 +267,12 @@
              )
              self.page_size = 64

+         if self.attention_backend == "cutlass_mla":
+             logger.warning(
+                 "Cutlass MLA only supports a page_size of 128, change page_size to 128."
+             )
+             self.page_size = 128
+
          # Set cuda graph max batch size
          if self.cuda_graph_max_bs is None:
              # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
@@ -327,6 +344,14 @@
                  "eagle speculative decoding."
              )

+             model_arch = get_model_arch(self)
+
+             # Auto set draft_model_path DeepSeek-V3/R1
+             if self.speculative_draft_model_path is None and model_arch in [
+                 "DeepseekV3ForCausalLM"
+             ]:
+                 self.speculative_draft_model_path = self.model_path
+
              # Auto choose parameters
              if self.speculative_num_steps is None:
                  assert (
@@ -337,7 +362,7 @@
                      self.speculative_num_steps,
                      self.speculative_eagle_topk,
                      self.speculative_num_draft_tokens,
-                 ) = auto_choose_speculative_params(self)
+                 ) = auto_choose_speculative_params(model_arch)

              if self.page_size > 1 and self.speculative_eagle_topk > 1:
                  self.speculative_eagle_topk = 1
@@ -420,7 +445,7 @@
          parser.add_argument(
              "--skip-tokenizer-init",
              action="store_true",
-             help="If set, skip init tokenizer and pass input_ids in generate request",
+             help="If set, skip init tokenizer and pass input_ids in generate request.",
          )
          parser.add_argument(
              "--enable-tokenizer-batch-encode",
@@ -559,6 +584,7 @@
              "name, a tag name, or a commit id. If unspecified, will use "
              "the default version.",
          )
+
          # Memory and scheduling
          parser.add_argument(
              "--mem-fraction-static",
@@ -625,6 +651,19 @@
              default=ServerArgs.tp_size,
              help="The tensor parallelism size.",
          )
+         parser.add_argument(
+             "--pipeline-parallel-size",
+             "--pp-size",
+             type=int,
+             default=ServerArgs.pp_size,
+             help="The pipeline parallelism size.",
+         )
+         parser.add_argument(
+             "--max-micro-batch-size",
+             type=int,
+             default=ServerArgs.max_micro_batch_size,
+             help="The maximum micro batch size in pipeline parallelism.",
+         )
          parser.add_argument(
              "--stream-interval",
              type=int,
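Note: a minimal sketch of how the new flags surface through ServerArgs, assuming the add_cli_args/from_cli_args helpers shown in this file; the model path is a placeholder, and the constructor's post-init device probing must be able to run in your environment:

    import argparse

    from sglang.srt.server_args import ServerArgs

    parser = argparse.ArgumentParser()
    ServerArgs.add_cli_args(parser)
    args = parser.parse_args(
        ["--model-path", "meta-llama/Llama-3.1-8B-Instruct", "--tp-size", "2", "--pp-size", "2"]
    )
    server_args = ServerArgs.from_cli_args(args)
    print(server_args.tp_size, server_args.pp_size)  # 2 2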
@@ -823,7 +862,14 @@
          parser.add_argument(
              "--attention-backend",
              type=str,
-             choices=["flashinfer", "triton", "torch_native", "fa3", "flashmla"],
+             choices=[
+                 "flashinfer",
+                 "triton",
+                 "torch_native",
+                 "fa3",
+                 "flashmla",
+                 "cutlass_mla",
+             ],
              default=ServerArgs.attention_backend,
              help="Choose the kernels for attention layers.",
          )
@@ -1082,9 +1128,9 @@
          parser.add_argument(
              "--tool-call-parser",
              type=str,
-             choices=["qwen25", "mistral", "llama3", "deepseekv3"],
+             choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
              default=ServerArgs.tool_call_parser,
-             help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', and 'llama3'.",
+             help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
          )
          parser.add_argument(
              "--enable-hierarchical-cache",
@@ -1207,6 +1253,7 @@
      @classmethod
      def from_cli_args(cls, args: argparse.Namespace):
          args.tp_size = args.tensor_parallel_size
+         args.pp_size = args.pipeline_parallel_size
          args.dp_size = args.data_parallel_size
          args.ep_size = args.expert_parallel_size
          attrs = [attr.name for attr in dataclasses.fields(cls)]
@@ -1220,15 +1267,25 @@

      def check_server_args(self):
          assert (
-             self.tp_size % self.nnodes == 0
-         ), "tp_size must be divisible by number of nodes"
+             self.tp_size * self.pp_size
+         ) % self.nnodes == 0, "tp_size must be divisible by number of nodes"
+
+         # FIXME pp constraints
+         if self.pp_size > 1:
+             logger.warning(f"Turn off overlap scheule for pipeline parallelism.")
+             self.disable_overlap_schedule = True
+             assert (
+                 self.disable_overlap_schedule
+                 and self.speculative_algorithm is None
+                 and not self.enable_mixed_chunk
+             ), "Pipeline parallelism is not compatible with overlap schedule, speculative decoding, mixed chunked prefill."
+
          assert not (
              self.dp_size > 1 and self.nnodes != 1 and not self.enable_dp_attention
          ), "multi-node data parallel is not supported unless dp attention!"
          assert (
              self.max_loras_per_batch > 0
              # FIXME
-             and (self.lora_paths is None or self.disable_cuda_graph)
              and (self.lora_paths is None or self.disable_radix_cache)
          ), "compatibility of lora and cuda graph and radix attention is in progress"
          assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
@@ -1354,20 +1411,22 @@ class DeprecatedAction(argparse.Action):
          raise ValueError(self.help)


- def auto_choose_speculative_params(self: ServerArgs):
+ def get_model_arch(args: ServerArgs):
+     hf_config = get_config(
+         args.model_path,
+         trust_remote_code=args.trust_remote_code,
+         revision=args.revision,
+         model_override_args=json.loads(args.json_model_override_args),
+     )
+     return hf_config.architectures[0]
+
+
+ def auto_choose_speculative_params(arch: str):
      """
      Automatically choose the parameters for speculative decoding.

      You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
      """
-     config_path = os.path.join(self.model_path, "config.json")
-     if not os.path.exists(config_path):
-         raise ValueError(f"{config_path} is not found.")
-
-     config = json.load(open(config_path))
-
-     arch = config.get("architectures", ["Unknown"])[0]
-
      if arch in ["LlamaForCausalLM"]:
          # The default value for llama
          return (5, 4, 8)
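Note: the refactor above resolves the architecture through the HF config helper instead of reading config.json directly; a hedged sketch of the equivalent lookup (the model path is a placeholder and the call shape follows get_model_arch above):

    from sglang.srt.hf_transformers_utils import get_config

    hf_config = get_config(
        "deepseek-ai/DeepSeek-V3",  # placeholder model path
        trust_remote_code=True,
        revision=None,
        model_override_args={},
    )
    print(hf_config.architectures[0])  # e.g. "DeepseekV3ForCausalLM"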
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -106,11 +106,12 @@ class EAGLEWorker(TpModelWorker):
          # Init draft worker
          with empty_context():
              super().__init__(
+                 server_args=server_args,
                  gpu_id=gpu_id,
                  tp_rank=tp_rank,
-                 server_args=server_args,
-                 nccl_port=nccl_port,
+                 pp_rank=0,  # FIXME
                  dp_rank=dp_rank,
+                 nccl_port=nccl_port,
                  is_draft_worker=True,
                  req_to_token_pool=self.req_to_token_pool,
                  token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
sglang/srt/utils.py CHANGED
@@ -12,6 +12,7 @@
  # limitations under the License.
  # ==============================================================================
  """Common utilities."""
+
  import base64
  import builtins
  import ctypes
@@ -414,16 +415,40 @@ class LayerFn(Protocol):
  def make_layers(
      num_hidden_layers: int,
      layer_fn: LayerFn,
+     pp_rank: Optional[int] = None,
+     pp_size: Optional[int] = None,
      prefix: str = "",
+     return_tuple: bool = False,
  ) -> Tuple[int, int, torch.nn.ModuleList]:
      """Make a list of layers with the given layer function"""
+     # circula imports
+     from sglang.srt.distributed import get_pp_indices
+     from sglang.srt.layers.utils import PPMissingLayer
+
+     assert not pp_size or num_hidden_layers >= pp_size
+     start_layer, end_layer = (
+         get_pp_indices(
+             num_hidden_layers,
+             pp_rank,
+             pp_size,
+         )
+         if pp_rank is not None and pp_size is not None
+         else (0, num_hidden_layers)
+     )
      modules = torch.nn.ModuleList(
-         [
+         [PPMissingLayer(return_tuple=return_tuple) for _ in range(start_layer)]
+         + [
              maybe_offload_to_cpu(layer_fn(idx=idx, prefix=add_prefix(idx, prefix)))
-             for idx in range(num_hidden_layers)
+             for idx in range(start_layer, end_layer)
+         ]
+         + [
+             PPMissingLayer(return_tuple=return_tuple)
+             for _ in range(end_layer, num_hidden_layers)
          ]
      )
-     return modules
+     if pp_rank is None or pp_size is None:
+         return modules
+     return modules, start_layer, end_layer


  def set_random_seed(seed: int) -> None:
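Note: to illustrate the partitioning that make_layers now performs when pp_rank/pp_size are given, here is a self-contained sketch; the even split below is an assumption standing in for get_pp_indices, and PPMissingLayer is mimicked with nn.Identity:

    import torch.nn as nn

    def split_layers(num_hidden_layers: int, pp_rank: int, pp_size: int):
        # Contiguous block of layers owned by this pipeline stage (assumed even split).
        per_stage = num_hidden_layers // pp_size
        start = pp_rank * per_stage
        end = num_hidden_layers if pp_rank == pp_size - 1 else start + per_stage
        return start, end

    start, end = split_layers(32, pp_rank=1, pp_size=4)
    layers = nn.ModuleList(
        [nn.Identity() for _ in range(start)]             # placeholders, like PPMissingLayer
        + [nn.Linear(16, 16) for _ in range(start, end)]  # real layers for this stage
        + [nn.Identity() for _ in range(end, 32)]         # placeholders again
    )
    print(start, end)  # 8 16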
@@ -877,7 +902,7 @@ def broadcast_pyobj(
          "cuda" if torch.cuda.is_available() and not force_cpu_device else "cpu"
      )

-     if rank == 0:
+     if rank == src:
          if len(data) == 0:
              tensor_size = torch.tensor([0], dtype=torch.long, device=device)
              dist.broadcast(tensor_size, src=src, group=dist_group)
@@ -909,6 +934,50 @@
      return data


+ def point_to_point_pyobj(
+     data: List[Any],
+     rank: int,
+     group: Optional[torch.distributed.ProcessGroup] = None,
+     src: int = 0,
+     dst: int = 1,
+ ):
+     """Send data from src to dst in group."""
+
+     if rank == src:
+         if len(data) == 0:
+             tensor_size = torch.tensor([0], dtype=torch.long)
+             dist.send(tensor_size, dst=dst, group=group)
+         else:
+             serialized_data = pickle.dumps(data)
+             size = len(serialized_data)
+             tensor_data = torch.ByteTensor(
+                 np.frombuffer(serialized_data, dtype=np.uint8)
+             )
+             tensor_size = torch.tensor([size], dtype=torch.long)
+
+             dist.send(tensor_size, dst=dst, group=group)
+             dist.send(tensor_data, dst=dst, group=group)
+         return data
+
+     elif rank == dst:
+         tensor_size = torch.tensor([0], dtype=torch.long)
+         dist.recv(tensor_size, src=src, group=group)
+         size = tensor_size.item()
+
+         if size == 0:
+             return []
+
+         tensor_data = torch.empty(size, dtype=torch.uint8)
+         dist.recv(tensor_data, src=src, group=group)
+
+         serialized_data = bytes(tensor_data.cpu().numpy())
+         data = pickle.loads(serialized_data)
+         return data
+
+     # Other ranks in pp_group do nothing
+     return []
+
+
  step_counter = 0


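Note: a hedged usage sketch for point_to_point_pyobj — two processes on the gloo backend, rank 0 sending a Python object to rank 1; the port and payload are placeholders, and it assumes the helper is importable from sglang.srt.utils as added above:

    import torch.distributed as dist
    import torch.multiprocessing as mp

    def worker(rank: int):
        dist.init_process_group(
            backend="gloo", init_method="tcp://127.0.0.1:29511", rank=rank, world_size=2
        )
        from sglang.srt.utils import point_to_point_pyobj

        payload = [{"bid": 1, "tokens": [1, 2, 3]}] if rank == 0 else []
        data = point_to_point_pyobj(payload, rank, src=0, dst=1)
        if rank == 1:
            print("received:", data)  # the list sent by rank 0
        dist.destroy_process_group()

    if __name__ == "__main__":
        mp.spawn(worker, nprocs=2)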
@@ -1732,6 +1801,13 @@ def configure_ipv6(dist_init_addr):
      return port, host


+ def rank0_log(msg: str):
+     from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+     if get_tensor_model_parallel_rank() == 0:
+         logger.info(msg)
+
+
  def rank0_print(msg: str):
      from sglang.srt.distributed import get_tensor_model_parallel_rank

@@ -1905,13 +1981,16 @@ def fast_topk(values, topk, dim):
      return torch.topk(values, topk, dim=dim)


- def is_hopper_with_cuda_12_3():
+ def _check(cc_major):
      if not is_cuda():
          return False
-     is_hopper = torch.cuda.get_device_capability()[0] == 9
-     cuda_version = torch.version.cuda.split(".")
-     is_cuda_compatible = int(cuda_version[0]) == 12 and int(cuda_version[1]) >= 3
-     return is_hopper and is_cuda_compatible
+     return torch.cuda.get_device_capability()[0] == cc_major and tuple(
+         map(int, torch.version.cuda.split(".")[:2])
+     ) >= (12, 3)
+
+
+ is_ampere_with_cuda_12_3 = lambda: _check(8)
+ is_hopper_with_cuda_12_3 = lambda: _check(9)


  def get_free_port():
@@ -1970,8 +2049,11 @@ def is_fa3_default_architecture(hf_config):
          "Llama4ForConditionalGeneration",
          "LlamaForCausalLM",
          "MistralForCausalLM",
+         "MixtralForCausalLM",
          "Gemma2ForCausalLM",
          "Gemma3ForConditionalGeneration",
+         "Qwen3ForCausalLM",
+         "Qwen3MoeForCausalLM",
      }
      return architectures[0] in default_archs

sglang/test/runners.py CHANGED
@@ -423,6 +423,10 @@ class HFRunner:
              )
              del input_logits

+             if lora_paths is not None and lora_paths[i] is not None:
+                 # Unload the LoRA adapter if it is used
+                 model.unload()
+
          return ModelOutput(
              output_strs=output_strs,
              top_input_logprobs=top_input_logprobs,
sglang/test/send_one.py CHANGED
@@ -6,11 +6,56 @@ python3 -m sglang.test.send_one
  """

  import argparse
+ import dataclasses
  import json

  import requests


+ @dataclasses.dataclass
+ class BenchArgs:
+     host: str = "localhost"
+     port: int = 30000
+     batch_size: int = 1
+     temperature: float = 0.0
+     max_new_tokens: int = 512
+     frequency_penalty: float = 0.0
+     presence_penalty: float = 0.0
+     json: bool = False
+     return_logprob: bool = False
+     prompt: str = (
+         "Human: Give me a fully functional FastAPI server. Show the python code.\n\nAssistant:"
+     )
+     image: bool = False
+     stream: bool = False
+
+     @staticmethod
+     def add_cli_args(parser: argparse.ArgumentParser):
+         parser.add_argument("--host", type=str, default=BenchArgs.host)
+         parser.add_argument("--port", type=int, default=BenchArgs.port)
+         parser.add_argument("--batch-size", type=int, default=BenchArgs.batch_size)
+         parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
+         parser.add_argument(
+             "--max-new-tokens", type=int, default=BenchArgs.max_new_tokens
+         )
+         parser.add_argument(
+             "--frequency-penalty", type=float, default=BenchArgs.frequency_penalty
+         )
+         parser.add_argument(
+             "--presence-penalty", type=float, default=BenchArgs.presence_penalty
+         )
+         parser.add_argument("--json", action="store_true")
+         parser.add_argument("--return-logprob", action="store_true")
+         parser.add_argument("--prompt", type=str, default=BenchArgs.prompt)
+         parser.add_argument("--image", action="store_true")
+         parser.add_argument("--stream", action="store_true")
+
+     @classmethod
+     def from_cli_args(cls, args: argparse.Namespace):
+         attrs = [attr.name for attr in dataclasses.fields(cls)]
+         return cls(**{attr: getattr(args, attr) for attr in attrs})
+
+
  def send_one_prompt(args):
      if args.image:
          args.prompt = (
@@ -20,20 +65,42 @@ def send_one_prompt(args):
      else:
          image_data = None

-     response = requests.post(
-         "http://localhost:30000/generate",
-         json={
-             "text": args.prompt,
-             "image_data": image_data,
-             "sampling_params": {
-                 "temperature": args.temperature,
-                 "max_new_tokens": args.max_new_tokens,
-                 "frequency_penalty": args.frequency_penalty,
-                 "presence_penalty": args.presence_penalty,
-             },
-             "return_logprob": args.return_logprob,
-             "stream": args.stream,
+     prompt = args.prompt
+
+     if args.json:
+         prompt = (
+             "Human: What is the capital of France and how is that city like. "
+             "Give me 3 trivial information about that city. "
+             "Write in a format of json.\nAssistant:"
+         )
+         json_schema = "$$ANY$$"
+         json_schema = (
+             '{"type": "object", "properties": {"population": {"type": "integer"}}}'
+         )
+     else:
+         json_schema = None
+
+     if args.batch_size > 1:
+         prompt = [prompt] * args.batch_size
+
+     json_data = {
+         "text": prompt,
+         "image_data": image_data,
+         "sampling_params": {
+             "temperature": args.temperature,
+             "max_new_tokens": args.max_new_tokens,
+             "frequency_penalty": args.frequency_penalty,
+             "presence_penalty": args.presence_penalty,
+             "json_schema": json_schema,
+             "stop": ["Question", "Assistant:", "<|separator|>", "<|eos|>"],
          },
+         "return_logprob": args.return_logprob,
+         "stream": args.stream,
+     }
+
+     response = requests.post(
+         f"http://{args.host}:{args.port}/generate",
+         json=json_data,
          stream=args.stream,
      )

@@ -47,6 +114,9 @@ def send_one_prompt(args):
      else:
          ret = response.json()

+     if args.batch_size > 1:
+         ret = ret[0]
+
      latency = ret["meta_info"]["e2e_latency"]

      if "spec_verify_ct" in ret["meta_info"]:
@@ -68,21 +138,7 @@

  if __name__ == "__main__":
      parser = argparse.ArgumentParser()
-     parser.add_argument("--temperature", type=float, default=0.0)
-     parser.add_argument("--max-new-tokens", type=int, default=512)
-     parser.add_argument("--frequency-penalty", type=float, default=0.0)
-     parser.add_argument("--presence-penalty", type=float, default=0.0)
-     parser.add_argument("--return-logprob", action="store_true")
-     parser.add_argument(
-         "--prompt",
-         type=str,
-         default="Human: Give me a fully functional FastAPI server. Show the python code.\n\nAssistant:",
-     )
-     parser.add_argument(
-         "--image",
-         action="store_true",
-     )
-     parser.add_argument("--stream", action="store_true")
+     BenchArgs.add_cli_args(parser)
      args = parser.parse_args()

      send_one_prompt(args)
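Note: with the BenchArgs refactor, the script can also be driven programmatically; a sketch that assumes an sglang server is already running at the given host/port:

    import argparse

    from sglang.test.send_one import BenchArgs, send_one_prompt

    parser = argparse.ArgumentParser()
    BenchArgs.add_cli_args(parser)
    args = parser.parse_args(
        ["--host", "127.0.0.1", "--port", "30000", "--batch-size", "4", "--json"]
    )
    send_one_prompt(args)  # posts to http://127.0.0.1:30000/generate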
sglang/test/test_utils.py CHANGED
@@ -69,6 +69,7 @@ DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
  DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
      "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
  )
+ DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"

  # Nightly tests
  DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
@@ -732,6 +733,72 @@ def run_bench_one_batch(model, other_args):
      return output_throughput


+ def run_bench_offline_throughput(model, other_args):
+     command = [
+         "python3",
+         "-m",
+         "sglang.bench_offline_throughput",
+         "--num-prompts",
+         "1",
+         "--dataset-name",
+         "random",
+         "--random-input-len",
+         "256",
+         "--random-output-len",
+         "256",
+         "--model-path",
+         model,
+         *[str(x) for x in other_args],
+     ]
+
+     print(f"{command=}")
+     process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+     try:
+         stdout, stderr = process.communicate()
+         output = stdout.decode()
+         error = stderr.decode()
+         print(f"Output: {output}", flush=True)
+         print(f"Error: {error}", flush=True)
+
+         output_throughput = -1
+         for line in output.split("\n"):
+             if "Last generation throughput (tok/s):" in line:
+                 output_throughput = float(line.split(":")[-1])
+     finally:
+         kill_process_tree(process.pid)
+
+     return output_throughput
+
+
+ def run_bench_one_batch_server(
+     model,
+     base_url,
+     server_args,
+     bench_args,
+     other_server_args,
+     simulate_spec_acc_lens=None,
+ ):
+     from sglang.bench_one_batch_server import run_benchmark
+
+     if simulate_spec_acc_lens is not None:
+         env = {**os.environ, "SIMULATE_ACC_LEN": str(simulate_spec_acc_lens)}
+     else:
+         env = None
+
+     process = popen_launch_server(
+         model,
+         base_url,
+         timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+         other_args=other_server_args,
+         env=env,
+     )
+     try:
+         run_benchmark(server_args=server_args, bench_args=bench_args)
+     finally:
+         kill_process_tree(process.pid)
+
+
  def lcs(X, Y):
      m = len(X)
      n = len(Y)
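Note: a hedged usage sketch for the new test helper; the model and extra args are placeholders, and it shells out to sglang.bench_offline_throughput, so a GPU environment is required:

    from sglang.test.test_utils import run_bench_offline_throughput

    throughput = run_bench_offline_throughput(
        "meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
        ["--tp-size", 1],                    # forwarded (stringified) to the benchmark CLI
    )
    print(f"offline throughput: {throughput} tok/s")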
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.4.6"
+ __version__ = "0.4.6.post2"
{sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sglang
- Version: 0.4.6
+ Version: 0.4.6.post2
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -238,15 +238,16 @@ Requires-Dist: pynvml; extra == "runtime-common"
  Requires-Dist: python-multipart; extra == "runtime-common"
  Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
  Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
- Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
+ Requires-Dist: torchao>=0.9.0; extra == "runtime-common"
  Requires-Dist: transformers==4.51.1; extra == "runtime-common"
  Requires-Dist: uvicorn; extra == "runtime-common"
  Requires-Dist: uvloop; extra == "runtime-common"
  Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
+ Requires-Dist: blobfile==3.0.0; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
- Requires-Dist: sgl-kernel==0.0.9.post2; extra == "srt"
- Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
+ Requires-Dist: sgl-kernel==0.1.1; extra == "srt"
+ Requires-Dist: flashinfer_python==0.2.5; extra == "srt"
  Requires-Dist: torch==2.6.0; extra == "srt"
  Requires-Dist: torchvision==0.21.0; extra == "srt"
  Requires-Dist: cuda-python; extra == "srt"