sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. sglang/bench_serving.py +72 -10
  2. sglang/srt/_custom_ops.py +59 -92
  3. sglang/srt/configs/deepseekvl2.py +10 -1
  4. sglang/srt/configs/model_config.py +6 -16
  5. sglang/srt/constrained/base_grammar_backend.py +5 -1
  6. sglang/srt/custom_op.py +5 -0
  7. sglang/srt/distributed/device_communicators/custom_all_reduce.py +28 -80
  8. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  9. sglang/srt/distributed/parallel_state.py +32 -5
  10. sglang/srt/entrypoints/engine.py +0 -5
  11. sglang/srt/entrypoints/http_server.py +7 -1
  12. sglang/srt/entrypoints/verl_engine.py +2 -0
  13. sglang/srt/function_call_parser.py +0 -1
  14. sglang/srt/layers/attention/flashattention_backend.py +582 -125
  15. sglang/srt/layers/attention/flashinfer_backend.py +5 -7
  16. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
  17. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  18. sglang/srt/layers/dp_attention.py +12 -1
  19. sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
  20. sglang/srt/layers/moe/ep_moe/layer.py +79 -80
  21. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +403 -47
  26. sglang/srt/layers/moe/topk.py +79 -6
  27. sglang/srt/layers/quantization/__init__.py +137 -165
  28. sglang/srt/layers/quantization/awq.py +200 -0
  29. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
  30. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
  31. sglang/srt/layers/quantization/fp8_kernel.py +2 -1
  32. sglang/srt/layers/quantization/fp8_utils.py +1 -4
  33. sglang/srt/layers/quantization/gptq.py +30 -40
  34. sglang/srt/layers/quantization/moe_wna16.py +501 -0
  35. sglang/srt/layers/quantization/utils.py +1 -1
  36. sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
  37. sglang/srt/lora/backend/base_backend.py +4 -4
  38. sglang/srt/lora/backend/flashinfer_backend.py +12 -9
  39. sglang/srt/lora/backend/triton_backend.py +5 -8
  40. sglang/srt/lora/layers.py +19 -33
  41. sglang/srt/lora/lora_manager.py +20 -7
  42. sglang/srt/lora/mem_pool.py +12 -6
  43. sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
  44. sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
  45. sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
  46. sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
  47. sglang/srt/lora/utils.py +6 -0
  48. sglang/srt/managers/cache_controller.py +34 -11
  49. sglang/srt/managers/io_struct.py +4 -2
  50. sglang/srt/managers/mm_utils.py +202 -156
  51. sglang/srt/managers/multimodal_processor.py +0 -2
  52. sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
  53. sglang/srt/managers/multimodal_processors/clip.py +44 -0
  54. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
  55. sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
  56. sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
  57. sglang/srt/managers/multimodal_processors/llava.py +34 -14
  58. sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
  59. sglang/srt/managers/multimodal_processors/mlama.py +10 -23
  60. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
  61. sglang/srt/managers/schedule_batch.py +185 -127
  62. sglang/srt/managers/scheduler.py +29 -23
  63. sglang/srt/managers/tokenizer_manager.py +1 -2
  64. sglang/srt/managers/tp_worker.py +3 -0
  65. sglang/srt/managers/utils.py +1 -6
  66. sglang/srt/mem_cache/hiradix_cache.py +62 -52
  67. sglang/srt/mem_cache/memory_pool.py +72 -6
  68. sglang/srt/mem_cache/paged_allocator.py +39 -0
  69. sglang/srt/metrics/collector.py +23 -53
  70. sglang/srt/model_executor/cuda_graph_runner.py +16 -13
  71. sglang/srt/model_executor/forward_batch_info.py +10 -10
  72. sglang/srt/model_executor/model_runner.py +64 -59
  73. sglang/srt/model_loader/loader.py +19 -1
  74. sglang/srt/model_loader/weight_utils.py +6 -3
  75. sglang/srt/models/clip.py +568 -0
  76. sglang/srt/models/deepseek_janus_pro.py +12 -17
  77. sglang/srt/models/deepseek_v2.py +339 -123
  78. sglang/srt/models/deepseek_vl2.py +105 -104
  79. sglang/srt/models/gemma3_causal.py +12 -2
  80. sglang/srt/models/gemma3_mm.py +20 -80
  81. sglang/srt/models/llama.py +4 -1
  82. sglang/srt/models/llava.py +31 -19
  83. sglang/srt/models/llavavid.py +16 -7
  84. sglang/srt/models/minicpmo.py +63 -147
  85. sglang/srt/models/minicpmv.py +17 -27
  86. sglang/srt/models/mllama.py +29 -14
  87. sglang/srt/models/qwen2.py +9 -6
  88. sglang/srt/models/qwen2_5_vl.py +21 -31
  89. sglang/srt/models/qwen2_vl.py +20 -21
  90. sglang/srt/openai_api/adapter.py +106 -93
  91. sglang/srt/openai_api/protocol.py +10 -5
  92. sglang/srt/patch_torch.py +71 -0
  93. sglang/srt/platforms/interface.py +371 -0
  94. sglang/srt/server_args.py +120 -25
  95. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
  96. sglang/srt/speculative/eagle_utils.py +140 -28
  97. sglang/srt/speculative/eagle_worker.py +94 -25
  98. sglang/srt/utils.py +137 -51
  99. sglang/test/runners.py +27 -2
  100. sglang/test/test_custom_ops.py +55 -0
  101. sglang/test/test_utils.py +14 -27
  102. sglang/utils.py +2 -2
  103. sglang/version.py +1 -1
  104. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/METADATA +10 -5
  105. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/RECORD +108 -99
  106. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/WHEEL +0 -0
  107. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/licenses/LICENSE +0 -0
  108. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py CHANGED
@@ -44,6 +44,12 @@ ASSISTANT_SUFFIX = "Assistant:"
 global args
 
 
+# don't want to import sglang package here
+def _get_bool_env_var(name: str, default: str = "false") -> bool:
+    value = os.getenv(name, default)
+    return value.lower() in ("true", "1")
+
+
 @dataclass
 class RequestFuncInput:
     prompt: str
@@ -965,10 +971,11 @@ async def benchmark(
     request_rate: float,
     max_concurrency: Optional[int],
     disable_tqdm: bool,
-    lora_name: str,
+    lora_names: List[str],
     extra_request_body: Dict[str, Any],
     profile: bool,
     pd_seperated: bool = False,
+    flush_cache: bool = False,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -986,8 +993,16 @@ async def benchmark(
         return await request_func(request_func_input=request_func_input, pbar=pbar)
 
     # Warmup
-    print("Starting initial single prompt test run...")
+    print(f"Starting warmup with {args.warmup_requests} sequences...")
+
+    # Use the first request for all warmup iterations
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
+    if lora_names != None and len(lora_names) != 0:
+        lora_name = lora_names[0]
+    else:
+        lora_name = None
+
+    # Create the test input once
     test_input = RequestFuncInput(
         model=model_id,
         prompt=test_prompt,
@@ -997,17 +1012,29 @@ async def benchmark(
         lora_name=lora_name,
         extra_request_body=extra_request_body,
     )
-    test_output = await request_func(request_func_input=test_input)
-    if not test_output.success:
+
+    # Run warmup requests
+    warmup_tasks = []
+    for _ in range(args.warmup_requests):
+        warmup_tasks.append(
+            asyncio.create_task(request_func(request_func_input=test_input))
+        )
+
+    warmup_outputs = await asyncio.gather(*warmup_tasks)
+
+    # Check if at least one warmup request succeeded
+    if not any(output.success for output in warmup_outputs):
         raise ValueError(
-            "Initial test run failed - Please make sure benchmark arguments "
-            f"are correctly specified. Error: {test_output.error}"
+            "Warmup failed - Please make sure benchmark arguments "
+            f"are correctly specified. Error: {warmup_outputs[0].error}"
         )
     else:
-        print("Initial test run completed. Starting main benchmark run...")
+        print(
+            f"Warmup completed with {args.warmup_requests} sequences. Starting main benchmark run..."
+        )
 
     # Flush cache
-    if "sglang" in backend:
+    if ("sglang" in backend and _get_bool_env_var("SGLANG_IS_IN_CI")) or flush_cache:
         requests.post(base_url + "/flush_cache", headers=get_auth_headers())
 
     time.sleep(1.0)
@@ -1028,6 +1055,12 @@ async def benchmark(
     tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
+        if lora_names != None and len(lora_names) != 0:
+            idx = random.randint(0, len(lora_names) - 1)
+            lora_name = lora_names[idx]
+        else:
+            lora_name = None
+
         request_func_input = RequestFuncInput(
             model=model_id,
             prompt=prompt,
@@ -1235,6 +1268,10 @@ def run_benchmark(args_: argparse.Namespace):
     if not hasattr(args, "max_concurrency"):
         args.max_concurrency = None
 
+    # Set default value for warmup_requests if not present
+    if not hasattr(args, "warmup_requests"):
+        args.warmup_requests = 1
+
     print(f"benchmark_args={args}")
 
     # Set global environments
@@ -1336,6 +1373,10 @@ def run_benchmark(args_: argparse.Namespace):
     tokenizer = get_tokenizer(tokenizer_id)
     input_requests = get_dataset(args, tokenizer)
 
+    # compatible with SimpleNamespace
+    if not hasattr(args, "flush_cache"):
+        args.flush_cache = False
+
     return asyncio.run(
         benchmark(
             backend=backend,
@@ -1347,10 +1388,11 @@ def run_benchmark(args_: argparse.Namespace):
            request_rate=args.request_rate,
            max_concurrency=args.max_concurrency,
            disable_tqdm=args.disable_tqdm,
-            lora_name=args.lora_name,
+            lora_names=args.lora_name,
            extra_request_body=extra_request_body,
            profile=args.profile,
            pd_seperated=args.pd_seperated,
+            flush_cache=args.flush_cache,
        )
    )
 
@@ -1366,6 +1408,13 @@ def set_ulimit(target_soft_limit=65535):
        print(f"Fail to set RLIMIT_NOFILE: {e}")
 
 
+class LoRAPathAction(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        setattr(namespace, self.dest, [])
+        for lora_name in values:
+            getattr(namespace, self.dest).append(lora_name)
+
+
 if __name__ == "__main__":
    parser = ArgumentParser(description="Benchmark the online serving throughput.")
    parser.add_argument(
@@ -1509,8 +1558,10 @@ if __name__ == "__main__":
    parser.add_argument(
        "--lora-name",
        type=str,
+        nargs="*",
        default=None,
-        help="The name of LoRA adapter",
+        action=LoRAPathAction,
+        help="The names of LoRA adapters. You can provide a list of names in the format {name} {name} {name}...",
    )
    parser.add_argument(
        "--prompt-suffix",
@@ -1523,6 +1574,17 @@ if __name__ == "__main__":
        action="store_true",
        help="Benchmark PD disaggregation server",
    )
+    parser.add_argument(
+        "--flush-cache",
+        action="store_true",
+        help="Flush the cache before running the benchmark",
+    )
+    parser.add_argument(
+        "--warmup-requests",
+        type=int,
+        default=1,
+        help="Number of warmup requests to run before the benchmark",
+    )
 
    group = parser.add_argument_group("generated-shared-prefix dataset arguments")
    group.add_argument(
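Side note on the new multi-adapter --lora-name handling: below is a minimal, standalone Python sketch of how the flag collects several adapter names (the LoRAPathAction class is copied from the diff above; the parser setup, adapter names, and print are illustrative only, not part of the package).

import argparse

class LoRAPathAction(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
        # Collect every name passed after --lora-name into a list
        setattr(namespace, self.dest, [])
        for lora_name in values:
            getattr(namespace, self.dest).append(lora_name)

parser = argparse.ArgumentParser()  # illustrative parser, not the benchmark's own
parser.add_argument("--lora-name", type=str, nargs="*", default=None, action=LoRAPathAction)
args = parser.parse_args(["--lora-name", "adapter_a", "adapter_b"])
print(args.lora_name)  # ['adapter_a', 'adapter_b']

During the benchmark run, the loop shown earlier then picks one of the collected names at random for each request.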
sglang/srt/_custom_ops.py CHANGED
@@ -27,17 +27,20 @@ if not is_hpu():
         logger.warning("Failed to import from custom_ar with %r", e)
 
 
-    if use_vllm_custom_allreduce and not is_hip():
-        # vLLM custom allreduce
+    if not is_hip():
+        if use_vllm_custom_allreduce:
+            custom_op = torch.ops._C_custom_ar
+        else:
+            custom_op = sgl_kernel.allreduce
+
+        # custom allreduce
         def init_custom_ar(
             ipc_tensors: List[torch.Tensor],
             rank_data: torch.Tensor,
             rank: int,
             full_nvlink: bool,
         ) -> int:
-            return torch.ops._C_custom_ar.init_custom_ar(
-                ipc_tensors, rank_data, rank, full_nvlink
-            )
+            return custom_op.init_custom_ar(ipc_tensors, rank_data, rank, full_nvlink)
 
         def all_reduce(
             fa: int,
@@ -46,105 +49,69 @@ if use_vllm_custom_allreduce and not is_hip():
             reg_buffer: int,
             reg_buffer_sz_bytes: int,
         ) -> None:
-            torch.ops._C_custom_ar.all_reduce(fa, inp, out, reg_buffer, reg_buffer_sz_bytes)
+            custom_op.all_reduce(fa, inp, out, reg_buffer, reg_buffer_sz_bytes)
 
         def dispose(fa: int) -> None:
-            torch.ops._C_custom_ar.dispose(fa)
+            custom_op.dispose(fa)
 
         def meta_size() -> int:
-            return torch.ops._C_custom_ar.meta_size()
+            return custom_op.meta_size()
 
         def register_buffer(fa: int, ipc_tensors: List[int]) -> None:
-            return torch.ops._C_custom_ar.register_buffer(fa, ipc_tensors)
+            return custom_op.register_buffer(fa, ipc_tensors)
 
         def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
-            return torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa)
+            return custom_op.get_graph_buffer_ipc_meta(fa)
 
         def register_graph_buffers(
             fa: int, handles: List[List[int]], offsets: List[List[int]]
         ) -> None:
-            torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets)
+            custom_op.register_graph_buffers(fa, handles, offsets)
 
     else:
-        if is_hip():
-            # ROCM custom allreduce
-
-            def init_custom_ar(
-                meta: torch.Tensor,
-                rank_data: torch.Tensor,
-                handles: List[str],
-                offsets: List[int],
-                rank: int,
-                full_nvlink: bool,
-            ) -> int:
-                return sgl_kernel.allreduce.init_custom_ar(
-                    meta, rank_data, handles, offsets, rank, full_nvlink
-                )
-
-            def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
-                sgl_kernel.allreduce.all_reduce_reg(fa, inp, out)
-
-            def all_reduce_unreg(
-                fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
-            ) -> None:
-                sgl_kernel.allreduce.all_reduce_unreg(fa, inp, reg_buffer, out)
-
-            def dispose(fa: int) -> None:
-                sgl_kernel.allreduce.dispose(fa)
-
-            def meta_size() -> int:
-                return sgl_kernel.allreduce.meta_size()
-
-            def register_buffer(
-                fa: int, t: torch.Tensor, handles: List[str], offsets: List[int]
-            ) -> None:
-                return sgl_kernel.allreduce.register_buffer(fa, t, handles, offsets)
-
-            def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
-                return sgl_kernel.allreduce.get_graph_buffer_ipc_meta(fa)
-
-            def register_graph_buffers(
-                fa: int, handles: List[str], offsets: List[List[int]]
-            ) -> None:
-                sgl_kernel.allreduce.register_graph_buffers(fa, handles, offsets)
-
-            def allocate_meta_buffer(size: int) -> torch.Tensor:
-                return sgl_kernel.allreduce.allocate_meta_buffer(size)
-
-            def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
-                return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)
+        # ROCM custom allreduce
 
-        else:
-            # TRTLLM custom allreduce
-            def init_custom_ar(
-                rank_id: int,
-                world_size: int,
-                rank_data_base: torch.Tensor,
-                buffers: List[int],
-                tmp_result_buffers: List[int],
-                barrier_in: List[int],
-                barrier_out: List[int],
-            ) -> int:
-                return sgl_kernel.init_custom_reduce(
-                    rank_id,
-                    world_size,
-                    rank_data_base,
-                    buffers,
-                    tmp_result_buffers,
-                    barrier_in,
-                    barrier_out,
-                )
-
-            def all_reduce(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
-                sgl_kernel.custom_reduce(fa, inp, out)
-
-            def dispose(fa: int) -> None:
-                sgl_kernel.custom_dispose(fa)
-
-            def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
-                return sgl_kernel.get_graph_buffer_ipc_meta(fa)
-
-            def register_graph_buffers(
-                fa: int, handles: List[List[int]], offsets: List[List[int]]
-            ) -> None:
-                sgl_kernel.register_graph_buffers(fa, handles, offsets)
+        def init_custom_ar(
+            meta: torch.Tensor,
+            rank_data: torch.Tensor,
+            handles: List[str],
+            offsets: List[int],
+            rank: int,
+            full_nvlink: bool,
+        ) -> int:
+            return sgl_kernel.allreduce.init_custom_ar(
+                meta, rank_data, handles, offsets, rank, full_nvlink
+            )
+
+        def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
+            sgl_kernel.allreduce.all_reduce_reg(fa, inp, out)
+
+        def all_reduce_unreg(
+            fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
+        ) -> None:
+            sgl_kernel.allreduce.all_reduce_unreg(fa, inp, reg_buffer, out)
+
+        def dispose(fa: int) -> None:
+            sgl_kernel.allreduce.dispose(fa)
+
+        def meta_size() -> int:
+            return sgl_kernel.allreduce.meta_size()
+
+        def register_buffer(
+            fa: int, t: torch.Tensor, handles: List[str], offsets: List[int]
+        ) -> None:
+            return sgl_kernel.allreduce.register_buffer(fa, t, handles, offsets)
+
+        def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
+            return sgl_kernel.allreduce.get_graph_buffer_ipc_meta(fa)
+
+        def register_graph_buffers(
+            fa: int, handles: List[str], offsets: List[List[int]]
+        ) -> None:
+            sgl_kernel.allreduce.register_graph_buffers(fa, handles, offsets)
+
+        def allocate_meta_buffer(size: int) -> torch.Tensor:
+            return sgl_kernel.allreduce.allocate_meta_buffer(size)
+
+        def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
+            return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)
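The net effect of this refactor is that the non-HIP path binds a single custom_op namespace up front instead of duplicating every wrapper per backend, and the HIP path keeps only the ROCm sgl_kernel.allreduce wrappers. A toy, runnable illustration of that dispatch pattern (the two namespaces below are stand-ins for illustration only, not the real torch.ops._C_custom_ar / sgl_kernel.allreduce extensions):

from types import SimpleNamespace

# Stand-in op namespaces (hypothetical, for illustration only)
vllm_ops = SimpleNamespace(meta_size=lambda: 16)
sgl_ops = SimpleNamespace(meta_size=lambda: 32)

use_vllm_custom_allreduce = False  # assumption for this example
custom_op = vllm_ops if use_vllm_custom_allreduce else sgl_ops

def meta_size() -> int:
    # Every wrapper routes through the namespace chosen once at import time
    return custom_op.meta_size()

print(meta_size())  # 32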
sglang/srt/configs/deepseekvl2.py CHANGED
@@ -4,7 +4,6 @@ from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
 
 import torch
-import torchvision.transforms as T
 from PIL import Image, ImageOps
 from transformers import (
     AutoProcessor,
@@ -76,6 +75,16 @@ class ImageTransform(object):
         self.std = std
         self.normalize = normalize
 
+        # only load torchvision.transforms when needed
+        try:
+            import torchvision.transforms as T
+
+            # FIXME: add version check for gguf
+        except ImportError as err:
+            raise ImportError(
+                "Please install torchvision via `pip install torchvision` to use Deepseek-VL2."
+            ) from err
+
         transform_pipelines = [T.ToTensor()]
 
         if normalize:
sglang/srt/configs/model_config.py CHANGED
@@ -22,11 +22,7 @@ import torch
 from transformers import PretrainedConfig
 
 from sglang.srt.hf_transformers_utils import get_config, get_context_length
-from sglang.srt.layers.quantization import (
-    BASE_QUANTIZATION_METHODS,
-    QUANTIZATION_METHODS,
-    VLLM_AVAILABLE,
-)
+from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.utils import get_bool_env_var, is_hip
 
 logger = logging.getLogger(__name__)
@@ -239,12 +235,7 @@ class ModelConfig:
 
     # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
     def _verify_quantization(self) -> None:
-        # Select supported quantization methods based on vllm availability
-        if VLLM_AVAILABLE:
-            supported_quantization = [*QUANTIZATION_METHODS]
-        else:
-            supported_quantization = [*BASE_QUANTIZATION_METHODS]
-
+        supported_quantization = [*QUANTIZATION_METHODS]
         rocm_supported_quantization = [
             "awq",
             "gptq",
@@ -267,6 +258,7 @@ class ModelConfig:
             "experts_int8",
             "w8a8_int8",
             "w8a8_fp8",
+            "moe_wna16",
         ]
         compatible_quantization_methods = {
             "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
@@ -282,11 +274,7 @@ class ModelConfig:
             quant_method = quant_cfg.get("quant_method", "").lower()
 
             # Detect which checkpoint is it
-            # Only iterate through currently available quantization methods
-            available_methods = (
-                QUANTIZATION_METHODS if VLLM_AVAILABLE else BASE_QUANTIZATION_METHODS
-            )
-            for _, method in available_methods.items():
+            for _, method in QUANTIZATION_METHODS.items():
                 quantization_override = method.override_quantization_method(
                     quant_cfg, self.quantization
                 )
@@ -467,6 +455,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
         or "InternLM2ForRewardModel" in model_architectures
         or "Qwen2ForRewardModel" in model_architectures
         or "Qwen2ForSequenceClassification" in model_architectures
+        or "CLIPModel" in model_architectures
     ):
         return False
     else:
@@ -488,6 +477,7 @@ multimodal_model_archs = [
     "MllamaForConditionalGeneration",
     "Qwen2VLForConditionalGeneration",
     "Qwen2_5_VLForConditionalGeneration",
+    "CLIPModel",
 ]
 
 
sglang/srt/constrained/base_grammar_backend.py CHANGED
@@ -169,7 +169,9 @@ class BaseGrammarBackend(ABC):
         self.cache.clear()
 
 
-def create_grammar_backend(server_args: ServerArgs, tokenizer, vocab_size):
+def create_grammar_backend(
+    server_args: ServerArgs, tokenizer, vocab_size: int
+) -> Optional[BaseGrammarBackend]:
     if server_args.grammar_backend == "outlines":
         from sglang.srt.constrained.outlines_backend import OutlinesGrammarBackend
 
@@ -188,6 +190,8 @@ def create_grammar_backend(server_args: ServerArgs, tokenizer, vocab_size):
             tokenizer=tokenizer,
             whitespace_pattern=server_args.constrained_json_whitespace_pattern,
         )
+    elif server_args.grammar_backend == "none":
+        return None
     else:
         raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")
 
sglang/srt/custom_op.py CHANGED
@@ -50,6 +50,7 @@ if _is_cuda:
     def scaled_fp8_quant(
         input: torch.Tensor,
         scale: Optional[torch.Tensor] = None,
+        num_token_padding: Optional[int] = None,
         use_per_token_if_dynamic: bool = False,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
@@ -59,6 +60,8 @@ if _is_cuda:
             input (torch.Tensor): Input tensor to be quantized
             scale (Optional[torch.Tensor]): Pre-computed scaling factor for static quantization.
                 If None, scales will be computed dynamically.
+            num_token_padding (Optional[int]): If specified, pad the first dimension
+                of the output to at least this value.
             use_per_token_if_dynamic (bool): When using dynamic scaling (scale=None),
                 determines the quantization granularity:
                 - True: compute scale per token
@@ -75,6 +78,8 @@ if _is_cuda:
         assert input.ndim == 2, f"Expected 2D input tensor, got {input.ndim}D"
         shape = input.shape
         out_dtype = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+        if num_token_padding:
+            shape = (max(num_token_padding, input.shape[0]), shape[1])
         output = torch.empty(shape, device=input.device, dtype=out_dtype)
 
         if scale is None:
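For reference, a small self-contained sketch of the num_token_padding semantics added above (only the output-shape computation is reproduced; the helper name is hypothetical and the actual FP8 kernel call is omitted):

import torch

def padded_fp8_output_shape(input: torch.Tensor, num_token_padding=None):
    # Hypothetical helper mirroring the shape logic in scaled_fp8_quant:
    # pad dim 0 of the output up to num_token_padding if it is given.
    shape = input.shape
    if num_token_padding:
        shape = (max(num_token_padding, input.shape[0]), shape[1])
    return shape

x = torch.randn(3, 128)
print(padded_fp8_output_shape(x))                       # torch.Size([3, 128])
print(padded_fp8_output_shape(x, num_token_padding=8))  # (8, 128)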
sglang/srt/distributed/device_communicators/custom_all_reduce.py CHANGED
@@ -5,7 +5,7 @@ import logging
 import os
 from contextlib import contextmanager
 from functools import wraps
-from typing import Callable, List, Optional, TypeVar, Union
+from typing import Any, Callable, List, Optional, TypeVar, Union
 
 import torch
 import torch.distributed as dist
@@ -18,7 +18,7 @@ from sglang.srt.distributed.device_communicators.custom_all_reduce_utils import
     gpu_p2p_access_check,
 )
 from sglang.srt.distributed.parallel_state import in_the_same_node_as
-from sglang.srt.utils import cuda_device_count_stateless, is_cuda, is_hip
+from sglang.srt.utils import is_cuda, is_hip
 
 logger = logging.getLogger(__name__)
 
@@ -217,7 +217,7 @@ class CustomAllreduce:
         if cuda_visible_devices:
             device_ids = list(map(int, cuda_visible_devices.split(",")))
         else:
-            device_ids = list(range(cuda_device_count_stateless()))
+            device_ids = list(range(torch.cuda.device_count()))
 
         physical_device_id = device_ids[device.index]
         tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu")
@@ -257,7 +257,7 @@ class CustomAllreduce:
         self.world_size = world_size
         self.full_nvlink = full_nvlink
 
-        if ops.use_vllm_custom_allreduce and not _is_hip:
+        if not _is_hip:
             # Buffers memory are owned by this Python class and passed to C++.
             # Meta data composes of two parts: meta data for synchronization and a
             # temporary buffer for storing intermediate allreduce results.
@@ -280,56 +280,24 @@ class CustomAllreduce:
             )
             ops.register_buffer(self._ptr, self.buffer_ptrs)
         else:
-            if _is_hip:
-                # meta data buffers need to be "uncached" for signal on MI200
-                self.meta = ops.allocate_meta_buffer(ops.meta_size() + max_size)
-                self.buffer = torch.empty(
-                    max_size, dtype=torch.uint8, device=self.device
-                )
-                handle = ops.get_meta_buffer_ipc_handle(self.meta)
-                shard_data = (
-                    bytes(handle),  # ipc handle to base ptr
-                    0,  # offset of base ptr
-                )
-                handles, offsets = self._gather_ipc_meta(shard_data)
-                self.rank_data = torch.empty(
-                    8 * 1024 * 1024, dtype=torch.uint8, device=self.device
-                )
-                self._ptr = ops.init_custom_ar(
-                    self.meta, self.rank_data, handles, offsets, rank, self.full_nvlink
-                )
-                self.register_buffer(self.buffer)
-                self.MSCCL = os.getenv("RCCL_MSCCL_ENABLE", "1") == "1"
-            else:
-                # From TensorRT-LLM getMaxRequiredWorkspaceSize
-                self.max_required_workspace_size = [16 * 1024 * 1024, 8 * 1024 * 1024]
-
-                # sizeof(uint32_t) * (MAX_ALL_REDUCE_BLOCKS + 2) * MAX_RANKS_PER_NODE;
-                self.barrier_max_size = 8 * (36 + 2) * 8
-
-                self.buffer_ptrs = self.create_shared_buffer(max_size, group=group)
-                self.tmp_result_buffer_ptrs = self.create_shared_buffer(
-                    max_size, group=group
-                )
-                self.rank_data_base = torch.empty(
-                    8 * 1024 * 1024, dtype=torch.uint8, device=self.device
-                )
-                self.barrier_in_ptrs = self.create_shared_buffer(
-                    self.barrier_max_size, group=group
-                )
-                self.barrier_out_ptrs = self.create_shared_buffer(
-                    self.barrier_max_size, group=group
-                )
+            # meta data buffers need to be "uncached" for signal on MI200
+            self.meta = ops.allocate_meta_buffer(ops.meta_size() + max_size)
+            self.buffer = torch.empty(max_size, dtype=torch.uint8, device=self.device)
+            handle = ops.get_meta_buffer_ipc_handle(self.meta)
+            shard_data = (
+                bytes(handle),  # ipc handle to base ptr
+                0,  # offset of base ptr
+            )
+            handles, offsets = self._gather_ipc_meta(shard_data)
+            self.rank_data = torch.empty(
+                8 * 1024 * 1024, dtype=torch.uint8, device=self.device
+            )
+            self._ptr = ops.init_custom_ar(
+                self.meta, self.rank_data, handles, offsets, rank, self.full_nvlink
+            )
+            self.register_buffer(self.buffer)
+            self.MSCCL = os.getenv("RCCL_MSCCL_ENABLE", "1") == "1"
 
-            self._ptr = ops.init_custom_ar(
-                rank,
-                world_size,
-                self.rank_data_base,
-                self.buffer_ptrs,
-                self.tmp_result_buffer_ptrs,
-                self.barrier_in_ptrs,
-                self.barrier_out_ptrs,
-            )
         self.disabled = False
 
     @staticmethod
@@ -455,7 +423,7 @@ class CustomAllreduce:
             return False
         # for 4 or more non NVLink-capable GPUs, custom allreduce provides
         # little performance improvement over NCCL.
-        if ops.use_vllm_custom_allreduce and not _is_hip:
+        if not _is_hip:
             if self.world_size == 2 or self.full_nvlink:
                 return inp_size < self.max_size
             return False
@@ -471,18 +439,6 @@ class CustomAllreduce:
                 return inp_size < self.max_size
             return False
 
-        if self.world_size == 2:
-            return (
-                inp_size < self.max_size
-                and inp_size < self.max_required_workspace_size[0]
-            )
-
-        if self.full_nvlink:
-            return (
-                inp_size < self.max_size
-                and inp_size < self.max_required_workspace_size[1]
-            )
-
         return False
 
     # all reduce, assuming inp tensor is IPC registered with register_buffer,
@@ -515,15 +471,12 @@ class CustomAllreduce:
         """
         if out is None:
             out = torch.empty_like(inp)
-        if ops.use_vllm_custom_allreduce:
-            if registered:
-                ops.all_reduce(self._ptr, inp, out, 0, 0)
-            else:
-                ops.all_reduce(
-                    self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size
-                )
+        if registered:
+            ops.all_reduce(self._ptr, inp, out, 0, 0)
         else:
-            ops.all_reduce(self._ptr, inp, out)
+            ops.all_reduce(
+                self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size
+            )
         return out
 
     def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
@@ -554,14 +507,9 @@ class CustomAllreduce:
     def close(self):
         if not self.disabled and self._ptr:
             ops.dispose(self._ptr)
-            if ops.use_vllm_custom_allreduce:
+            if _is_cuda:
                 self.free_shared_buffer(self.meta_ptrs)
                 self.free_shared_buffer(self.buffer_ptrs)
-            elif _is_cuda:
-                self.free_shared_buffer(self.buffer_ptrs)
-                self.free_shared_buffer(self.tmp_result_buffer_ptrs)
-                self.free_shared_buffer(self.barrier_in_ptrs)
-                self.free_shared_buffer(self.barrier_out_ptrs)
             self._ptr = 0
 
     def __del__(self):