sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post4__py3-none-any.whl
This diff shows the changes between two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/bench_serving.py +72 -10
- sglang/srt/_custom_ops.py +59 -92
- sglang/srt/configs/deepseekvl2.py +10 -1
- sglang/srt/configs/model_config.py +6 -16
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/custom_op.py +5 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +28 -80
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/parallel_state.py +32 -5
- sglang/srt/entrypoints/engine.py +0 -5
- sglang/srt/entrypoints/http_server.py +7 -1
- sglang/srt/entrypoints/verl_engine.py +2 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/attention/flashattention_backend.py +582 -125
- sglang/srt/layers/attention/flashinfer_backend.py +5 -7
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
- sglang/srt/layers/attention/flashmla_backend.py +1 -1
- sglang/srt/layers/dp_attention.py +12 -1
- sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
- sglang/srt/layers/moe/ep_moe/layer.py +79 -80
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +403 -47
- sglang/srt/layers/moe/topk.py +79 -6
- sglang/srt/layers/quantization/__init__.py +137 -165
- sglang/srt/layers/quantization/awq.py +200 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
- sglang/srt/layers/quantization/fp8_kernel.py +2 -1
- sglang/srt/layers/quantization/fp8_utils.py +1 -4
- sglang/srt/layers/quantization/gptq.py +30 -40
- sglang/srt/layers/quantization/moe_wna16.py +501 -0
- sglang/srt/layers/quantization/utils.py +1 -1
- sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
- sglang/srt/lora/backend/base_backend.py +4 -4
- sglang/srt/lora/backend/flashinfer_backend.py +12 -9
- sglang/srt/lora/backend/triton_backend.py +5 -8
- sglang/srt/lora/layers.py +19 -33
- sglang/srt/lora/lora_manager.py +20 -7
- sglang/srt/lora/mem_pool.py +12 -6
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
- sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
- sglang/srt/lora/utils.py +6 -0
- sglang/srt/managers/cache_controller.py +34 -11
- sglang/srt/managers/io_struct.py +4 -2
- sglang/srt/managers/mm_utils.py +202 -156
- sglang/srt/managers/multimodal_processor.py +0 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
- sglang/srt/managers/multimodal_processors/clip.py +44 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
- sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
- sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
- sglang/srt/managers/multimodal_processors/llava.py +34 -14
- sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
- sglang/srt/managers/multimodal_processors/mlama.py +10 -23
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
- sglang/srt/managers/schedule_batch.py +185 -127
- sglang/srt/managers/scheduler.py +29 -23
- sglang/srt/managers/tokenizer_manager.py +1 -2
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/utils.py +1 -6
- sglang/srt/mem_cache/hiradix_cache.py +62 -52
- sglang/srt/mem_cache/memory_pool.py +72 -6
- sglang/srt/mem_cache/paged_allocator.py +39 -0
- sglang/srt/metrics/collector.py +23 -53
- sglang/srt/model_executor/cuda_graph_runner.py +16 -13
- sglang/srt/model_executor/forward_batch_info.py +10 -10
- sglang/srt/model_executor/model_runner.py +64 -59
- sglang/srt/model_loader/loader.py +19 -1
- sglang/srt/model_loader/weight_utils.py +6 -3
- sglang/srt/models/clip.py +568 -0
- sglang/srt/models/deepseek_janus_pro.py +12 -17
- sglang/srt/models/deepseek_v2.py +339 -123
- sglang/srt/models/deepseek_vl2.py +105 -104
- sglang/srt/models/gemma3_causal.py +12 -2
- sglang/srt/models/gemma3_mm.py +20 -80
- sglang/srt/models/llama.py +4 -1
- sglang/srt/models/llava.py +31 -19
- sglang/srt/models/llavavid.py +16 -7
- sglang/srt/models/minicpmo.py +63 -147
- sglang/srt/models/minicpmv.py +17 -27
- sglang/srt/models/mllama.py +29 -14
- sglang/srt/models/qwen2.py +9 -6
- sglang/srt/models/qwen2_5_vl.py +21 -31
- sglang/srt/models/qwen2_vl.py +20 -21
- sglang/srt/openai_api/adapter.py +106 -93
- sglang/srt/openai_api/protocol.py +10 -5
- sglang/srt/patch_torch.py +71 -0
- sglang/srt/platforms/interface.py +371 -0
- sglang/srt/server_args.py +120 -25
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
- sglang/srt/speculative/eagle_utils.py +140 -28
- sglang/srt/speculative/eagle_worker.py +94 -25
- sglang/srt/utils.py +137 -51
- sglang/test/runners.py +27 -2
- sglang/test/test_custom_ops.py +55 -0
- sglang/test/test_utils.py +14 -27
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/METADATA +10 -5
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/RECORD +108 -99
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/WHEEL +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py
CHANGED
@@ -44,6 +44,12 @@ ASSISTANT_SUFFIX = "Assistant:"
 global args
 
 
+# don't want to import sglang package here
+def _get_bool_env_var(name: str, default: str = "false") -> bool:
+    value = os.getenv(name, default)
+    return value.lower() in ("true", "1")
+
+
 @dataclass
 class RequestFuncInput:
     prompt: str
@@ -965,10 +971,11 @@ async def benchmark(
     request_rate: float,
     max_concurrency: Optional[int],
     disable_tqdm: bool,
-
+    lora_names: List[str],
     extra_request_body: Dict[str, Any],
     profile: bool,
     pd_seperated: bool = False,
+    flush_cache: bool = False,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -986,8 +993,16 @@ async def benchmark(
         return await request_func(request_func_input=request_func_input, pbar=pbar)
 
     # Warmup
-    print("Starting
+    print(f"Starting warmup with {args.warmup_requests} sequences...")
+
+    # Use the first request for all warmup iterations
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
+    if lora_names != None and len(lora_names) != 0:
+        lora_name = lora_names[0]
+    else:
+        lora_name = None
+
+    # Create the test input once
     test_input = RequestFuncInput(
         model=model_id,
         prompt=test_prompt,
@@ -997,17 +1012,29 @@ async def benchmark(
         lora_name=lora_name,
         extra_request_body=extra_request_body,
     )
-
-
+
+    # Run warmup requests
+    warmup_tasks = []
+    for _ in range(args.warmup_requests):
+        warmup_tasks.append(
+            asyncio.create_task(request_func(request_func_input=test_input))
+        )
+
+    warmup_outputs = await asyncio.gather(*warmup_tasks)
+
+    # Check if at least one warmup request succeeded
+    if not any(output.success for output in warmup_outputs):
         raise ValueError(
-            "
-            f"are correctly specified. Error: {
+            "Warmup failed - Please make sure benchmark arguments "
+            f"are correctly specified. Error: {warmup_outputs[0].error}"
         )
     else:
-        print(
+        print(
+            f"Warmup completed with {args.warmup_requests} sequences. Starting main benchmark run..."
+        )
 
     # Flush cache
-    if "sglang" in backend:
+    if ("sglang" in backend and _get_bool_env_var("SGLANG_IS_IN_CI")) or flush_cache:
         requests.post(base_url + "/flush_cache", headers=get_auth_headers())
 
     time.sleep(1.0)
@@ -1028,6 +1055,12 @@ async def benchmark(
     tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
+        if lora_names != None and len(lora_names) != 0:
+            idx = random.randint(0, len(lora_names) - 1)
+            lora_name = lora_names[idx]
+        else:
+            lora_name = None
+
         request_func_input = RequestFuncInput(
             model=model_id,
             prompt=prompt,
@@ -1235,6 +1268,10 @@ def run_benchmark(args_: argparse.Namespace):
     if not hasattr(args, "max_concurrency"):
         args.max_concurrency = None
 
+    # Set default value for warmup_requests if not present
+    if not hasattr(args, "warmup_requests"):
+        args.warmup_requests = 1
+
     print(f"benchmark_args={args}")
 
     # Set global environments
@@ -1336,6 +1373,10 @@ def run_benchmark(args_: argparse.Namespace):
     tokenizer = get_tokenizer(tokenizer_id)
     input_requests = get_dataset(args, tokenizer)
 
+    # compatible with SimpleNamespace
+    if not hasattr(args, "flush_cache"):
+        args.flush_cache = False
+
     return asyncio.run(
         benchmark(
             backend=backend,
@@ -1347,10 +1388,11 @@ def run_benchmark(args_: argparse.Namespace):
             request_rate=args.request_rate,
             max_concurrency=args.max_concurrency,
             disable_tqdm=args.disable_tqdm,
-
+            lora_names=args.lora_name,
            extra_request_body=extra_request_body,
             profile=args.profile,
             pd_seperated=args.pd_seperated,
+            flush_cache=args.flush_cache,
         )
     )
 
@@ -1366,6 +1408,13 @@ def set_ulimit(target_soft_limit=65535):
         print(f"Fail to set RLIMIT_NOFILE: {e}")
 
 
+class LoRAPathAction(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        setattr(namespace, self.dest, [])
+        for lora_name in values:
+            getattr(namespace, self.dest).append(lora_name)
+
+
 if __name__ == "__main__":
     parser = ArgumentParser(description="Benchmark the online serving throughput.")
     parser.add_argument(
@@ -1509,8 +1558,10 @@ if __name__ == "__main__":
     parser.add_argument(
         "--lora-name",
         type=str,
+        nargs="*",
         default=None,
-
+        action=LoRAPathAction,
+        help="The names of LoRA adapters. You can provide a list of names in the format {name} {name} {name}...",
     )
     parser.add_argument(
         "--prompt-suffix",
@@ -1523,6 +1574,17 @@ if __name__ == "__main__":
         action="store_true",
         help="Benchmark PD disaggregation server",
     )
+    parser.add_argument(
+        "--flush-cache",
+        action="store_true",
+        help="Flush the cache before running the benchmark",
+    )
+    parser.add_argument(
+        "--warmup-requests",
+        type=int,
+        default=1,
+        help="Number of warmup requests to run before the benchmark",
+    )
 
     group = parser.add_argument_group("generated-shared-prefix dataset arguments")
     group.add_argument(
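The warmup change above replaces the single hard-coded test request with a configurable fan-out: `--warmup-requests` identical requests are issued concurrently and the benchmark only aborts if none of them succeeds. The following self-contained sketch mirrors that pattern with dummy request and output types (illustrative names only, not the sglang API):

import asyncio
from dataclasses import dataclass


@dataclass
class Output:
    success: bool
    error: str = ""


async def fake_request(i: int) -> Output:
    # Stand-in for request_func(request_func_input=test_input).
    await asyncio.sleep(0.01)
    return Output(success=(i % 2 == 0), error="" if i % 2 == 0 else "boom")


async def warmup(warmup_requests: int = 3) -> None:
    tasks = [asyncio.create_task(fake_request(i)) for i in range(warmup_requests)]
    outputs = await asyncio.gather(*tasks)
    # Mirrors the new check: at least one warmup request must succeed.
    if not any(o.success for o in outputs):
        raise ValueError(f"Warmup failed. Error: {outputs[0].error}")
    print(f"Warmup completed with {warmup_requests} sequences.")


asyncio.run(warmup())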
sglang/srt/_custom_ops.py
CHANGED
@@ -27,17 +27,20 @@ if not is_hpu():
         logger.warning("Failed to import from custom_ar with %r", e)
 
 
-if use_vllm_custom_allreduce and not is_hip():
-
+if not is_hip():
+    if use_vllm_custom_allreduce:
+        custom_op = torch.ops._C_custom_ar
+    else:
+        custom_op = sgl_kernel.allreduce
+
+    # custom allreduce
     def init_custom_ar(
         ipc_tensors: List[torch.Tensor],
         rank_data: torch.Tensor,
         rank: int,
         full_nvlink: bool,
     ) -> int:
-        return
-            ipc_tensors, rank_data, rank, full_nvlink
-        )
+        return custom_op.init_custom_ar(ipc_tensors, rank_data, rank, full_nvlink)
 
     def all_reduce(
         fa: int,
@@ -46,105 +49,69 @@ if use_vllm_custom_allreduce and not is_hip():
         reg_buffer: int,
         reg_buffer_sz_bytes: int,
     ) -> None:
-
+        custom_op.all_reduce(fa, inp, out, reg_buffer, reg_buffer_sz_bytes)
 
     def dispose(fa: int) -> None:
-
+        custom_op.dispose(fa)
 
     def meta_size() -> int:
-        return
+        return custom_op.meta_size()
 
     def register_buffer(fa: int, ipc_tensors: List[int]) -> None:
-        return
+        return custom_op.register_buffer(fa, ipc_tensors)
 
     def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
-        return
+        return custom_op.get_graph_buffer_ipc_meta(fa)
 
     def register_graph_buffers(
         fa: int, handles: List[List[int]], offsets: List[List[int]]
     ) -> None:
-
+        custom_op.register_graph_buffers(fa, handles, offsets)
 
 else:
-
-    # ROCM custom allreduce
-
-    def init_custom_ar(
-        meta: torch.Tensor,
-        rank_data: torch.Tensor,
-        handles: List[str],
-        offsets: List[int],
-        rank: int,
-        full_nvlink: bool,
-    ) -> int:
-        return sgl_kernel.allreduce.init_custom_ar(
-            meta, rank_data, handles, offsets, rank, full_nvlink
-        )
-
-    def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
-        sgl_kernel.allreduce.all_reduce_reg(fa, inp, out)
-
-    def all_reduce_unreg(
-        fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
-    ) -> None:
-        sgl_kernel.allreduce.all_reduce_unreg(fa, inp, reg_buffer, out)
-
-    def dispose(fa: int) -> None:
-        sgl_kernel.allreduce.dispose(fa)
-
-    def meta_size() -> int:
-        return sgl_kernel.allreduce.meta_size()
-
-    def register_buffer(
-        fa: int, t: torch.Tensor, handles: List[str], offsets: List[int]
-    ) -> None:
-        return sgl_kernel.allreduce.register_buffer(fa, t, handles, offsets)
-
-    def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
-        return sgl_kernel.allreduce.get_graph_buffer_ipc_meta(fa)
-
-    def register_graph_buffers(
-        fa: int, handles: List[str], offsets: List[List[int]]
-    ) -> None:
-        sgl_kernel.allreduce.register_graph_buffers(fa, handles, offsets)
-
-    def allocate_meta_buffer(size: int) -> torch.Tensor:
-        return sgl_kernel.allreduce.allocate_meta_buffer(size)
-
-    def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
-        return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)
+    # ROCM custom allreduce
 
[... 34 removed lines not captured in the source diff ...]
+    def init_custom_ar(
+        meta: torch.Tensor,
+        rank_data: torch.Tensor,
+        handles: List[str],
+        offsets: List[int],
+        rank: int,
+        full_nvlink: bool,
+    ) -> int:
+        return sgl_kernel.allreduce.init_custom_ar(
+            meta, rank_data, handles, offsets, rank, full_nvlink
+        )
+
+    def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
+        sgl_kernel.allreduce.all_reduce_reg(fa, inp, out)
+
+    def all_reduce_unreg(
+        fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
+    ) -> None:
+        sgl_kernel.allreduce.all_reduce_unreg(fa, inp, reg_buffer, out)
+
+    def dispose(fa: int) -> None:
+        sgl_kernel.allreduce.dispose(fa)
+
+    def meta_size() -> int:
+        return sgl_kernel.allreduce.meta_size()
+
+    def register_buffer(
+        fa: int, t: torch.Tensor, handles: List[str], offsets: List[int]
+    ) -> None:
+        return sgl_kernel.allreduce.register_buffer(fa, t, handles, offsets)
+
+    def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
+        return sgl_kernel.allreduce.get_graph_buffer_ipc_meta(fa)
+
+    def register_graph_buffers(
+        fa: int, handles: List[str], offsets: List[List[int]]
+    ) -> None:
+        sgl_kernel.allreduce.register_graph_buffers(fa, handles, offsets)
+
+    def allocate_meta_buffer(size: int) -> torch.Tensor:
+        return sgl_kernel.allreduce.allocate_meta_buffer(size)
+
+    def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
+        return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)
sglang/srt/configs/deepseekvl2.py
CHANGED
@@ -4,7 +4,6 @@ from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
 
 import torch
-import torchvision.transforms as T
 from PIL import Image, ImageOps
 from transformers import (
     AutoProcessor,
@@ -76,6 +75,16 @@ class ImageTransform(object):
         self.std = std
         self.normalize = normalize
 
+        # only load torchvision.transforms when needed
+        try:
+            import torchvision.transforms as T
+
+            # FIXME: add version check for gguf
+        except ImportError as err:
+            raise ImportError(
+                "Please install torchvision via `pip install torchvision` to use Deepseek-VL2."
+            ) from err
+
         transform_pipelines = [T.ToTensor()]
 
         if normalize:
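The change above removes the module-level torchvision import and performs it lazily inside `ImageTransform`, so importing the config module no longer requires torchvision to be installed. A generic sketch of this lazy optional-dependency pattern (illustrative code, not the sglang implementation):

def load_transforms():
    # Import the optional dependency only when it is first needed and fail
    # with an actionable message otherwise.
    try:
        import torchvision.transforms as T
    except ImportError as err:
        raise ImportError(
            "Please install torchvision via `pip install torchvision`."
        ) from err
    return T


# Deferred until transform construction time, mirroring the diff above;
# raises only if transforms are actually requested without torchvision.
T = load_transforms()
pipeline = [T.ToTensor()]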
sglang/srt/configs/model_config.py
CHANGED
@@ -22,11 +22,7 @@ import torch
 from transformers import PretrainedConfig
 
 from sglang.srt.hf_transformers_utils import get_config, get_context_length
-from sglang.srt.layers.quantization import (
-    BASE_QUANTIZATION_METHODS,
-    QUANTIZATION_METHODS,
-    VLLM_AVAILABLE,
-)
+from sglang.srt.layers.quantization import QUANTIZATION_METHODS
 from sglang.srt.utils import get_bool_env_var, is_hip
 
 logger = logging.getLogger(__name__)
@@ -239,12 +235,7 @@ class ModelConfig:
 
     # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
     def _verify_quantization(self) -> None:
-
-        if VLLM_AVAILABLE:
-            supported_quantization = [*QUANTIZATION_METHODS]
-        else:
-            supported_quantization = [*BASE_QUANTIZATION_METHODS]
-
+        supported_quantization = [*QUANTIZATION_METHODS]
         rocm_supported_quantization = [
             "awq",
             "gptq",
@@ -267,6 +258,7 @@ class ModelConfig:
             "experts_int8",
             "w8a8_int8",
             "w8a8_fp8",
+            "moe_wna16",
         ]
         compatible_quantization_methods = {
             "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
@@ -282,11 +274,7 @@ class ModelConfig:
             quant_method = quant_cfg.get("quant_method", "").lower()
 
             # Detect which checkpoint is it
-
-            available_methods = (
-                QUANTIZATION_METHODS if VLLM_AVAILABLE else BASE_QUANTIZATION_METHODS
-            )
-            for _, method in available_methods.items():
+            for _, method in QUANTIZATION_METHODS.items():
                 quantization_override = method.override_quantization_method(
                     quant_cfg, self.quantization
                 )
@@ -467,6 +455,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
         or "InternLM2ForRewardModel" in model_architectures
         or "Qwen2ForRewardModel" in model_architectures
         or "Qwen2ForSequenceClassification" in model_architectures
+        or "CLIPModel" in model_architectures
     ):
         return False
     else:
@@ -488,6 +477,7 @@ multimodal_model_archs = [
     "MllamaForConditionalGeneration",
     "Qwen2VLForConditionalGeneration",
     "Qwen2_5_VLForConditionalGeneration",
+    "CLIPModel",
 ]
 
 
sglang/srt/constrained/base_grammar_backend.py
CHANGED
@@ -169,7 +169,9 @@ class BaseGrammarBackend(ABC):
         self.cache.clear()
 
 
-def create_grammar_backend(server_args: ServerArgs, tokenizer, vocab_size):
+def create_grammar_backend(
+    server_args: ServerArgs, tokenizer, vocab_size: int
+) -> Optional[BaseGrammarBackend]:
     if server_args.grammar_backend == "outlines":
         from sglang.srt.constrained.outlines_backend import OutlinesGrammarBackend
 
@@ -188,6 +190,8 @@ def create_grammar_backend(server_args: ServerArgs, tokenizer, vocab_size):
             tokenizer=tokenizer,
             whitespace_pattern=server_args.constrained_json_whitespace_pattern,
         )
+    elif server_args.grammar_backend == "none":
+        return None
     else:
         raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")
 
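With the new "none" option, `create_grammar_backend` returns `None` instead of raising, so callers are expected to treat a missing backend as grammar-constrained decoding being disabled. A small self-contained dispatcher in the same shape (hypothetical names; only the "outlines" and "none" branches are taken from the diff above):

from typing import Optional


class GrammarBackendSketch:
    """Placeholder for a concrete grammar backend."""


def create_backend(name: str) -> Optional[GrammarBackendSketch]:
    if name == "outlines":
        return GrammarBackendSketch()
    elif name == "none":
        return None  # structured-output constraints are disabled
    else:
        raise ValueError(f"Invalid grammar backend: {name}")


assert create_backend("none") is None
assert isinstance(create_backend("outlines"), GrammarBackendSketch)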
sglang/srt/custom_op.py
CHANGED
@@ -50,6 +50,7 @@ if _is_cuda:
     def scaled_fp8_quant(
         input: torch.Tensor,
         scale: Optional[torch.Tensor] = None,
+        num_token_padding: Optional[int] = None,
         use_per_token_if_dynamic: bool = False,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
@@ -59,6 +60,8 @@ if _is_cuda:
             input (torch.Tensor): Input tensor to be quantized
             scale (Optional[torch.Tensor]): Pre-computed scaling factor for static quantization.
                 If None, scales will be computed dynamically.
+            num_token_padding (Optional[int]): If specified, pad the first dimension
+                of the output to at least this value.
             use_per_token_if_dynamic (bool): When using dynamic scaling (scale=None),
                 determines the quantization granularity:
                 - True: compute scale per token
@@ -75,6 +78,8 @@ if _is_cuda:
         assert input.ndim == 2, f"Expected 2D input tensor, got {input.ndim}D"
         shape = input.shape
         out_dtype = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+        if num_token_padding:
+            shape = (max(num_token_padding, input.shape[0]), shape[1])
         output = torch.empty(shape, device=input.device, dtype=out_dtype)
 
         if scale is None:
sglang/srt/distributed/device_communicators/custom_all_reduce.py
CHANGED
@@ -5,7 +5,7 @@ import logging
 import os
 from contextlib import contextmanager
 from functools import wraps
-from typing import Callable, List, Optional, TypeVar, Union
+from typing import Any, Callable, List, Optional, TypeVar, Union
 
 import torch
 import torch.distributed as dist
@@ -18,7 +18,7 @@ from sglang.srt.distributed.device_communicators.custom_all_reduce_utils import
     gpu_p2p_access_check,
 )
 from sglang.srt.distributed.parallel_state import in_the_same_node_as
-from sglang.srt.utils import
+from sglang.srt.utils import is_cuda, is_hip
 
 logger = logging.getLogger(__name__)
 
@@ -217,7 +217,7 @@ class CustomAllreduce:
         if cuda_visible_devices:
             device_ids = list(map(int, cuda_visible_devices.split(",")))
         else:
-            device_ids = list(range(
+            device_ids = list(range(torch.cuda.device_count()))
 
         physical_device_id = device_ids[device.index]
         tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu")
@@ -257,7 +257,7 @@ class CustomAllreduce:
         self.world_size = world_size
         self.full_nvlink = full_nvlink
 
-        if
+        if not _is_hip:
             # Buffers memory are owned by this Python class and passed to C++.
             # Meta data composes of two parts: meta data for synchronization and a
             # temporary buffer for storing intermediate allreduce results.
@@ -280,56 +280,24 @@ class CustomAllreduce:
             )
             ops.register_buffer(self._ptr, self.buffer_ptrs)
         else:
[... 18 removed lines not captured in the source diff ...]
-                self.register_buffer(self.buffer)
-                self.MSCCL = os.getenv("RCCL_MSCCL_ENABLE", "1") == "1"
-            else:
-                # From TensorRT-LLM getMaxRequiredWorkspaceSize
-                self.max_required_workspace_size = [16 * 1024 * 1024, 8 * 1024 * 1024]
-
-                # sizeof(uint32_t) * (MAX_ALL_REDUCE_BLOCKS + 2) * MAX_RANKS_PER_NODE;
-                self.barrier_max_size = 8 * (36 + 2) * 8
-
-                self.buffer_ptrs = self.create_shared_buffer(max_size, group=group)
-                self.tmp_result_buffer_ptrs = self.create_shared_buffer(
-                    max_size, group=group
-                )
-                self.rank_data_base = torch.empty(
-                    8 * 1024 * 1024, dtype=torch.uint8, device=self.device
-                )
-                self.barrier_in_ptrs = self.create_shared_buffer(
-                    self.barrier_max_size, group=group
-                )
-                self.barrier_out_ptrs = self.create_shared_buffer(
-                    self.barrier_max_size, group=group
-                )
+            # meta data buffers need to be "uncached" for signal on MI200
+            self.meta = ops.allocate_meta_buffer(ops.meta_size() + max_size)
+            self.buffer = torch.empty(max_size, dtype=torch.uint8, device=self.device)
+            handle = ops.get_meta_buffer_ipc_handle(self.meta)
+            shard_data = (
+                bytes(handle),  # ipc handle to base ptr
+                0,  # offset of base ptr
+            )
+            handles, offsets = self._gather_ipc_meta(shard_data)
+            self.rank_data = torch.empty(
+                8 * 1024 * 1024, dtype=torch.uint8, device=self.device
+            )
+            self._ptr = ops.init_custom_ar(
+                self.meta, self.rank_data, handles, offsets, rank, self.full_nvlink
+            )
+            self.register_buffer(self.buffer)
+            self.MSCCL = os.getenv("RCCL_MSCCL_ENABLE", "1") == "1"
 
-                self._ptr = ops.init_custom_ar(
-                    rank,
-                    world_size,
-                    self.rank_data_base,
-                    self.buffer_ptrs,
-                    self.tmp_result_buffer_ptrs,
-                    self.barrier_in_ptrs,
-                    self.barrier_out_ptrs,
-                )
         self.disabled = False
 
     @staticmethod
@@ -455,7 +423,7 @@ class CustomAllreduce:
             return False
         # for 4 or more non NVLink-capable GPUs, custom allreduce provides
        # little performance improvement over NCCL.
-        if
+        if not _is_hip:
             if self.world_size == 2 or self.full_nvlink:
                 return inp_size < self.max_size
             return False
@@ -471,18 +439,6 @@ class CustomAllreduce:
                 return inp_size < self.max_size
             return False
 
-        if self.world_size == 2:
-            return (
-                inp_size < self.max_size
-                and inp_size < self.max_required_workspace_size[0]
-            )
-
-        if self.full_nvlink:
-            return (
-                inp_size < self.max_size
-                and inp_size < self.max_required_workspace_size[1]
-            )
-
         return False
 
     # all reduce, assuming inp tensor is IPC registered with register_buffer,
@@ -515,15 +471,12 @@ class CustomAllreduce:
         """
         if out is None:
             out = torch.empty_like(inp)
-        if
-
-            ops.all_reduce(self._ptr, inp, out, 0, 0)
-        else:
-            ops.all_reduce(
-                self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size
-            )
+        if registered:
+            ops.all_reduce(self._ptr, inp, out, 0, 0)
         else:
-            ops.all_reduce(
+            ops.all_reduce(
+                self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size
+            )
         return out
 
     def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
@@ -554,14 +507,9 @@ class CustomAllreduce:
     def close(self):
         if not self.disabled and self._ptr:
             ops.dispose(self._ptr)
-            if
+            if _is_cuda:
                 self.free_shared_buffer(self.meta_ptrs)
                 self.free_shared_buffer(self.buffer_ptrs)
-            elif _is_cuda:
-                self.free_shared_buffer(self.buffer_ptrs)
-                self.free_shared_buffer(self.tmp_result_buffer_ptrs)
-                self.free_shared_buffer(self.barrier_in_ptrs)
-                self.free_shared_buffer(self.barrier_out_ptrs)
             self._ptr = 0
 
     def __del__(self):